1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 1990 Mentat Inc. 27 */ 28 29 /* 30 * This file contains routines that manipulate Internet Routing Entries (IREs). 
31 */ 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/stropts.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 38 #include <sys/systm.h> 39 #include <sys/param.h> 40 #include <sys/socket.h> 41 #include <net/if.h> 42 #include <net/route.h> 43 #include <netinet/in.h> 44 #include <net/if_dl.h> 45 #include <netinet/ip6.h> 46 #include <netinet/icmp6.h> 47 48 #include <inet/common.h> 49 #include <inet/mi.h> 50 #include <inet/ip.h> 51 #include <inet/ip6.h> 52 #include <inet/ip_ndp.h> 53 #include <inet/ip_if.h> 54 #include <inet/ip_ire.h> 55 #include <inet/ipclassifier.h> 56 #include <inet/nd.h> 57 #include <sys/kmem.h> 58 #include <sys/zone.h> 59 60 #include <sys/tsol/label.h> 61 #include <sys/tsol/tnet.h> 62 63 #define IS_DEFAULT_ROUTE_V6(ire) \ 64 (((ire)->ire_type & IRE_DEFAULT) || \ 65 (((ire)->ire_type & IRE_INTERFACE) && \ 66 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) 67 68 static ire_t ire_null; 69 70 static ire_t * 71 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 72 const in6_addr_t *gateway, int type, const ill_t *ill, 73 zoneid_t zoneid, const ts_label_t *tsl, int flags, 74 ip_stack_t *ipst); 75 76 /* 77 * Initialize the ire that is specific to IPv6 part and call 78 * ire_init_common to finish it. 79 * Returns zero or errno. 80 */ 81 int 82 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, 83 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, 84 zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 85 { 86 int error; 87 88 /* 89 * Reject IRE security attmakeribute creation/initialization 90 * if system is not running in Trusted mode. 
91 */ 92 if (gc != NULL && !is_system_labeled()) 93 return (EINVAL); 94 95 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); 96 if (v6addr != NULL) 97 ire->ire_addr_v6 = *v6addr; 98 if (v6gateway != NULL) 99 ire->ire_gateway_addr_v6 = *v6gateway; 100 101 /* Make sure we don't have stray values in some fields */ 102 switch (type) { 103 case IRE_LOOPBACK: 104 ire->ire_gateway_addr_v6 = ire->ire_addr_v6; 105 /* FALLTHRU */ 106 case IRE_HOST: 107 case IRE_LOCAL: 108 case IRE_IF_CLONE: 109 ire->ire_mask_v6 = ipv6_all_ones; 110 ire->ire_masklen = IPV6_ABITS; 111 break; 112 case IRE_PREFIX: 113 case IRE_DEFAULT: 114 case IRE_IF_RESOLVER: 115 case IRE_IF_NORESOLVER: 116 if (v6mask != NULL) { 117 ire->ire_mask_v6 = *v6mask; 118 ire->ire_masklen = 119 ip_mask_to_plen_v6(&ire->ire_mask_v6); 120 } 121 break; 122 case IRE_MULTICAST: 123 case IRE_NOROUTE: 124 ASSERT(v6mask == NULL); 125 break; 126 default: 127 ASSERT(0); 128 return (EINVAL); 129 } 130 131 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, 132 gc, ipst); 133 if (error != NULL) 134 return (error); 135 136 /* Determine which function pointers to use */ 137 ire->ire_postfragfn = ip_xmit; /* Common case */ 138 139 switch (ire->ire_type) { 140 case IRE_LOCAL: 141 ire->ire_sendfn = ire_send_local_v6; 142 ire->ire_recvfn = ire_recv_local_v6; 143 #ifdef SO_VRRP 144 ASSERT(ire->ire_ill != NULL); 145 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) { 146 ire->ire_noaccept = B_TRUE; 147 ire->ire_recvfn = ire_recv_noaccept_v6; 148 } 149 #endif 150 break; 151 case IRE_LOOPBACK: 152 ire->ire_sendfn = ire_send_local_v6; 153 ire->ire_recvfn = ire_recv_loopback_v6; 154 break; 155 case IRE_MULTICAST: 156 ire->ire_postfragfn = ip_postfrag_loopcheck; 157 ire->ire_sendfn = ire_send_multicast_v6; 158 ire->ire_recvfn = ire_recv_multicast_v6; 159 break; 160 default: 161 /* 162 * For IRE_IF_ALL and IRE_OFFLINK we forward received 163 * packets by default. 
164 */ 165 ire->ire_sendfn = ire_send_wire_v6; 166 ire->ire_recvfn = ire_recv_forward_v6; 167 break; 168 } 169 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 170 ire->ire_sendfn = ire_send_noroute_v6; 171 ire->ire_recvfn = ire_recv_noroute_v6; 172 } else if (ire->ire_flags & RTF_MULTIRT) { 173 ire->ire_postfragfn = ip_postfrag_multirt_v6; 174 ire->ire_sendfn = ire_send_multirt_v6; 175 ire->ire_recvfn = ire_recv_multirt_v6; 176 } 177 ire->ire_nce_capable = ire_determine_nce_capable(ire); 178 return (0); 179 } 180 181 /* 182 * ire_create_v6 is called to allocate and initialize a new IRE. 183 * 184 * NOTE : This is called as writer sometimes though not required 185 * by this function. 186 */ 187 /* ARGSUSED */ 188 ire_t * 189 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 190 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, 191 uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 192 { 193 ire_t *ire; 194 int error; 195 196 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 197 198 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 199 if (ire == NULL) { 200 DTRACE_PROBE(kmem__cache__alloc); 201 return (NULL); 202 } 203 *ire = ire_null; 204 205 error = ire_init_v6(ire, v6addr, v6mask, v6gateway, 206 type, ill, zoneid, flags, gc, ipst); 207 208 if (error != 0) { 209 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); 210 kmem_cache_free(ire_cache, ire); 211 return (NULL); 212 } 213 return (ire); 214 } 215 216 /* 217 * Find the ill matching a multicast group. 218 * Allows different routes for multicast addresses 219 * in the unicast routing table (akin to FF::0/8 but could be more specific) 220 * which point at different interfaces. This is used when IPV6_MULTICAST_IF 221 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't 222 * specify the interface to join on. 223 * 224 * Supports link-local addresses by using ire_route_recursive which follows 225 * the ill when recursing. 
226 * 227 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 228 * and the MULTIRT property can be different for different groups, we 229 * extract RTF_MULTIRT from the special unicast route added for a group 230 * with CGTP and pass that back in the multirtp argument. 231 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 232 * We have a setsrcp argument for the same reason. 233 */ 234 ill_t * 235 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, 236 ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) 237 { 238 ire_t *ire; 239 ill_t *ill; 240 241 ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, 242 MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); 243 ASSERT(ire != NULL); 244 245 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 246 ire_refrele(ire); 247 return (NULL); 248 } 249 250 if (multirtp != NULL) 251 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 252 253 ill = ire_nexthop_ill(ire); 254 ire_refrele(ire); 255 return (ill); 256 } 257 258 /* 259 * This function takes a mask and returns number of bits set in the 260 * mask (the represented prefix length). Assumes a contiguous mask. 261 */ 262 int 263 ip_mask_to_plen_v6(const in6_addr_t *v6mask) 264 { 265 int bits; 266 int plen = IPV6_ABITS; 267 int i; 268 269 for (i = 3; i >= 0; i--) { 270 if (v6mask->s6_addr32[i] == 0) { 271 plen -= 32; 272 continue; 273 } 274 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 275 if (bits == 0) 276 break; 277 plen -= bits; 278 } 279 280 return (plen); 281 } 282 283 /* 284 * Convert a prefix length to the mask for that prefix. 285 * Returns the argument bitmask. 
286 */ 287 in6_addr_t * 288 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) 289 { 290 uint32_t *ptr; 291 292 if (plen < 0 || plen > IPV6_ABITS) 293 return (NULL); 294 *bitmask = ipv6_all_zeros; 295 if (plen == 0) 296 return (bitmask); 297 298 ptr = (uint32_t *)bitmask; 299 while (plen > 32) { 300 *ptr++ = 0xffffffffU; 301 plen -= 32; 302 } 303 *ptr = htonl(0xffffffffU << (32 - plen)); 304 return (bitmask); 305 } 306 307 /* 308 * Add a fully initialized IPv6 IRE to the forwarding table. 309 * This returns NULL on failure, or a held IRE on success. 310 * Normally the returned IRE is the same as the argument. But a different 311 * IRE will be returned if the added IRE is deemed identical to an existing 312 * one. In that case ire_identical_ref will be increased. 313 * The caller always needs to do an ire_refrele() on the returned IRE. 314 */ 315 ire_t * 316 ire_add_v6(ire_t *ire) 317 { 318 ire_t *ire1; 319 int mask_table_index; 320 irb_t *irb_ptr; 321 ire_t **irep; 322 int match_flags; 323 int error; 324 ip_stack_t *ipst = ire->ire_ipst; 325 326 ASSERT(ire->ire_ipversion == IPV6_VERSION); 327 328 /* Make sure the address is properly masked. 
*/ 329 V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); 330 331 mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); 332 if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { 333 irb_t *ptr; 334 int i; 335 336 ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * 337 sizeof (irb_t))); 338 if (ptr == NULL) { 339 ire_delete(ire); 340 return (NULL); 341 } 342 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 343 rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); 344 } 345 mutex_enter(&ipst->ips_ire_ft_init_lock); 346 if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == 347 NULL) { 348 ipst->ips_ip_forwarding_table_v6[mask_table_index] = 349 ptr; 350 mutex_exit(&ipst->ips_ire_ft_init_lock); 351 } else { 352 /* 353 * Some other thread won the race in 354 * initializing the forwarding table at the 355 * same index. 356 */ 357 mutex_exit(&ipst->ips_ire_ft_init_lock); 358 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 359 rw_destroy(&ptr[i].irb_lock); 360 } 361 mi_free(ptr); 362 } 363 } 364 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ 365 IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, 366 ipst->ips_ip6_ftable_hash_size)]); 367 368 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 369 if (ire->ire_ill != NULL) 370 match_flags |= MATCH_IRE_ILL; 371 /* 372 * Start the atomic add of the ire. Grab the bucket lock and the 373 * ill lock. Check for condemned. 374 */ 375 error = ire_atomic_start(irb_ptr, ire); 376 if (error != 0) { 377 ire_delete(ire); 378 return (NULL); 379 } 380 381 /* 382 * If we are creating a hidden IRE, make sure we search for 383 * hidden IREs when searching for duplicates below. 384 * Otherwise, we might find an IRE on some other interface 385 * that's not marked hidden. 386 */ 387 if (ire->ire_testhidden) 388 match_flags |= MATCH_IRE_TESTHIDDEN; 389 390 /* 391 * Atomically check for duplicate and insert in the table. 
392 */ 393 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 394 if (IRE_IS_CONDEMNED(ire1)) 395 continue; 396 /* 397 * Here we need an exact match on zoneid, i.e., 398 * ire_match_args doesn't fit. 399 */ 400 if (ire1->ire_zoneid != ire->ire_zoneid) 401 continue; 402 403 if (ire1->ire_type != ire->ire_type) 404 continue; 405 406 /* 407 * Note: We do not allow multiple routes that differ only 408 * in the gateway security attributes; such routes are 409 * considered duplicates. 410 * To change that we explicitly have to treat them as 411 * different here. 412 */ 413 if (ire_match_args_v6(ire1, &ire->ire_addr_v6, 414 &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, 415 ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, 416 match_flags)) { 417 /* 418 * Return the old ire after doing a REFHOLD. 419 * As most of the callers continue to use the IRE 420 * after adding, we return a held ire. This will 421 * avoid a lookup in the caller again. If the callers 422 * don't want to use it, they need to do a REFRELE. 423 */ 424 ip1dbg(("found dup ire existing %p new %p", 425 (void *)ire1, (void *)ire)); 426 ire_refhold(ire1); 427 atomic_add_32(&ire1->ire_identical_ref, 1); 428 ire_atomic_end(irb_ptr, ire); 429 ire_delete(ire); 430 return (ire1); 431 } 432 } 433 434 /* 435 * Normally we do head insertion since most things do not care about 436 * the order of the IREs in the bucket. 437 * However, due to shared-IP zones (and restrict_interzone_loopback) 438 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 439 * address. For that reason we do tail insertion for IRE_IF_CLONE. 440 */ 441 irep = (ire_t **)irb_ptr; 442 if (ire->ire_type & IRE_IF_CLONE) { 443 while ((ire1 = *irep) != NULL) 444 irep = &ire1->ire_next; 445 } 446 /* Insert at *irep */ 447 ire1 = *irep; 448 if (ire1 != NULL) 449 ire1->ire_ptpn = &ire->ire_next; 450 ire->ire_next = ire1; 451 /* Link the new one in. 
*/ 452 ire->ire_ptpn = irep; 453 /* 454 * ire_walk routines de-reference ire_next without holding 455 * a lock. Before we point to the new ire, we want to make 456 * sure the store that sets the ire_next of the new ire 457 * reaches global visibility, so that ire_walk routines 458 * don't see a truncated list of ires i.e if the ire_next 459 * of the new ire gets set after we do "*irep = ire" due 460 * to re-ordering, the ire_walk thread will see a NULL 461 * once it accesses the ire_next of the new ire. 462 * membar_producer() makes sure that the following store 463 * happens *after* all of the above stores. 464 */ 465 membar_producer(); 466 *irep = ire; 467 ire->ire_bucket = irb_ptr; 468 /* 469 * We return a bumped up IRE above. Keep it symmetrical 470 * so that the callers will always have to release. This 471 * helps the callers of this function because they continue 472 * to use the IRE after adding and hence they don't have to 473 * lookup again after we return the IRE. 474 * 475 * NOTE : We don't have to use atomics as this is appearing 476 * in the list for the first time and no one else can bump 477 * up the reference count on this yet. 478 */ 479 ire_refhold_locked(ire); 480 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); 481 irb_ptr->irb_ire_cnt++; 482 483 if (ire->ire_ill != NULL) { 484 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, 485 (char *), "ire", (void *), ire); 486 ire->ire_ill->ill_ire_cnt++; 487 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 488 } 489 ire_atomic_end(irb_ptr, ire); 490 491 /* Make any caching of the IREs be notified or updated */ 492 ire_flush_cache_v6(ire, IRE_FLUSH_ADD); 493 494 return (ire); 495 } 496 497 /* 498 * Search for all HOST REDIRECT routes that are 499 * pointing at the specified gateway and 500 * delete them. This routine is called only 501 * when a default gateway is going away. 
502 */ 503 static void 504 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) 505 { 506 irb_t *irb_ptr; 507 irb_t *irb; 508 ire_t *ire; 509 in6_addr_t gw_addr_v6; 510 int i; 511 512 /* get the hash table for HOST routes */ 513 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)]; 514 if (irb_ptr == NULL) 515 return; 516 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { 517 irb = &irb_ptr[i]; 518 irb_refhold(irb); 519 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 520 if (!(ire->ire_flags & RTF_DYNAMIC)) 521 continue; 522 mutex_enter(&ire->ire_lock); 523 gw_addr_v6 = ire->ire_gateway_addr_v6; 524 mutex_exit(&ire->ire_lock); 525 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) 526 ire_delete(ire); 527 } 528 irb_refrele(irb); 529 } 530 } 531 532 /* 533 * Delete the specified IRE. 534 * All calls should use ire_delete(). 535 * Sometimes called as writer though not required by this function. 536 * 537 * NOTE : This function is called only if the ire was added 538 * in the list. 539 */ 540 void 541 ire_delete_v6(ire_t *ire) 542 { 543 in6_addr_t gw_addr_v6; 544 ip_stack_t *ipst = ire->ire_ipst; 545 546 /* 547 * Make sure ire_generation increases from ire_flush_cache happen 548 * after any lookup/reader has read ire_generation. 549 * Since the rw_enter makes us wait until any lookup/reader has 550 * completed we can exit the lock immediately. 551 */ 552 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 553 rw_exit(&ipst->ips_ip6_ire_head_lock); 554 555 ASSERT(ire->ire_refcnt >= 1); 556 ASSERT(ire->ire_ipversion == IPV6_VERSION); 557 558 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); 559 560 if (ire->ire_type == IRE_DEFAULT) { 561 /* 562 * when a default gateway is going away 563 * delete all the host redirects pointing at that 564 * gateway. 
565 */ 566 mutex_enter(&ire->ire_lock); 567 gw_addr_v6 = ire->ire_gateway_addr_v6; 568 mutex_exit(&ire->ire_lock); 569 ire_delete_host_redirects_v6(&gw_addr_v6, ipst); 570 } 571 572 /* 573 * If we are deleting an IRE_INTERFACE then we make sure we also 574 * delete any IRE_IF_CLONE that has been created from it. 575 * Those are always in ire_dep_children. 576 */ 577 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) 578 ire_dep_delete_if_clone(ire); 579 580 /* Remove from parent dependencies and child */ 581 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 582 if (ire->ire_dep_parent != NULL) { 583 ire_dep_remove(ire); 584 } 585 while (ire->ire_dep_children != NULL) 586 ire_dep_remove(ire->ire_dep_children); 587 rw_exit(&ipst->ips_ire_dep_lock); 588 } 589 590 /* 591 * When an IRE is added or deleted this routine is called to make sure 592 * any caching of IRE information is notified or updated. 593 * 594 * The flag argument indicates if the flush request is due to addition 595 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 596 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 597 */ 598 void 599 ire_flush_cache_v6(ire_t *ire, int flag) 600 { 601 ip_stack_t *ipst = ire->ire_ipst; 602 603 /* 604 * IRE_IF_CLONE ire's don't provide any new information 605 * than the parent from which they are cloned, so don't 606 * perturb the generation numbers. 607 */ 608 if (ire->ire_type & IRE_IF_CLONE) 609 return; 610 611 /* 612 * Ensure that an ire_add during a lookup serializes the updates of 613 * the generation numbers under ire_head_lock so that the lookup gets 614 * either the old ire and old generation number, or a new ire and new 615 * generation number. 616 */ 617 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 618 619 /* 620 * If a route was just added, we need to notify everybody that 621 * has cached an IRE_NOROUTE since there might now be a better 622 * route for them. 
623 */ 624 if (flag == IRE_FLUSH_ADD) { 625 ire_increment_generation(ipst->ips_ire_reject_v6); 626 ire_increment_generation(ipst->ips_ire_blackhole_v6); 627 } 628 629 /* Adding a default can't otherwise provide a better route */ 630 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 631 rw_exit(&ipst->ips_ip6_ire_head_lock); 632 return; 633 } 634 635 switch (flag) { 636 case IRE_FLUSH_DELETE: 637 case IRE_FLUSH_GWCHANGE: 638 /* 639 * Update ire_generation for all ire_dep_children chains 640 * starting with this IRE 641 */ 642 ire_dep_incr_generation(ire); 643 break; 644 case IRE_FLUSH_ADD: { 645 in6_addr_t addr; 646 in6_addr_t mask; 647 ip_stack_t *ipst = ire->ire_ipst; 648 uint_t masklen; 649 650 /* 651 * Find an IRE which is a shorter match than the ire to be added 652 * For any such IRE (which we repeat) we update the 653 * ire_generation the same way as in the delete case. 654 */ 655 addr = ire->ire_addr_v6; 656 mask = ire->ire_mask_v6; 657 masklen = ip_mask_to_plen_v6(&mask); 658 659 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, 660 ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 661 while (ire != NULL) { 662 /* We need to handle all in the same bucket */ 663 irb_increment_generation(ire->ire_bucket); 664 665 mask = ire->ire_mask_v6; 666 ASSERT(masklen > ip_mask_to_plen_v6(&mask)); 667 masklen = ip_mask_to_plen_v6(&mask); 668 ire_refrele(ire); 669 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, 670 NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 671 } 672 } 673 break; 674 } 675 rw_exit(&ipst->ips_ip6_ire_head_lock); 676 } 677 678 /* 679 * Matches the arguments passed with the values in the ire. 680 * 681 * Note: for match types that match using "ill" passed in, ill 682 * must be checked for non-NULL before calling this routine. 
*/
boolean_t
ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
    const ts_label_t *tsl, int match_flags)
{
    in6_addr_t masked_addr;
    in6_addr_t gw_addr_v6;
    ill_t *ire_ill = NULL, *dst_ill;
    ip_stack_t *ipst = ire->ire_ipst;

    ASSERT(ire->ire_ipversion == IPV6_VERSION);
    ASSERT(addr != NULL);
    ASSERT(mask != NULL);
    ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
    ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
        (ill != NULL && ill->ill_isv6));

    /*
     * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
     * is in fact hidden, to ensure the caller gets the right one.
     */
    if (ire->ire_testhidden) {
        if (!(match_flags & MATCH_IRE_TESTHIDDEN))
            return (B_FALSE);
    }

    if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
        ire->ire_zoneid != ALL_ZONES) {
        /*
         * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
         * does not match that of ire_zoneid, a failure to
         * match is reported at this point. Otherwise, since some IREs
         * that are available in the global zone can be used in local
         * zones, additional checks need to be performed:
         *
         * IRE_LOOPBACK
         * entries should never be matched in this situation.
         * Each zone has its own IRE_LOOPBACK.
         *
         * IRE_LOCAL
         * We allow them for any zoneid. ire_route_recursive
         * does additional checks when
         * ip_restrict_interzone_loopback is set.
         *
         * If ill_usesrc_ifindex is set
         * Then we check if the zone has a valid source address
         * on the usesrc ill.
         *
         * If ire_ill is set, then check that the zone has an ipif
         * on that ill.
         *
         * Outside of this function (in ire_round_robin) we check
         * that any IRE_OFFLINK has a gateway that reachable from the
         * zone when we have multiple choices (ECMP).
         */
        if (match_flags & MATCH_IRE_ZONEONLY)
            return (B_FALSE);
        if (ire->ire_type & IRE_LOOPBACK)
            return (B_FALSE);

        /* IRE_LOCALs are usable from any zone; go do the arg checks */
        if (ire->ire_type & IRE_LOCAL)
            goto matchit;

        /*
         * The normal case of IRE_ONLINK has a matching zoneid.
         * Here we handle the case when shared-IP zones have been
         * configured with IP addresses on vniN. In that case it
         * is ok for traffic from a zone to use IRE_ONLINK routes
         * if the ill has a usesrc pointing at vniN
         * Applies to IRE_INTERFACE.
         */
        dst_ill = ire->ire_ill;
        if (ire->ire_type & IRE_ONLINK) {
            uint_t	ifindex;

            /*
             * Note there is no IRE_INTERFACE on vniN thus
             * can't do an IRE lookup for a matching route.
             */
            ifindex = dst_ill->ill_usesrc_ifindex;
            if (ifindex == 0)
                return (B_FALSE);

            /*
             * If there is a usable source address in the
             * zone, then it's ok to return this IRE_INTERFACE
             */
            if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
                zoneid, ipst)) {
                ip3dbg(("ire_match_args: no usrsrc for zone"
                    " dst_ill %p\n", (void *)dst_ill));
                return (B_FALSE);
            }
        }
        /*
         * For example, with
         * route add 11.0.0.0 gw1 -ifp bge0
         * route add 11.0.0.0 gw2 -ifp bge1
         * this code would differentiate based on
         * where the sending zone has addresses.
         * Only if the zone has an address on bge0 can it use the first
         * route. It isn't clear if this behavior is documented
         * anywhere.
         */
        if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
            ipif_t	*tipif;

            /* Look for an UP ipif on dst_ill visible to the zone */
            mutex_enter(&dst_ill->ill_lock);
            for (tipif = dst_ill->ill_ipif;
                tipif != NULL; tipif = tipif->ipif_next) {
                if (!IPIF_IS_CONDEMNED(tipif) &&
                    (tipif->ipif_flags & IPIF_UP) &&
                    (tipif->ipif_zoneid == zoneid ||
                    tipif->ipif_zoneid == ALL_ZONES))
                    break;
            }
            mutex_exit(&dst_ill->ill_lock);
            if (tipif == NULL)
                return (B_FALSE);
        }
    }

matchit:
    /* Snapshot the gateway under ire_lock; the 128-bit read isn't atomic */
    if (match_flags & MATCH_IRE_GW) {
        mutex_enter(&ire->ire_lock);
        gw_addr_v6 = ire->ire_gateway_addr_v6;
        mutex_exit(&ire->ire_lock);
    }
    if (match_flags & MATCH_IRE_ILL) {
        ire_ill = ire->ire_ill;

        /*
         * If asked to match an ill, we *must* match
         * on the ire_ill for ipmp test addresses, or
         * any of the ill in the group for data addresses.
         * If we don't, we may as well fail.
         * However, we need an exception for IRE_LOCALs to ensure
         * we loopback packets even sent to test addresses on different
         * interfaces in the group.
         */
        if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
            !(ire->ire_type & IRE_LOCAL)) {
            if (ire->ire_ill != ill)
                return (B_FALSE);
        } else {
            match_flags &= ~MATCH_IRE_TESTHIDDEN;
            /*
             * We know that ill is not NULL, but ire_ill could be
             * NULL
             */
            if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
                return (B_FALSE);
        }
    }
    /* No ire_addr_v6 bits set past the mask */
    ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
        ire->ire_addr_v6));
    V6_MASK_COPY(*addr, *mask, masked_addr);
    /* Final combined check of address, gateway, type, mask and label */
    if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
        ((!(match_flags & MATCH_IRE_GW)) ||
        IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
        ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
        ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
        ((!(match_flags & MATCH_IRE_MASK)) ||
        (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
        ((!(match_flags & MATCH_IRE_SECATTR)) ||
        (!is_system_labeled()) ||
        (tsol_ire_match_gwattr(ire, tsl) == 0))) {
        /* We found the matched IRE */
        return (B_TRUE);
    }
    return (B_FALSE);
}

/*
 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
 * gateway address. If ill is non-NULL we also match on it.
 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
*/
boolean_t
ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
    const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
{
    ire_t	*ire;
    uint_t	match_flags;

    /* Take the head lock ourselves unless the caller already holds it */
    if (lock_held)
        ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
    else
        rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);

    match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
    if (ill != NULL)
        match_flags |= MATCH_IRE_ILL;

    /* Look for any IRE_INTERFACE covering the gateway in this zone */
    ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
        &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
        ipst);

    if (!lock_held)
        rw_exit(&ipst->ips_ip6_ire_head_lock);
    if (ire != NULL) {
        ire_refrele(ire);
        return (B_TRUE);
    } else {
        return (B_FALSE);
    }
}

/*
 * Lookup a route in forwarding table.
 * specific lookup is indicated by passing the
 * required parameters and indicating the
 * match required in flag field.
 *
 * Supports link-local addresses by following the ipif/ill when recursing.
 */
ire_t *
ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, int flags,
    uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
    ire_t *ire = NULL;

    ASSERT(addr != NULL);
    ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
    ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
    ASSERT(ill == NULL || ill->ill_isv6);

    /* v4-mapped addresses belong in the IPv4 table, not here */
    ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));

    /*
     * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
     * is set.
     */
    if ((flags & (MATCH_IRE_ILL)) && (ill == NULL))
        return (NULL);

    rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
    ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
        tsl, flags, ipst);
    if (ire == NULL) {
        rw_exit(&ipst->ips_ip6_ire_head_lock);
        return (NULL);
    }

    /*
     * round-robin only if we have more than one route in the bucket.
     * ips_ip_ecmp_behavior controls when we do ECMP
     *	2:	always
     *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
     *	0:	never
     *
     * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
     * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
     * and the IRE_INTERFACESs are likely to be shorter matches.
     */
    if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
        if (ipst->ips_ip_ecmp_behavior == 2 ||
            (ipst->ips_ip_ecmp_behavior == 1 &&
            IS_DEFAULT_ROUTE_V6(ire))) {
            ire_t	*next_ire;
            ire_ftable_args_t margs;

            /* Build the match args for ire_round_robin */
            (void) memset(&margs, 0, sizeof (margs));
            margs.ift_addr_v6 = *addr;
            if (mask != NULL)
                margs.ift_mask_v6 = *mask;
            if (gateway != NULL)
                margs.ift_gateway_v6 = *gateway;
            margs.ift_type = type;
            margs.ift_ill = ill;
            margs.ift_zoneid = zoneid;
            margs.ift_tsl = tsl;
            margs.ift_flags = flags;

            next_ire = ire_round_robin(ire->ire_bucket, &margs,
                xmit_hint, ire, ipst);
            if (next_ire == NULL) {
                /* keep ire if next_ire is null */
                goto done;
            }
            ire_refrele(ire);
            ire = next_ire;
        }
    }

done:
    /* Return generation before dropping lock */
    if (generationp != NULL)
        *generationp = ire->ire_generation;

    rw_exit(&ipst->ips_ip6_ire_head_lock);

    /*
     * For shared-IP zones we need additional checks to what was
     * done in ire_match_args to make sure IRE_LOCALs are handled.
     *
     * When ip_restrict_interzone_loopback is set, then
     * we ensure that IRE_LOCAL are only used for loopback
     * between zones when the logical "Ethernet" would
     * have looped them back. That is, if in the absence of
     * the IRE_LOCAL we would have sent to packet out the
     * same ill.
     */
    if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
        ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
        ipst->ips_ip_restrict_interzone_loopback) {
        ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
        ASSERT(ire != NULL);
    }

    return (ire);
}

/*
 * Look up a single ire. The caller holds either the read or write lock.
 */
ire_t *
ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, int flags,
    ip_stack_t *ipst)
{
    irb_t *irb_ptr;
    ire_t *ire = NULL;
    int i;

    ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));

    /*
     * If the mask is known, the lookup
     * is simple, if the mask is not known
     * we need to search.
     */
    if (flags & MATCH_IRE_MASK) {
        uint_t masklen;

        /* Direct hit: one bucket in the table for this prefix length */
        masklen = ip_mask_to_plen_v6(mask);
        if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
            return (NULL);
        }
        irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
            IRE_ADDR_MASK_HASH_V6(*addr, *mask,
            ipst->ips_ip6_ftable_hash_size)]);
        rw_enter(&irb_ptr->irb_lock, RW_READER);
        for (ire = irb_ptr->irb_ire; ire != NULL;
            ire = ire->ire_next) {
            if (IRE_IS_CONDEMNED(ire))
                continue;
            if (ire_match_args_v6(ire, addr, mask, gateway, type,
                ill, zoneid, tsl, flags))
                goto found_ire;
        }
        rw_exit(&irb_ptr->irb_lock);
    } else {
        uint_t masklen;

        /*
         * In this case we don't know the mask, we need to
         * search the table assuming different mask sizes.
         * Scan from longest to shortest prefix: first match wins.
         */
        if (flags & MATCH_IRE_SHORTERMASK) {
            masklen = ip_mask_to_plen_v6(mask);
            if (masklen == 0) {
                /* Nothing shorter than zero */
                return (NULL);
            }
            masklen--;
        } else {
            masklen = IP6_MASK_TABLE_SIZE - 1;
        }

        for (i = masklen; i >= 0; i--) {
            in6_addr_t tmpmask;

            if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
                continue;
            (void) ip_plen_to_mask_v6(i, &tmpmask);
            irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
                IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
                ipst->ips_ip6_ftable_hash_size)];
            rw_enter(&irb_ptr->irb_lock, RW_READER);
            for (ire = irb_ptr->irb_ire; ire != NULL;
                ire = ire->ire_next) {
                if (IRE_IS_CONDEMNED(ire))
                    continue;
                if (ire_match_args_v6(ire, addr,
                    &ire->ire_mask_v6, gateway, type, ill,
                    zoneid, tsl, flags))
                    goto found_ire;
            }
            rw_exit(&irb_ptr->irb_lock);
        }
    }
    ASSERT(ire == NULL);
    /* NOTE(review): debug message names the wrapper, not this impl */
    ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
    return (NULL);

found_ire:
    /* Hold the IRE before dropping the bucket lock so it can't go away */
    ire_refhold(ire);
    rw_exit(&irb_ptr->irb_lock);
    return (ire);
}


/*
 * This function
is called by 1093 * ip_input/ire_route_recursive when doing a route lookup on only the 1094 * destination address. 1095 * 1096 * The optimizations of this function over ire_ftable_lookup are: 1097 * o removing unnecessary flag matching 1098 * o doing longest prefix match instead of overloading it further 1099 * with the unnecessary "best_prefix_match" 1100 * 1101 * If no route is found we return IRE_NOROUTE. 1102 */ 1103 ire_t * 1104 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, 1105 ip_stack_t *ipst, uint_t *generationp) 1106 { 1107 ire_t *ire; 1108 1109 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, 1110 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); 1111 if (ire == NULL) { 1112 ire = ire_reject(ipst, B_TRUE); 1113 if (generationp != NULL) 1114 *generationp = IRE_GENERATION_VERIFY; 1115 } 1116 /* ftable_lookup did round robin */ 1117 return (ire); 1118 } 1119 1120 ire_t * 1121 ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa, 1122 uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 1123 { 1124 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); 1125 1126 return (ip_select_route(dst, ixa, generationp, setsrcp, errorp, 1127 multirtp)); 1128 } 1129 1130 /* 1131 * Recursively look for a route to the destination. Can also match on 1132 * the zoneid, ill, and label. Used for the data paths. See also 1133 * ire_route_recursive_dstonly. 1134 * 1135 * If ill is set this means we will match it by adding MATCH_IRE_ILL. 1136 * 1137 * If allocate is not set then we will only inspect the existing IREs; never 1138 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1139 * forwarding. 1140 * 1141 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1142 * instead. 1143 * 1144 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1145 * is an error. 1146 * Allow at most one RTF_INDIRECT. 
 */
ire_t *
ire_route_recursive_impl_v6(ire_t *ire,
    const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
    in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	in6_addr_t	v6nexthop = *nexthop;
	ire_t		*ires[MAX_IRE_RECURSION];	/* chain: dest first */
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;	/* we hold an ill refhold */
	boolean_t	invalidate = B_FALSE;	/* force VERIFY generations */
	int		prefs[MAX_IRE_RECURSION];
	ill_t		*ill = NULL;

	/* Output parameters, if supplied, must start out "empty" */
	if (setsrcp != NULL)
		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	if (ill_arg != NULL)
		match_args |= MATCH_IRE_ILL;

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}

		if (ire == NULL)
			ire = ire_reject(ipst, B_TRUE);

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */

		/*
		 * Record the preference so we can insist that each step of
		 * the recursion is strictly "more specific" than the last.
		 */
		prefs[i] = ire_pref(ire);
		if (i != 0) {
			/*
			 * Don't allow anything unusual past the first
			 * iteration: no LOCAL/LOOPBACK/BROADCAST, and the
			 * preference must strictly increase so the
			 * recursion makes progress and terminates.
			 */
			if ((ire->ire_type &
			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
			    prefs[i] <= prefs[i-1]) {
				ire_refrele(ire);
				ire = ire_reject(ipst, B_TRUE);
				goto error;
			}
		}
		/* We have a usable IRE; keep the refhold in ires[] */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
			    &ire->ire_setsrc_addr_v6));
			*setsrcp = ire->ire_setsrc_addr_v6;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 128 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}

		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			ire_t *clone;

			ASSERT(ire->ire_masklen != IPV6_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET,
			 * there is no point in allocating
			 * an IRE_IF_CLONE. We return the IRE_INTERFACE.
			 * Note that !allocate can result in a ire_dep_parent
			 * which is IRE_IF_* without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!allocate) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_TRUE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			/* The clone is nce_capable, so the chain is done */
			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 */
		match_args &= MATCH_IRE_TYPE;
		v6nexthop = ire->ire_gateway_addr_v6;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}

		ire = NULL;
	}
	/* Recursion limit exceeded without resolving; treat as no route */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_TRUE);

error:
	/* 'ire' is the reject/blackhole entry we will hand back */
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* cleanup ires[i]: drop the refholds on ires[0..i-1] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	ASSERT(ire == NULL);
	if (need_refrele)
		ill_refrele(ill);

	/* Build dependencies */
	if (!ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_blackhole(ipst, B_TRUE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

/*
 * Public entry point for the full recursive lookup: starts with no
 * pre-resolved IRE and lets ire_route_recursive_impl_v6 do the work.
 */
ire_t *
ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
    const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
    in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
 *
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 */
ire_t *
ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate,
    uint32_t xmit_hint, ip_stack_t *ipst)
{
	ire_t	*ire;
	ire_t	*ire1;
	uint_t	generation;

	/* ire_ftable_lookup handles round-robin/ECMP */
	ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
	    &generation);
	ASSERT(ire != NULL);	/* never NULL; IRE_NOROUTE on failure */

	/*
	 * If this type should have an ire_nce_cache (even if it
	 * doesn't yet have one) then we are done. Includes
	 * IRE_INTERFACE with a full 128 bit mask.
	 */
	if (ire->ire_nce_capable)
		return (ire);

	/*
	 * If the IRE has a current cached parent we know that the whole
	 * parent chain is current, hence we don't need to discover and
	 * build any dependencies by doing a recursive lookup.
	 */
	mutex_enter(&ire->ire_lock);
	if (ire->ire_dep_parent != NULL &&
	    ire->ire_dep_parent->ire_generation ==
	    ire->ire_dep_parent_generation) {
		mutex_exit(&ire->ire_lock);
		return (ire);
	}
	mutex_exit(&ire->ire_lock);

	/*
	 * Fallback to loop in the normal code starting with the ire
	 * we found. Normally this would return the same ire.
	 * The impl takes its own refhold on the 'ire' we pass in, so we
	 * drop our lookup hold here and return the impl's result instead.
	 */
	ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL,
	    &generation);
	ire_refrele(ire);
	return (ire1);
}