1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 1990 Mentat Inc. 27 */ 28 29 /* 30 * This file contains routines that manipulate Internet Routing Entries (IREs). 31 */ 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/stropts.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 38 #include <sys/systm.h> 39 #include <sys/param.h> 40 #include <sys/socket.h> 41 #include <net/if.h> 42 #include <net/route.h> 43 #include <netinet/in.h> 44 #include <net/if_dl.h> 45 #include <netinet/ip6.h> 46 #include <netinet/icmp6.h> 47 48 #include <inet/common.h> 49 #include <inet/mi.h> 50 #include <inet/ip.h> 51 #include <inet/ip6.h> 52 #include <inet/ip_ndp.h> 53 #include <inet/ip_if.h> 54 #include <inet/ip_ire.h> 55 #include <inet/ipclassifier.h> 56 #include <inet/nd.h> 57 #include <sys/kmem.h> 58 #include <sys/zone.h> 59 60 #include <sys/tsol/label.h> 61 #include <sys/tsol/tnet.h> 62 63 #define IS_DEFAULT_ROUTE_V6(ire) \ 64 (((ire)->ire_type & IRE_DEFAULT) || \ 65 (((ire)->ire_type & IRE_INTERFACE) && \ 66 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) 67 68 static ire_t ire_null; 69 70 static ire_t * 71 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 72 const in6_addr_t *gateway, int type, const ill_t *ill, 73 zoneid_t zoneid, const ts_label_t *tsl, int flags, 74 ip_stack_t *ipst); 75 76 /* 77 * Initialize the ire that is specific to IPv6 part and call 78 * ire_init_common to finish it. 79 * Returns zero or errno. 80 */ 81 int 82 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, 83 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, 84 zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 85 { 86 int error; 87 88 /* 89 * Reject IRE security attmakeribute creation/initialization 90 * if system is not running in Trusted mode. 91 */ 92 if (gc != NULL && !is_system_labeled()) 93 return (EINVAL); 94 95 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); 96 if (v6addr != NULL) 97 ire->ire_addr_v6 = *v6addr; 98 if (v6gateway != NULL) 99 ire->ire_gateway_addr_v6 = *v6gateway; 100 101 /* Make sure we don't have stray values in some fields */ 102 switch (type) { 103 case IRE_LOOPBACK: 104 case IRE_HOST: 105 case IRE_LOCAL: 106 case IRE_IF_CLONE: 107 ire->ire_mask_v6 = ipv6_all_ones; 108 ire->ire_masklen = IPV6_ABITS; 109 break; 110 case IRE_PREFIX: 111 case IRE_DEFAULT: 112 case IRE_IF_RESOLVER: 113 case IRE_IF_NORESOLVER: 114 if (v6mask != NULL) { 115 ire->ire_mask_v6 = *v6mask; 116 ire->ire_masklen = 117 ip_mask_to_plen_v6(&ire->ire_mask_v6); 118 } 119 break; 120 case IRE_MULTICAST: 121 case IRE_NOROUTE: 122 ASSERT(v6mask == NULL); 123 break; 124 default: 125 ASSERT(0); 126 return (EINVAL); 127 } 128 129 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, 130 gc, ipst); 131 if (error != NULL) 132 return (error); 133 134 /* Determine which function pointers to use */ 135 ire->ire_postfragfn = ip_xmit; /* Common case */ 136 137 switch (ire->ire_type) { 138 case IRE_LOCAL: 139 ire->ire_sendfn = ire_send_local_v6; 140 ire->ire_recvfn = ire_recv_local_v6; 141 ASSERT(ire->ire_ill != NULL); 142 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 143 ire->ire_recvfn = ire_recv_noaccept_v6; 144 break; 145 case IRE_LOOPBACK: 146 ire->ire_sendfn = ire_send_local_v6; 147 ire->ire_recvfn = ire_recv_loopback_v6; 148 break; 149 case IRE_MULTICAST: 150 ire->ire_postfragfn = ip_postfrag_loopcheck; 151 ire->ire_sendfn = ire_send_multicast_v6; 152 ire->ire_recvfn = ire_recv_multicast_v6; 153 break; 154 default: 155 /* 156 * For IRE_IF_ALL and IRE_OFFLINK we forward received 157 * packets by default. 158 */ 159 ire->ire_sendfn = ire_send_wire_v6; 160 ire->ire_recvfn = ire_recv_forward_v6; 161 break; 162 } 163 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 164 ire->ire_sendfn = ire_send_noroute_v6; 165 ire->ire_recvfn = ire_recv_noroute_v6; 166 } else if (ire->ire_flags & RTF_MULTIRT) { 167 ire->ire_postfragfn = ip_postfrag_multirt_v6; 168 ire->ire_sendfn = ire_send_multirt_v6; 169 ire->ire_recvfn = ire_recv_multirt_v6; 170 } 171 ire->ire_nce_capable = ire_determine_nce_capable(ire); 172 return (0); 173 } 174 175 /* 176 * ire_create_v6 is called to allocate and initialize a new IRE. 177 * 178 * NOTE : This is called as writer sometimes though not required 179 * by this function. 180 */ 181 /* ARGSUSED */ 182 ire_t * 183 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 184 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, 185 uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 186 { 187 ire_t *ire; 188 int error; 189 190 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 191 192 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 193 if (ire == NULL) { 194 DTRACE_PROBE(kmem__cache__alloc); 195 return (NULL); 196 } 197 *ire = ire_null; 198 199 error = ire_init_v6(ire, v6addr, v6mask, v6gateway, 200 type, ill, zoneid, flags, gc, ipst); 201 202 if (error != 0) { 203 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); 204 kmem_cache_free(ire_cache, ire); 205 return (NULL); 206 } 207 return (ire); 208 } 209 210 /* 211 * Find the ill matching a multicast group. 212 * Allows different routes for multicast addresses 213 * in the unicast routing table (akin to FF::0/8 but could be more specific) 214 * which point at different interfaces. This is used when IPV6_MULTICAST_IF 215 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't 216 * specify the interface to join on. 217 * 218 * Supports link-local addresses by using ire_route_recursive which follows 219 * the ill when recursing. 220 * 221 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 222 * and the MULTIRT property can be different for different groups, we 223 * extract RTF_MULTIRT from the special unicast route added for a group 224 * with CGTP and pass that back in the multirtp argument. 225 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 226 * We have a setsrcp argument for the same reason. 227 */ 228 ill_t * 229 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, 230 ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) 231 { 232 ire_t *ire; 233 ill_t *ill; 234 235 ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, 236 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 237 ASSERT(ire != NULL); 238 239 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 240 ire_refrele(ire); 241 return (NULL); 242 } 243 244 if (multirtp != NULL) 245 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 246 247 ill = ire_nexthop_ill(ire); 248 ire_refrele(ire); 249 return (ill); 250 } 251 252 /* 253 * This function takes a mask and returns number of bits set in the 254 * mask (the represented prefix length). Assumes a contiguous mask. 255 */ 256 int 257 ip_mask_to_plen_v6(const in6_addr_t *v6mask) 258 { 259 int bits; 260 int plen = IPV6_ABITS; 261 int i; 262 263 for (i = 3; i >= 0; i--) { 264 if (v6mask->s6_addr32[i] == 0) { 265 plen -= 32; 266 continue; 267 } 268 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 269 if (bits == 0) 270 break; 271 plen -= bits; 272 } 273 274 return (plen); 275 } 276 277 /* 278 * Convert a prefix length to the mask for that prefix. 279 * Returns the argument bitmask. 280 */ 281 in6_addr_t * 282 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) 283 { 284 uint32_t *ptr; 285 286 if (plen < 0 || plen > IPV6_ABITS) 287 return (NULL); 288 *bitmask = ipv6_all_zeros; 289 if (plen == 0) 290 return (bitmask); 291 292 ptr = (uint32_t *)bitmask; 293 while (plen > 32) { 294 *ptr++ = 0xffffffffU; 295 plen -= 32; 296 } 297 *ptr = htonl(0xffffffffU << (32 - plen)); 298 return (bitmask); 299 } 300 301 /* 302 * Add a fully initialized IPv6 IRE to the forwarding table. 303 * This returns NULL on failure, or a held IRE on success. 304 * Normally the returned IRE is the same as the argument. But a different 305 * IRE will be returned if the added IRE is deemed identical to an existing 306 * one. In that case ire_identical_ref will be increased. 307 * The caller always needs to do an ire_refrele() on the returned IRE. 308 */ 309 ire_t * 310 ire_add_v6(ire_t *ire) 311 { 312 ire_t *ire1; 313 int mask_table_index; 314 irb_t *irb_ptr; 315 ire_t **irep; 316 int match_flags; 317 int error; 318 ip_stack_t *ipst = ire->ire_ipst; 319 320 ASSERT(ire->ire_ipversion == IPV6_VERSION); 321 322 /* Make sure the address is properly masked. */ 323 V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); 324 325 mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); 326 if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { 327 irb_t *ptr; 328 int i; 329 330 ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * 331 sizeof (irb_t))); 332 if (ptr == NULL) { 333 ire_delete(ire); 334 return (NULL); 335 } 336 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 337 rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); 338 ptr[i].irb_ipst = ipst; 339 } 340 mutex_enter(&ipst->ips_ire_ft_init_lock); 341 if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == 342 NULL) { 343 ipst->ips_ip_forwarding_table_v6[mask_table_index] = 344 ptr; 345 mutex_exit(&ipst->ips_ire_ft_init_lock); 346 } else { 347 /* 348 * Some other thread won the race in 349 * initializing the forwarding table at the 350 * same index. 351 */ 352 mutex_exit(&ipst->ips_ire_ft_init_lock); 353 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 354 rw_destroy(&ptr[i].irb_lock); 355 } 356 mi_free(ptr); 357 } 358 } 359 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ 360 IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, 361 ipst->ips_ip6_ftable_hash_size)]); 362 363 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 364 if (ire->ire_ill != NULL) 365 match_flags |= MATCH_IRE_ILL; 366 /* 367 * Start the atomic add of the ire. Grab the bucket lock and the 368 * ill lock. Check for condemned. 369 */ 370 error = ire_atomic_start(irb_ptr, ire); 371 if (error != 0) { 372 ire_delete(ire); 373 return (NULL); 374 } 375 376 /* 377 * If we are creating a hidden IRE, make sure we search for 378 * hidden IREs when searching for duplicates below. 379 * Otherwise, we might find an IRE on some other interface 380 * that's not marked hidden. 381 */ 382 if (ire->ire_testhidden) 383 match_flags |= MATCH_IRE_TESTHIDDEN; 384 385 /* 386 * Atomically check for duplicate and insert in the table. 387 */ 388 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 389 if (IRE_IS_CONDEMNED(ire1)) 390 continue; 391 /* 392 * Here we need an exact match on zoneid, i.e., 393 * ire_match_args doesn't fit. 394 */ 395 if (ire1->ire_zoneid != ire->ire_zoneid) 396 continue; 397 398 if (ire1->ire_type != ire->ire_type) 399 continue; 400 401 /* 402 * Note: We do not allow multiple routes that differ only 403 * in the gateway security attributes; such routes are 404 * considered duplicates. 405 * To change that we explicitly have to treat them as 406 * different here. 407 */ 408 if (ire_match_args_v6(ire1, &ire->ire_addr_v6, 409 &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, 410 ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, 411 match_flags)) { 412 /* 413 * Return the old ire after doing a REFHOLD. 414 * As most of the callers continue to use the IRE 415 * after adding, we return a held ire. This will 416 * avoid a lookup in the caller again. If the callers 417 * don't want to use it, they need to do a REFRELE. 418 */ 419 ip1dbg(("found dup ire existing %p new %p", 420 (void *)ire1, (void *)ire)); 421 ire_refhold(ire1); 422 atomic_add_32(&ire1->ire_identical_ref, 1); 423 ire_atomic_end(irb_ptr, ire); 424 ire_delete(ire); 425 return (ire1); 426 } 427 } 428 429 /* 430 * Normally we do head insertion since most things do not care about 431 * the order of the IREs in the bucket. 432 * However, due to shared-IP zones (and restrict_interzone_loopback) 433 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 434 * address. For that reason we do tail insertion for IRE_IF_CLONE. 435 */ 436 irep = (ire_t **)irb_ptr; 437 if (ire->ire_type & IRE_IF_CLONE) { 438 while ((ire1 = *irep) != NULL) 439 irep = &ire1->ire_next; 440 } 441 /* Insert at *irep */ 442 ire1 = *irep; 443 if (ire1 != NULL) 444 ire1->ire_ptpn = &ire->ire_next; 445 ire->ire_next = ire1; 446 /* Link the new one in. */ 447 ire->ire_ptpn = irep; 448 /* 449 * ire_walk routines de-reference ire_next without holding 450 * a lock. Before we point to the new ire, we want to make 451 * sure the store that sets the ire_next of the new ire 452 * reaches global visibility, so that ire_walk routines 453 * don't see a truncated list of ires i.e if the ire_next 454 * of the new ire gets set after we do "*irep = ire" due 455 * to re-ordering, the ire_walk thread will see a NULL 456 * once it accesses the ire_next of the new ire. 457 * membar_producer() makes sure that the following store 458 * happens *after* all of the above stores. 459 */ 460 membar_producer(); 461 *irep = ire; 462 ire->ire_bucket = irb_ptr; 463 /* 464 * We return a bumped up IRE above. Keep it symmetrical 465 * so that the callers will always have to release. This 466 * helps the callers of this function because they continue 467 * to use the IRE after adding and hence they don't have to 468 * lookup again after we return the IRE. 469 * 470 * NOTE : We don't have to use atomics as this is appearing 471 * in the list for the first time and no one else can bump 472 * up the reference count on this yet. 473 */ 474 ire_refhold_locked(ire); 475 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); 476 irb_ptr->irb_ire_cnt++; 477 478 if (ire->ire_ill != NULL) { 479 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, 480 (char *), "ire", (void *), ire); 481 ire->ire_ill->ill_ire_cnt++; 482 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 483 } 484 ire_atomic_end(irb_ptr, ire); 485 486 /* Make any caching of the IREs be notified or updated */ 487 ire_flush_cache_v6(ire, IRE_FLUSH_ADD); 488 489 return (ire); 490 } 491 492 /* 493 * Search for all HOST REDIRECT routes that are 494 * pointing at the specified gateway and 495 * delete them. This routine is called only 496 * when a default gateway is going away. 497 */ 498 static void 499 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) 500 { 501 irb_t *irb_ptr; 502 irb_t *irb; 503 ire_t *ire; 504 in6_addr_t gw_addr_v6; 505 int i; 506 507 /* get the hash table for HOST routes */ 508 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)]; 509 if (irb_ptr == NULL) 510 return; 511 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { 512 irb = &irb_ptr[i]; 513 irb_refhold(irb); 514 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 515 if (!(ire->ire_flags & RTF_DYNAMIC)) 516 continue; 517 mutex_enter(&ire->ire_lock); 518 gw_addr_v6 = ire->ire_gateway_addr_v6; 519 mutex_exit(&ire->ire_lock); 520 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) 521 ire_delete(ire); 522 } 523 irb_refrele(irb); 524 } 525 } 526 527 /* 528 * Delete the specified IRE. 529 * All calls should use ire_delete(). 530 * Sometimes called as writer though not required by this function. 531 * 532 * NOTE : This function is called only if the ire was added 533 * in the list. 534 */ 535 void 536 ire_delete_v6(ire_t *ire) 537 { 538 in6_addr_t gw_addr_v6; 539 ip_stack_t *ipst = ire->ire_ipst; 540 541 /* 542 * Make sure ire_generation increases from ire_flush_cache happen 543 * after any lookup/reader has read ire_generation. 544 * Since the rw_enter makes us wait until any lookup/reader has 545 * completed we can exit the lock immediately. 546 */ 547 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 548 rw_exit(&ipst->ips_ip6_ire_head_lock); 549 550 ASSERT(ire->ire_refcnt >= 1); 551 ASSERT(ire->ire_ipversion == IPV6_VERSION); 552 553 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); 554 555 if (ire->ire_type == IRE_DEFAULT) { 556 /* 557 * when a default gateway is going away 558 * delete all the host redirects pointing at that 559 * gateway. 560 */ 561 mutex_enter(&ire->ire_lock); 562 gw_addr_v6 = ire->ire_gateway_addr_v6; 563 mutex_exit(&ire->ire_lock); 564 ire_delete_host_redirects_v6(&gw_addr_v6, ipst); 565 } 566 567 /* 568 * If we are deleting an IRE_INTERFACE then we make sure we also 569 * delete any IRE_IF_CLONE that has been created from it. 570 * Those are always in ire_dep_children. 571 */ 572 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) 573 ire_dep_delete_if_clone(ire); 574 575 /* Remove from parent dependencies and child */ 576 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 577 if (ire->ire_dep_parent != NULL) { 578 ire_dep_remove(ire); 579 } 580 while (ire->ire_dep_children != NULL) 581 ire_dep_remove(ire->ire_dep_children); 582 rw_exit(&ipst->ips_ire_dep_lock); 583 } 584 585 /* 586 * When an IRE is added or deleted this routine is called to make sure 587 * any caching of IRE information is notified or updated. 588 * 589 * The flag argument indicates if the flush request is due to addition 590 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 591 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 592 */ 593 void 594 ire_flush_cache_v6(ire_t *ire, int flag) 595 { 596 ip_stack_t *ipst = ire->ire_ipst; 597 598 /* 599 * IRE_IF_CLONE ire's don't provide any new information 600 * than the parent from which they are cloned, so don't 601 * perturb the generation numbers. 602 */ 603 if (ire->ire_type & IRE_IF_CLONE) 604 return; 605 606 /* 607 * Ensure that an ire_add during a lookup serializes the updates of 608 * the generation numbers under ire_head_lock so that the lookup gets 609 * either the old ire and old generation number, or a new ire and new 610 * generation number. 611 */ 612 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 613 614 /* 615 * If a route was just added, we need to notify everybody that 616 * has cached an IRE_NOROUTE since there might now be a better 617 * route for them. 618 */ 619 if (flag == IRE_FLUSH_ADD) { 620 ire_increment_generation(ipst->ips_ire_reject_v6); 621 ire_increment_generation(ipst->ips_ire_blackhole_v6); 622 } 623 624 /* Adding a default can't otherwise provide a better route */ 625 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 626 rw_exit(&ipst->ips_ip6_ire_head_lock); 627 return; 628 } 629 630 switch (flag) { 631 case IRE_FLUSH_DELETE: 632 case IRE_FLUSH_GWCHANGE: 633 /* 634 * Update ire_generation for all ire_dep_children chains 635 * starting with this IRE 636 */ 637 ire_dep_incr_generation(ire); 638 break; 639 case IRE_FLUSH_ADD: { 640 in6_addr_t addr; 641 in6_addr_t mask; 642 ip_stack_t *ipst = ire->ire_ipst; 643 uint_t masklen; 644 645 /* 646 * Find an IRE which is a shorter match than the ire to be added 647 * For any such IRE (which we repeat) we update the 648 * ire_generation the same way as in the delete case. 649 */ 650 addr = ire->ire_addr_v6; 651 mask = ire->ire_mask_v6; 652 masklen = ip_mask_to_plen_v6(&mask); 653 654 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, 655 ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 656 while (ire != NULL) { 657 /* We need to handle all in the same bucket */ 658 irb_increment_generation(ire->ire_bucket); 659 660 mask = ire->ire_mask_v6; 661 ASSERT(masklen > ip_mask_to_plen_v6(&mask)); 662 masklen = ip_mask_to_plen_v6(&mask); 663 ire_refrele(ire); 664 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, 665 NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 666 } 667 } 668 break; 669 } 670 rw_exit(&ipst->ips_ip6_ire_head_lock); 671 } 672 673 /* 674 * Matches the arguments passed with the values in the ire. 675 * 676 * Note: for match types that match using "ill" passed in, ill 677 * must be checked for non-NULL before calling this routine. 678 */ 679 boolean_t 680 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, 681 const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, 682 const ts_label_t *tsl, int match_flags) 683 { 684 in6_addr_t masked_addr; 685 in6_addr_t gw_addr_v6; 686 ill_t *ire_ill = NULL, *dst_ill; 687 ip_stack_t *ipst = ire->ire_ipst; 688 689 ASSERT(ire->ire_ipversion == IPV6_VERSION); 690 ASSERT(addr != NULL); 691 ASSERT(mask != NULL); 692 ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); 693 ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) || 694 (ill != NULL && ill->ill_isv6)); 695 696 /* 697 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it 698 * is in fact hidden, to ensure the caller gets the right one. 699 */ 700 if (ire->ire_testhidden) { 701 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) 702 return (B_FALSE); 703 } 704 705 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 706 ire->ire_zoneid != ALL_ZONES) { 707 /* 708 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid 709 * does not match that of ire_zoneid, a failure to 710 * match is reported at this point. Otherwise, since some IREs 711 * that are available in the global zone can be used in local 712 * zones, additional checks need to be performed: 713 * 714 * IRE_LOOPBACK 715 * entries should never be matched in this situation. 716 * Each zone has its own IRE_LOOPBACK. 717 * 718 * IRE_LOCAL 719 * We allow them for any zoneid. ire_route_recursive 720 * does additional checks when 721 * ip_restrict_interzone_loopback is set. 722 * 723 * If ill_usesrc_ifindex is set 724 * Then we check if the zone has a valid source address 725 * on the usesrc ill. 726 * 727 * If ire_ill is set, then check that the zone has an ipif 728 * on that ill. 729 * 730 * Outside of this function (in ire_round_robin) we check 731 * that any IRE_OFFLINK has a gateway that reachable from the 732 * zone when we have multiple choices (ECMP). 733 */ 734 if (match_flags & MATCH_IRE_ZONEONLY) 735 return (B_FALSE); 736 if (ire->ire_type & IRE_LOOPBACK) 737 return (B_FALSE); 738 739 if (ire->ire_type & IRE_LOCAL) 740 goto matchit; 741 742 /* 743 * The normal case of IRE_ONLINK has a matching zoneid. 744 * Here we handle the case when shared-IP zones have been 745 * configured with IP addresses on vniN. In that case it 746 * is ok for traffic from a zone to use IRE_ONLINK routes 747 * if the ill has a usesrc pointing at vniN 748 * Applies to IRE_INTERFACE. 749 */ 750 dst_ill = ire->ire_ill; 751 if (ire->ire_type & IRE_ONLINK) { 752 uint_t ifindex; 753 754 /* 755 * Note there is no IRE_INTERFACE on vniN thus 756 * can't do an IRE lookup for a matching route. 757 */ 758 ifindex = dst_ill->ill_usesrc_ifindex; 759 if (ifindex == 0) 760 return (B_FALSE); 761 762 /* 763 * If there is a usable source address in the 764 * zone, then it's ok to return this IRE_INTERFACE 765 */ 766 if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 767 zoneid, ipst)) { 768 ip3dbg(("ire_match_args: no usrsrc for zone" 769 " dst_ill %p\n", (void *)dst_ill)); 770 return (B_FALSE); 771 } 772 } 773 /* 774 * For example, with 775 * route add 11.0.0.0 gw1 -ifp bge0 776 * route add 11.0.0.0 gw2 -ifp bge1 777 * this code would differentiate based on 778 * where the sending zone has addresses. 779 * Only if the zone has an address on bge0 can it use the first 780 * route. It isn't clear if this behavior is documented 781 * anywhere. 782 */ 783 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 784 ipif_t *tipif; 785 786 mutex_enter(&dst_ill->ill_lock); 787 for (tipif = dst_ill->ill_ipif; 788 tipif != NULL; tipif = tipif->ipif_next) { 789 if (!IPIF_IS_CONDEMNED(tipif) && 790 (tipif->ipif_flags & IPIF_UP) && 791 (tipif->ipif_zoneid == zoneid || 792 tipif->ipif_zoneid == ALL_ZONES)) 793 break; 794 } 795 mutex_exit(&dst_ill->ill_lock); 796 if (tipif == NULL) 797 return (B_FALSE); 798 } 799 } 800 801 matchit: 802 ire_ill = ire->ire_ill; 803 if (match_flags & MATCH_IRE_GW) { 804 mutex_enter(&ire->ire_lock); 805 gw_addr_v6 = ire->ire_gateway_addr_v6; 806 mutex_exit(&ire->ire_lock); 807 } 808 if (match_flags & MATCH_IRE_ILL) { 809 810 /* 811 * If asked to match an ill, we *must* match 812 * on the ire_ill for ipmp test addresses, or 813 * any of the ill in the group for data addresses. 814 * If we don't, we may as well fail. 815 * However, we need an exception for IRE_LOCALs to ensure 816 * we loopback packets even sent to test addresses on different 817 * interfaces in the group. 818 */ 819 if ((match_flags & MATCH_IRE_TESTHIDDEN) && 820 !(ire->ire_type & IRE_LOCAL)) { 821 if (ire->ire_ill != ill) 822 return (B_FALSE); 823 } else { 824 match_flags &= ~MATCH_IRE_TESTHIDDEN; 825 /* 826 * We know that ill is not NULL, but ire_ill could be 827 * NULL 828 */ 829 if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) 830 return (B_FALSE); 831 } 832 } 833 if (match_flags & MATCH_IRE_SRC_ILL) { 834 if (ire_ill == NULL) 835 return (B_FALSE); 836 if (!IS_ON_SAME_LAN(ill, ire_ill)) { 837 if (ire_ill->ill_usesrc_ifindex == 0 || 838 (ire_ill->ill_usesrc_ifindex != 839 ill->ill_phyint->phyint_ifindex)) 840 return (B_FALSE); 841 } 842 } 843 844 /* No ire_addr_v6 bits set past the mask */ 845 ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, 846 ire->ire_addr_v6)); 847 V6_MASK_COPY(*addr, *mask, masked_addr); 848 if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && 849 ((!(match_flags & MATCH_IRE_GW)) || 850 IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && 851 ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && 852 ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && 853 ((!(match_flags & MATCH_IRE_MASK)) || 854 (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && 855 ((!(match_flags & MATCH_IRE_SECATTR)) || 856 (!is_system_labeled()) || 857 (tsol_ire_match_gwattr(ire, tsl) == 0))) { 858 /* We found the matched IRE */ 859 return (B_TRUE); 860 } 861 return (B_FALSE); 862 } 863 864 /* 865 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified 866 * gateway address. If ill is non-NULL we also match on it. 867 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 868 */ 869 boolean_t 870 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, 871 const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) 872 { 873 ire_t *ire; 874 uint_t match_flags; 875 876 if (lock_held) 877 ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); 878 else 879 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 880 881 match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; 882 if (ill != NULL) 883 match_flags |= MATCH_IRE_ILL; 884 885 ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, 886 &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, 887 ipst); 888 889 if (!lock_held) 890 rw_exit(&ipst->ips_ip6_ire_head_lock); 891 if (ire != NULL) { 892 ire_refrele(ire); 893 return (B_TRUE); 894 } else { 895 return (B_FALSE); 896 } 897 } 898 899 /* 900 * Lookup a route in forwarding table. 901 * specific lookup is indicated by passing the 902 * required parameters and indicating the 903 * match required in flag field. 904 * 905 * Supports link-local addresses by following the ipif/ill when recursing. 906 */ 907 ire_t * 908 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, 909 const in6_addr_t *gateway, int type, const ill_t *ill, 910 zoneid_t zoneid, const ts_label_t *tsl, int flags, 911 uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 912 { 913 ire_t *ire = NULL; 914 915 ASSERT(addr != NULL); 916 ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); 917 ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); 918 ASSERT(ill == NULL || ill->ill_isv6); 919 920 ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); 921 922 /* 923 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL 924 * or MATCH_IRE_SRC_ILL is set. 925 */ 926 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) 927 return (NULL); 928 929 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 930 ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, 931 tsl, flags, ipst); 932 if (ire == NULL) { 933 rw_exit(&ipst->ips_ip6_ire_head_lock); 934 return (NULL); 935 } 936 937 /* 938 * round-robin only if we have more than one route in the bucket. 939 * ips_ip_ecmp_behavior controls when we do ECMP 940 * 2: always 941 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 942 * 0: never 943 * 944 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 945 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 946 * and the IRE_INTERFACESs are likely to be shorter matches. 947 */ 948 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 949 if (ipst->ips_ip_ecmp_behavior == 2 || 950 (ipst->ips_ip_ecmp_behavior == 1 && 951 IS_DEFAULT_ROUTE_V6(ire))) { 952 ire_t *next_ire; 953 ire_ftable_args_t margs; 954 955 bzero(&margs, sizeof (margs)); 956 margs.ift_addr_v6 = *addr; 957 if (mask != NULL) 958 margs.ift_mask_v6 = *mask; 959 if (gateway != NULL) 960 margs.ift_gateway_v6 = *gateway; 961 margs.ift_type = type; 962 margs.ift_ill = ill; 963 margs.ift_zoneid = zoneid; 964 margs.ift_tsl = tsl; 965 margs.ift_flags = flags; 966 967 next_ire = ire_round_robin(ire->ire_bucket, &margs, 968 xmit_hint, ire, ipst); 969 if (next_ire == NULL) { 970 /* keep ire if next_ire is null */ 971 goto done; 972 } 973 ire_refrele(ire); 974 ire = next_ire; 975 } 976 } 977 978 done: 979 /* Return generation before dropping lock */ 980 if (generationp != NULL) 981 *generationp = ire->ire_generation; 982 983 rw_exit(&ipst->ips_ip6_ire_head_lock); 984 985 /* 986 * For shared-IP zones we need additional checks to what was 987 * done in ire_match_args to make sure IRE_LOCALs are handled. 988 * 989 * When ip_restrict_interzone_loopback is set, then 990 * we ensure that IRE_LOCAL are only used for loopback 991 * between zones when the logical "Ethernet" would 992 * have looped them back. That is, if in the absense of 993 * the IRE_LOCAL we would have sent to packet out the 994 * same ill. 995 */ 996 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 997 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 998 ipst->ips_ip_restrict_interzone_loopback) { 999 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 1000 ASSERT(ire != NULL); 1001 } 1002 1003 return (ire); 1004 } 1005 1006 /* 1007 * Look up a single ire. The caller holds either the read or write lock. 1008 */ 1009 ire_t * 1010 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 1011 const in6_addr_t *gateway, int type, const ill_t *ill, 1012 zoneid_t zoneid, const ts_label_t *tsl, int flags, 1013 ip_stack_t *ipst) 1014 { 1015 irb_t *irb_ptr; 1016 ire_t *ire = NULL; 1017 int i; 1018 1019 ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); 1020 1021 /* 1022 * If the mask is known, the lookup 1023 * is simple, if the mask is not known 1024 * we need to search. 1025 */ 1026 if (flags & MATCH_IRE_MASK) { 1027 uint_t masklen; 1028 1029 masklen = ip_mask_to_plen_v6(mask); 1030 if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { 1031 return (NULL); 1032 } 1033 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ 1034 IRE_ADDR_MASK_HASH_V6(*addr, *mask, 1035 ipst->ips_ip6_ftable_hash_size)]); 1036 rw_enter(&irb_ptr->irb_lock, RW_READER); 1037 for (ire = irb_ptr->irb_ire; ire != NULL; 1038 ire = ire->ire_next) { 1039 if (IRE_IS_CONDEMNED(ire)) 1040 continue; 1041 if (ire_match_args_v6(ire, addr, mask, gateway, type, 1042 ill, zoneid, tsl, flags)) 1043 goto found_ire; 1044 } 1045 rw_exit(&irb_ptr->irb_lock); 1046 } else { 1047 uint_t masklen; 1048 1049 /* 1050 * In this case we don't know the mask, we need to 1051 * search the table assuming different mask sizes. 1052 */ 1053 if (flags & MATCH_IRE_SHORTERMASK) { 1054 masklen = ip_mask_to_plen_v6(mask); 1055 if (masklen == 0) { 1056 /* Nothing shorter than zero */ 1057 return (NULL); 1058 } 1059 masklen--; 1060 } else { 1061 masklen = IP6_MASK_TABLE_SIZE - 1; 1062 } 1063 1064 for (i = masklen; i >= 0; i--) { 1065 in6_addr_t tmpmask; 1066 1067 if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) 1068 continue; 1069 (void) ip_plen_to_mask_v6(i, &tmpmask); 1070 irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][ 1071 IRE_ADDR_MASK_HASH_V6(*addr, tmpmask, 1072 ipst->ips_ip6_ftable_hash_size)]; 1073 rw_enter(&irb_ptr->irb_lock, RW_READER); 1074 for (ire = irb_ptr->irb_ire; ire != NULL; 1075 ire = ire->ire_next) { 1076 if (IRE_IS_CONDEMNED(ire)) 1077 continue; 1078 if (ire_match_args_v6(ire, addr, 1079 &ire->ire_mask_v6, gateway, type, ill, 1080 zoneid, tsl, flags)) 1081 goto found_ire; 1082 } 1083 rw_exit(&irb_ptr->irb_lock); 1084 } 1085 } 1086 ASSERT(ire == NULL); 1087 ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); 1088 return (NULL); 1089 1090 found_ire: 1091 ire_refhold(ire); 1092 rw_exit(&irb_ptr->irb_lock); 1093 return (ire); 1094 } 1095 1096 1097 /* 1098 * This function is called by 1099 * ip_input/ire_route_recursive when doing a route lookup on only the 1100 * destination address. 1101 * 1102 * The optimizations of this function over ire_ftable_lookup are: 1103 * o removing unnecessary flag matching 1104 * o doing longest prefix match instead of overloading it further 1105 * with the unnecessary "best_prefix_match" 1106 * 1107 * If no route is found we return IRE_NOROUTE. 1108 */ 1109 ire_t * 1110 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, 1111 ip_stack_t *ipst, uint_t *generationp) 1112 { 1113 ire_t *ire; 1114 1115 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, 1116 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); 1117 if (ire == NULL) { 1118 ire = ire_reject(ipst, B_TRUE); 1119 if (generationp != NULL) 1120 *generationp = IRE_GENERATION_VERIFY; 1121 } 1122 /* ftable_lookup did round robin */ 1123 return (ire); 1124 } 1125 1126 ire_t * 1127 ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src, 1128 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 1129 int *errorp, boolean_t *multirtp) 1130 { 1131 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); 1132 1133 return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp, 1134 multirtp)); 1135 } 1136 1137 /* 1138 * Recursively look for a route to the destination. Can also match on 1139 * the zoneid, ill, and label. Used for the data paths. See also 1140 * ire_route_recursive_dstonly. 1141 * 1142 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1143 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1144 * forwarding. 1145 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1146 * resolve the gateway. 1147 * 1148 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1149 * instead. 1150 * 1151 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1152 * is an error. 1153 * Allow at most one RTF_INDIRECT. 1154 */ 1155 ire_t * 1156 ire_route_recursive_impl_v6(ire_t *ire, 1157 const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, 1158 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1159 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1160 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1161 { 1162 int i, j; 1163 in6_addr_t v6nexthop = *nexthop; 1164 ire_t *ires[MAX_IRE_RECURSION]; 1165 uint_t generation; 1166 uint_t generations[MAX_IRE_RECURSION]; 1167 boolean_t need_refrele = B_FALSE; 1168 boolean_t invalidate = B_FALSE; 1169 int prefs[MAX_IRE_RECURSION]; 1170 ill_t *ill = NULL; 1171 1172 if (setsrcp != NULL) 1173 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1174 if (gwattrp != NULL) 1175 ASSERT(*gwattrp == NULL); 1176 1177 /* 1178 * We iterate up to three times to resolve a route, even though 1179 * we have four slots in the array. The extra slot is for an 1180 * IRE_IF_CLONE we might need to create. 1181 */ 1182 i = 0; 1183 while (i < MAX_IRE_RECURSION - 1) { 1184 /* ire_ftable_lookup handles round-robin/ECMP */ 1185 if (ire == NULL) { 1186 ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, 1187 (ill != NULL ? ill : ill_arg), zoneid, tsl, 1188 match_args, xmit_hint, ipst, &generation); 1189 } else { 1190 /* Caller passed it; extra hold since we will rele */ 1191 ire_refhold(ire); 1192 if (generationp != NULL) 1193 generation = *generationp; 1194 else 1195 generation = IRE_GENERATION_VERIFY; 1196 } 1197 1198 if (ire == NULL) 1199 ire = ire_reject(ipst, B_TRUE); 1200 1201 /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1202 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1203 goto error; 1204 1205 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1206 1207 if (i != 0) { 1208 prefs[i] = ire_pref(ire); 1209 /* 1210 * Don't allow anything unusual past the first 1211 * iteration. 1212 */ 1213 if ((ire->ire_type & 1214 (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 1215 prefs[i] <= prefs[i-1]) { 1216 ire_refrele(ire); 1217 if (irr_flags & IRR_INCOMPLETE) { 1218 ire = ires[0]; 1219 ire_refhold(ire); 1220 } else { 1221 ire = ire_reject(ipst, B_TRUE); 1222 } 1223 goto error; 1224 } 1225 } 1226 /* We have a usable IRE */ 1227 ires[i] = ire; 1228 generations[i] = generation; 1229 i++; 1230 1231 /* The first RTF_SETSRC address is passed back if setsrcp */ 1232 if ((ire->ire_flags & RTF_SETSRC) && 1233 setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { 1234 ASSERT(!IN6_IS_ADDR_UNSPECIFIED( 1235 &ire->ire_setsrc_addr_v6)); 1236 *setsrcp = ire->ire_setsrc_addr_v6; 1237 } 1238 1239 /* The first ire_gw_secattr is passed back if gwattrp */ 1240 if (ire->ire_gw_secattr != NULL && 1241 gwattrp != NULL && *gwattrp == NULL) 1242 *gwattrp = ire->ire_gw_secattr; 1243 1244 /* 1245 * Check if we have a short-cut pointer to an IRE for this 1246 * destination, and that the cached dependency isn't stale. 1247 * In that case we've rejoined an existing tree towards a 1248 * parent, thus we don't need to continue the loop to 1249 * discover the rest of the tree. 1250 */ 1251 mutex_enter(&ire->ire_lock); 1252 if (ire->ire_dep_parent != NULL && 1253 ire->ire_dep_parent->ire_generation == 1254 ire->ire_dep_parent_generation) { 1255 mutex_exit(&ire->ire_lock); 1256 ire = NULL; 1257 goto done; 1258 } 1259 mutex_exit(&ire->ire_lock); 1260 1261 /* 1262 * If this type should have an ire_nce_cache (even if it 1263 * doesn't yet have one) then we are done. Includes 1264 * IRE_INTERFACE with a full 128 bit mask. 1265 */ 1266 if (ire->ire_nce_capable) { 1267 ire = NULL; 1268 goto done; 1269 } 1270 ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1271 /* 1272 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1273 * particular destination 1274 */ 1275 if (ire->ire_type & IRE_INTERFACE) { 1276 ire_t *clone; 1277 1278 ASSERT(ire->ire_masklen != IPV6_ABITS); 1279 1280 /* 1281 * In the case of ip_input and ILLF_FORWARDING not 1282 * being set, and in the case of RTM_GET, there is 1283 * no point in allocating an IRE_IF_CLONE. We return 1284 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can 1285 * result in a ire_dep_parent which is IRE_IF_* 1286 * without an IRE_IF_CLONE. 1287 * We recover from that when we need to send packets 1288 * by ensuring that the generations become 1289 * IRE_GENERATION_VERIFY in this case. 1290 */ 1291 if (!(irr_flags & IRR_ALLOCATE)) { 1292 invalidate = B_TRUE; 1293 ire = NULL; 1294 goto done; 1295 } 1296 1297 clone = ire_create_if_clone(ire, &v6nexthop, 1298 &generation); 1299 if (clone == NULL) { 1300 /* 1301 * Temporary failure - no memory. 1302 * Don't want caller to cache IRE_NOROUTE. 1303 */ 1304 invalidate = B_TRUE; 1305 ire = ire_blackhole(ipst, B_TRUE); 1306 goto error; 1307 } 1308 /* 1309 * Make clone next to last entry and the 1310 * IRE_INTERFACE the last in the dependency 1311 * chain since the clone depends on the 1312 * IRE_INTERFACE. 1313 */ 1314 ASSERT(i >= 1); 1315 ASSERT(i < MAX_IRE_RECURSION); 1316 1317 ires[i] = ires[i-1]; 1318 generations[i] = generations[i-1]; 1319 ires[i-1] = clone; 1320 generations[i-1] = generation; 1321 i++; 1322 1323 ire = NULL; 1324 goto done; 1325 } 1326 1327 /* 1328 * We only match on the type and optionally ILL when 1329 * recursing. The type match is used by some callers 1330 * to exclude certain types (such as IRE_IF_CLONE or 1331 * IRE_LOCAL|IRE_LOOPBACK). 1332 * 1333 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof' 1334 * ire->ire_ill, and we want to find the IRE_INTERFACE for 1335 * ire_ill, so we set ill to the ire_ill 1336 */ 1337 match_args &= MATCH_IRE_TYPE; 1338 v6nexthop = ire->ire_gateway_addr_v6; 1339 if (ill == NULL && ire->ire_ill != NULL) { 1340 ill = ire->ire_ill; 1341 need_refrele = B_TRUE; 1342 ill_refhold(ill); 1343 match_args |= MATCH_IRE_ILL; 1344 } 1345 /* 1346 * We set the prefs[i] value above if i > 0. We've already 1347 * done i++ so i is one in the case of the first time around. 1348 */ 1349 if (i == 1) 1350 prefs[0] = ire_pref(ire); 1351 ire = NULL; 1352 } 1353 ASSERT(ire == NULL); 1354 ire = ire_reject(ipst, B_TRUE); 1355 1356 error: 1357 ASSERT(ire != NULL); 1358 if (need_refrele) 1359 ill_refrele(ill); 1360 1361 /* 1362 * In the case of MULTIRT we want to try a different IRE the next 1363 * time. We let the next packet retry in that case. 1364 */ 1365 if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1366 (void) ire_no_good(ires[0]); 1367 1368 cleanup: 1369 /* cleanup ires[i] */ 1370 ire_dep_unbuild(ires, i); 1371 for (j = 0; j < i; j++) 1372 ire_refrele(ires[j]); 1373 1374 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1375 (irr_flags & IRR_INCOMPLETE)); 1376 /* 1377 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1378 * ip_select_route since the reject or lack of memory might be gone. 1379 */ 1380 if (generationp != NULL) 1381 *generationp = IRE_GENERATION_VERIFY; 1382 return (ire); 1383 1384 done: 1385 ASSERT(ire == NULL); 1386 if (need_refrele) 1387 ill_refrele(ill); 1388 1389 /* Build dependencies */ 1390 if (i > 1 && !ire_dep_build(ires, generations, i)) { 1391 /* Something in chain was condemned; tear it apart */ 1392 ire = ire_blackhole(ipst, B_TRUE); 1393 goto cleanup; 1394 } 1395 1396 /* 1397 * Release all refholds except the one for ires[0] that we 1398 * will return to the caller. 1399 */ 1400 for (j = 1; j < i; j++) 1401 ire_refrele(ires[j]); 1402 1403 if (invalidate) { 1404 /* 1405 * Since we needed to allocate but couldn't we need to make 1406 * sure that the dependency chain is rebuilt the next time. 1407 */ 1408 ire_dep_invalidate_generations(ires[0]); 1409 generation = IRE_GENERATION_VERIFY; 1410 } else { 1411 /* 1412 * IREs can have been added or deleted while we did the 1413 * recursive lookup and we can't catch those until we've built 1414 * the dependencies. We verify the stored 1415 * ire_dep_parent_generation to catch any such changes and 1416 * return IRE_GENERATION_VERIFY (which will cause 1417 * ip_select_route to be called again so we can redo the 1418 * recursive lookup next time we send a packet. 1419 */ 1420 if (ires[0]->ire_dep_parent == NULL) 1421 generation = ires[0]->ire_generation; 1422 else 1423 generation = ire_dep_validate_generations(ires[0]); 1424 if (generations[0] != ires[0]->ire_generation) { 1425 /* Something changed at the top */ 1426 generation = IRE_GENERATION_VERIFY; 1427 } 1428 } 1429 if (generationp != NULL) 1430 *generationp = generation; 1431 1432 return (ires[0]); 1433 } 1434 1435 ire_t * 1436 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, 1437 const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1438 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1439 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1440 { 1441 return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, 1442 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, 1443 gwattrp, generationp)); 1444 } 1445 1446 /* 1447 * Recursively look for a route to the destination. 1448 * We only handle a destination match here, yet we have the same arguments 1449 * as the full match to allow function pointers to select between the two. 1450 * 1451 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1452 * instead. 1453 * 1454 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1455 * is an error. 1456 * Allow at most one RTF_INDIRECT. 1457 */ 1458 ire_t * 1459 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags, 1460 uint32_t xmit_hint, ip_stack_t *ipst) 1461 { 1462 ire_t *ire; 1463 ire_t *ire1; 1464 uint_t generation; 1465 1466 /* ire_ftable_lookup handles round-robin/ECMP */ 1467 ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, 1468 &generation); 1469 ASSERT(ire != NULL); 1470 1471 /* 1472 * If this type should have an ire_nce_cache (even if it 1473 * doesn't yet have one) then we are done. Includes 1474 * IRE_INTERFACE with a full 128 bit mask. 1475 */ 1476 if (ire->ire_nce_capable) 1477 return (ire); 1478 1479 /* 1480 * If the IRE has a current cached parent we know that the whole 1481 * parent chain is current, hence we don't need to discover and 1482 * build any dependencies by doing a recursive lookup. 1483 */ 1484 mutex_enter(&ire->ire_lock); 1485 if (ire->ire_dep_parent != NULL && 1486 ire->ire_dep_parent->ire_generation == 1487 ire->ire_dep_parent_generation) { 1488 mutex_exit(&ire->ire_lock); 1489 return (ire); 1490 } 1491 mutex_exit(&ire->ire_lock); 1492 1493 /* 1494 * Fallback to loop in the normal code starting with the ire 1495 * we found. Normally this would return the same ire. 1496 */ 1497 ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, 1498 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1499 &generation); 1500 ire_refrele(ire); 1501 return (ire1); 1502 } 1503