1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 1990 Mentat Inc. 27 */ 28 29 /* 30 * This file contains routines that manipulate Internet Routing Entries (IREs). 
31 */ 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/stropts.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 38 #include <sys/systm.h> 39 #include <sys/param.h> 40 #include <sys/socket.h> 41 #include <net/if.h> 42 #include <net/route.h> 43 #include <netinet/in.h> 44 #include <net/if_dl.h> 45 #include <netinet/ip6.h> 46 #include <netinet/icmp6.h> 47 48 #include <inet/common.h> 49 #include <inet/mi.h> 50 #include <inet/ip.h> 51 #include <inet/ip6.h> 52 #include <inet/ip_ndp.h> 53 #include <inet/ip_if.h> 54 #include <inet/ip_ire.h> 55 #include <inet/ipclassifier.h> 56 #include <inet/nd.h> 57 #include <inet/tunables.h> 58 #include <sys/kmem.h> 59 #include <sys/zone.h> 60 61 #include <sys/tsol/label.h> 62 #include <sys/tsol/tnet.h> 63 64 #define IS_DEFAULT_ROUTE_V6(ire) \ 65 (((ire)->ire_type & IRE_DEFAULT) || \ 66 (((ire)->ire_type & IRE_INTERFACE) && \ 67 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) 68 69 static ire_t ire_null; 70 71 static ire_t * 72 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 73 const in6_addr_t *gateway, int type, const ill_t *ill, 74 zoneid_t zoneid, const ts_label_t *tsl, int flags, 75 ip_stack_t *ipst); 76 77 /* 78 * Initialize the ire that is specific to IPv6 part and call 79 * ire_init_common to finish it. 80 * Returns zero or errno. 81 */ 82 int 83 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, 84 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, 85 zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 86 { 87 int error; 88 89 /* 90 * Reject IRE security attmakeribute creation/initialization 91 * if system is not running in Trusted mode. 
92 */ 93 if (gc != NULL && !is_system_labeled()) 94 return (EINVAL); 95 96 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); 97 if (v6addr != NULL) 98 ire->ire_addr_v6 = *v6addr; 99 if (v6gateway != NULL) 100 ire->ire_gateway_addr_v6 = *v6gateway; 101 102 /* Make sure we don't have stray values in some fields */ 103 switch (type) { 104 case IRE_LOOPBACK: 105 case IRE_HOST: 106 case IRE_LOCAL: 107 case IRE_IF_CLONE: 108 ire->ire_mask_v6 = ipv6_all_ones; 109 ire->ire_masklen = IPV6_ABITS; 110 break; 111 case IRE_PREFIX: 112 case IRE_DEFAULT: 113 case IRE_IF_RESOLVER: 114 case IRE_IF_NORESOLVER: 115 if (v6mask != NULL) { 116 ire->ire_mask_v6 = *v6mask; 117 ire->ire_masklen = 118 ip_mask_to_plen_v6(&ire->ire_mask_v6); 119 } 120 break; 121 case IRE_MULTICAST: 122 case IRE_NOROUTE: 123 ASSERT(v6mask == NULL); 124 break; 125 default: 126 ASSERT(0); 127 return (EINVAL); 128 } 129 130 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, 131 gc, ipst); 132 if (error != NULL) 133 return (error); 134 135 /* Determine which function pointers to use */ 136 ire->ire_postfragfn = ip_xmit; /* Common case */ 137 138 switch (ire->ire_type) { 139 case IRE_LOCAL: 140 ire->ire_sendfn = ire_send_local_v6; 141 ire->ire_recvfn = ire_recv_local_v6; 142 ASSERT(ire->ire_ill != NULL); 143 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 144 ire->ire_recvfn = ire_recv_noaccept_v6; 145 break; 146 case IRE_LOOPBACK: 147 ire->ire_sendfn = ire_send_local_v6; 148 ire->ire_recvfn = ire_recv_loopback_v6; 149 break; 150 case IRE_MULTICAST: 151 ire->ire_postfragfn = ip_postfrag_loopcheck; 152 ire->ire_sendfn = ire_send_multicast_v6; 153 ire->ire_recvfn = ire_recv_multicast_v6; 154 break; 155 default: 156 /* 157 * For IRE_IF_ALL and IRE_OFFLINK we forward received 158 * packets by default. 
159 */ 160 ire->ire_sendfn = ire_send_wire_v6; 161 ire->ire_recvfn = ire_recv_forward_v6; 162 break; 163 } 164 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 165 ire->ire_sendfn = ire_send_noroute_v6; 166 ire->ire_recvfn = ire_recv_noroute_v6; 167 } else if (ire->ire_flags & RTF_MULTIRT) { 168 ire->ire_postfragfn = ip_postfrag_multirt_v6; 169 ire->ire_sendfn = ire_send_multirt_v6; 170 ire->ire_recvfn = ire_recv_multirt_v6; 171 } 172 ire->ire_nce_capable = ire_determine_nce_capable(ire); 173 return (0); 174 } 175 176 /* 177 * ire_create_v6 is called to allocate and initialize a new IRE. 178 * 179 * NOTE : This is called as writer sometimes though not required 180 * by this function. 181 */ 182 /* ARGSUSED */ 183 ire_t * 184 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 185 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, 186 uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 187 { 188 ire_t *ire; 189 int error; 190 191 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 192 193 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 194 if (ire == NULL) { 195 DTRACE_PROBE(kmem__cache__alloc); 196 return (NULL); 197 } 198 *ire = ire_null; 199 200 error = ire_init_v6(ire, v6addr, v6mask, v6gateway, 201 type, ill, zoneid, flags, gc, ipst); 202 203 if (error != 0) { 204 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); 205 kmem_cache_free(ire_cache, ire); 206 return (NULL); 207 } 208 return (ire); 209 } 210 211 /* 212 * Find the ill matching a multicast group. 213 * Allows different routes for multicast addresses 214 * in the unicast routing table (akin to FF::0/8 but could be more specific) 215 * which point at different interfaces. This is used when IPV6_MULTICAST_IF 216 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't 217 * specify the interface to join on. 218 * 219 * Supports link-local addresses by using ire_route_recursive which follows 220 * the ill when recursing. 
221 * 222 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 223 * and the MULTIRT property can be different for different groups, we 224 * extract RTF_MULTIRT from the special unicast route added for a group 225 * with CGTP and pass that back in the multirtp argument. 226 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 227 * We have a setsrcp argument for the same reason. 228 */ 229 ill_t * 230 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, 231 ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) 232 { 233 ire_t *ire; 234 ill_t *ill; 235 236 ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, 237 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 238 ASSERT(ire != NULL); 239 240 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 241 ire_refrele(ire); 242 return (NULL); 243 } 244 245 if (multirtp != NULL) 246 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 247 248 ill = ire_nexthop_ill(ire); 249 ire_refrele(ire); 250 return (ill); 251 } 252 253 /* 254 * This function takes a mask and returns number of bits set in the 255 * mask (the represented prefix length). Assumes a contiguous mask. 256 */ 257 int 258 ip_mask_to_plen_v6(const in6_addr_t *v6mask) 259 { 260 int bits; 261 int plen = IPV6_ABITS; 262 int i; 263 264 for (i = 3; i >= 0; i--) { 265 if (v6mask->s6_addr32[i] == 0) { 266 plen -= 32; 267 continue; 268 } 269 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 270 if (bits == 0) 271 break; 272 plen -= bits; 273 } 274 275 return (plen); 276 } 277 278 /* 279 * Convert a prefix length to the mask for that prefix. 280 * Returns the argument bitmask. 
281 */ 282 in6_addr_t * 283 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) 284 { 285 uint32_t *ptr; 286 287 if (plen < 0 || plen > IPV6_ABITS) 288 return (NULL); 289 *bitmask = ipv6_all_zeros; 290 if (plen == 0) 291 return (bitmask); 292 293 ptr = (uint32_t *)bitmask; 294 while (plen > 32) { 295 *ptr++ = 0xffffffffU; 296 plen -= 32; 297 } 298 *ptr = htonl(0xffffffffU << (32 - plen)); 299 return (bitmask); 300 } 301 302 /* 303 * Add a fully initialized IPv6 IRE to the forwarding table. 304 * This returns NULL on failure, or a held IRE on success. 305 * Normally the returned IRE is the same as the argument. But a different 306 * IRE will be returned if the added IRE is deemed identical to an existing 307 * one. In that case ire_identical_ref will be increased. 308 * The caller always needs to do an ire_refrele() on the returned IRE. 309 */ 310 ire_t * 311 ire_add_v6(ire_t *ire) 312 { 313 ire_t *ire1; 314 int mask_table_index; 315 irb_t *irb_ptr; 316 ire_t **irep; 317 int match_flags; 318 int error; 319 ip_stack_t *ipst = ire->ire_ipst; 320 321 ASSERT(ire->ire_ipversion == IPV6_VERSION); 322 323 /* Make sure the address is properly masked. 
*/ 324 V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); 325 326 mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); 327 if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { 328 irb_t *ptr; 329 int i; 330 331 ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * 332 sizeof (irb_t))); 333 if (ptr == NULL) { 334 ire_delete(ire); 335 return (NULL); 336 } 337 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 338 rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); 339 ptr[i].irb_ipst = ipst; 340 } 341 mutex_enter(&ipst->ips_ire_ft_init_lock); 342 if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == 343 NULL) { 344 ipst->ips_ip_forwarding_table_v6[mask_table_index] = 345 ptr; 346 mutex_exit(&ipst->ips_ire_ft_init_lock); 347 } else { 348 /* 349 * Some other thread won the race in 350 * initializing the forwarding table at the 351 * same index. 352 */ 353 mutex_exit(&ipst->ips_ire_ft_init_lock); 354 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 355 rw_destroy(&ptr[i].irb_lock); 356 } 357 mi_free(ptr); 358 } 359 } 360 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ 361 IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, 362 ipst->ips_ip6_ftable_hash_size)]); 363 364 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 365 if (ire->ire_ill != NULL) 366 match_flags |= MATCH_IRE_ILL; 367 /* 368 * Start the atomic add of the ire. Grab the bucket lock and the 369 * ill lock. Check for condemned. 370 */ 371 error = ire_atomic_start(irb_ptr, ire); 372 if (error != 0) { 373 ire_delete(ire); 374 return (NULL); 375 } 376 377 /* 378 * If we are creating a hidden IRE, make sure we search for 379 * hidden IREs when searching for duplicates below. 380 * Otherwise, we might find an IRE on some other interface 381 * that's not marked hidden. 382 */ 383 if (ire->ire_testhidden) 384 match_flags |= MATCH_IRE_TESTHIDDEN; 385 386 /* 387 * Atomically check for duplicate and insert in the table. 
388 */ 389 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 390 if (IRE_IS_CONDEMNED(ire1)) 391 continue; 392 /* 393 * Here we need an exact match on zoneid, i.e., 394 * ire_match_args doesn't fit. 395 */ 396 if (ire1->ire_zoneid != ire->ire_zoneid) 397 continue; 398 399 if (ire1->ire_type != ire->ire_type) 400 continue; 401 402 /* 403 * Note: We do not allow multiple routes that differ only 404 * in the gateway security attributes; such routes are 405 * considered duplicates. 406 * To change that we explicitly have to treat them as 407 * different here. 408 */ 409 if (ire_match_args_v6(ire1, &ire->ire_addr_v6, 410 &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, 411 ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, 412 match_flags)) { 413 /* 414 * Return the old ire after doing a REFHOLD. 415 * As most of the callers continue to use the IRE 416 * after adding, we return a held ire. This will 417 * avoid a lookup in the caller again. If the callers 418 * don't want to use it, they need to do a REFRELE. 419 */ 420 ip1dbg(("found dup ire existing %p new %p", 421 (void *)ire1, (void *)ire)); 422 ire_refhold(ire1); 423 atomic_add_32(&ire1->ire_identical_ref, 1); 424 ire_atomic_end(irb_ptr, ire); 425 ire_delete(ire); 426 return (ire1); 427 } 428 } 429 430 /* 431 * Normally we do head insertion since most things do not care about 432 * the order of the IREs in the bucket. 433 * However, due to shared-IP zones (and restrict_interzone_loopback) 434 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 435 * address. For that reason we do tail insertion for IRE_IF_CLONE. 436 */ 437 irep = (ire_t **)irb_ptr; 438 if (ire->ire_type & IRE_IF_CLONE) { 439 while ((ire1 = *irep) != NULL) 440 irep = &ire1->ire_next; 441 } 442 /* Insert at *irep */ 443 ire1 = *irep; 444 if (ire1 != NULL) 445 ire1->ire_ptpn = &ire->ire_next; 446 ire->ire_next = ire1; 447 /* Link the new one in. 
*/ 448 ire->ire_ptpn = irep; 449 /* 450 * ire_walk routines de-reference ire_next without holding 451 * a lock. Before we point to the new ire, we want to make 452 * sure the store that sets the ire_next of the new ire 453 * reaches global visibility, so that ire_walk routines 454 * don't see a truncated list of ires i.e if the ire_next 455 * of the new ire gets set after we do "*irep = ire" due 456 * to re-ordering, the ire_walk thread will see a NULL 457 * once it accesses the ire_next of the new ire. 458 * membar_producer() makes sure that the following store 459 * happens *after* all of the above stores. 460 */ 461 membar_producer(); 462 *irep = ire; 463 ire->ire_bucket = irb_ptr; 464 /* 465 * We return a bumped up IRE above. Keep it symmetrical 466 * so that the callers will always have to release. This 467 * helps the callers of this function because they continue 468 * to use the IRE after adding and hence they don't have to 469 * lookup again after we return the IRE. 470 * 471 * NOTE : We don't have to use atomics as this is appearing 472 * in the list for the first time and no one else can bump 473 * up the reference count on this yet. 474 */ 475 ire_refhold_locked(ire); 476 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); 477 irb_ptr->irb_ire_cnt++; 478 479 if (ire->ire_ill != NULL) { 480 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, 481 (char *), "ire", (void *), ire); 482 ire->ire_ill->ill_ire_cnt++; 483 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 484 } 485 ire_atomic_end(irb_ptr, ire); 486 487 /* Make any caching of the IREs be notified or updated */ 488 ire_flush_cache_v6(ire, IRE_FLUSH_ADD); 489 490 return (ire); 491 } 492 493 /* 494 * Search for all HOST REDIRECT routes that are 495 * pointing at the specified gateway and 496 * delete them. This routine is called only 497 * when a default gateway is going away. 
498 */ 499 static void 500 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) 501 { 502 irb_t *irb_ptr; 503 irb_t *irb; 504 ire_t *ire; 505 in6_addr_t gw_addr_v6; 506 int i; 507 508 /* get the hash table for HOST routes */ 509 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)]; 510 if (irb_ptr == NULL) 511 return; 512 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { 513 irb = &irb_ptr[i]; 514 irb_refhold(irb); 515 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 516 if (!(ire->ire_flags & RTF_DYNAMIC)) 517 continue; 518 mutex_enter(&ire->ire_lock); 519 gw_addr_v6 = ire->ire_gateway_addr_v6; 520 mutex_exit(&ire->ire_lock); 521 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) 522 ire_delete(ire); 523 } 524 irb_refrele(irb); 525 } 526 } 527 528 /* 529 * Delete the specified IRE. 530 * All calls should use ire_delete(). 531 * Sometimes called as writer though not required by this function. 532 * 533 * NOTE : This function is called only if the ire was added 534 * in the list. 535 */ 536 void 537 ire_delete_v6(ire_t *ire) 538 { 539 in6_addr_t gw_addr_v6; 540 ip_stack_t *ipst = ire->ire_ipst; 541 542 /* 543 * Make sure ire_generation increases from ire_flush_cache happen 544 * after any lookup/reader has read ire_generation. 545 * Since the rw_enter makes us wait until any lookup/reader has 546 * completed we can exit the lock immediately. 547 */ 548 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 549 rw_exit(&ipst->ips_ip6_ire_head_lock); 550 551 ASSERT(ire->ire_refcnt >= 1); 552 ASSERT(ire->ire_ipversion == IPV6_VERSION); 553 554 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); 555 556 if (ire->ire_type == IRE_DEFAULT) { 557 /* 558 * when a default gateway is going away 559 * delete all the host redirects pointing at that 560 * gateway. 
561 */ 562 mutex_enter(&ire->ire_lock); 563 gw_addr_v6 = ire->ire_gateway_addr_v6; 564 mutex_exit(&ire->ire_lock); 565 ire_delete_host_redirects_v6(&gw_addr_v6, ipst); 566 } 567 568 /* 569 * If we are deleting an IRE_INTERFACE then we make sure we also 570 * delete any IRE_IF_CLONE that has been created from it. 571 * Those are always in ire_dep_children. 572 */ 573 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) 574 ire_dep_delete_if_clone(ire); 575 576 /* Remove from parent dependencies and child */ 577 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 578 if (ire->ire_dep_parent != NULL) { 579 ire_dep_remove(ire); 580 } 581 while (ire->ire_dep_children != NULL) 582 ire_dep_remove(ire->ire_dep_children); 583 rw_exit(&ipst->ips_ire_dep_lock); 584 } 585 586 /* 587 * When an IRE is added or deleted this routine is called to make sure 588 * any caching of IRE information is notified or updated. 589 * 590 * The flag argument indicates if the flush request is due to addition 591 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 592 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 593 */ 594 void 595 ire_flush_cache_v6(ire_t *ire, int flag) 596 { 597 ip_stack_t *ipst = ire->ire_ipst; 598 599 /* 600 * IRE_IF_CLONE ire's don't provide any new information 601 * than the parent from which they are cloned, so don't 602 * perturb the generation numbers. 603 */ 604 if (ire->ire_type & IRE_IF_CLONE) 605 return; 606 607 /* 608 * Ensure that an ire_add during a lookup serializes the updates of 609 * the generation numbers under ire_head_lock so that the lookup gets 610 * either the old ire and old generation number, or a new ire and new 611 * generation number. 612 */ 613 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 614 615 /* 616 * If a route was just added, we need to notify everybody that 617 * has cached an IRE_NOROUTE since there might now be a better 618 * route for them. 
619 */ 620 if (flag == IRE_FLUSH_ADD) { 621 ire_increment_generation(ipst->ips_ire_reject_v6); 622 ire_increment_generation(ipst->ips_ire_blackhole_v6); 623 } 624 625 /* Adding a default can't otherwise provide a better route */ 626 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 627 rw_exit(&ipst->ips_ip6_ire_head_lock); 628 return; 629 } 630 631 switch (flag) { 632 case IRE_FLUSH_DELETE: 633 case IRE_FLUSH_GWCHANGE: 634 /* 635 * Update ire_generation for all ire_dep_children chains 636 * starting with this IRE 637 */ 638 ire_dep_incr_generation(ire); 639 break; 640 case IRE_FLUSH_ADD: { 641 in6_addr_t addr; 642 in6_addr_t mask; 643 ip_stack_t *ipst = ire->ire_ipst; 644 uint_t masklen; 645 646 /* 647 * Find an IRE which is a shorter match than the ire to be added 648 * For any such IRE (which we repeat) we update the 649 * ire_generation the same way as in the delete case. 650 */ 651 addr = ire->ire_addr_v6; 652 mask = ire->ire_mask_v6; 653 masklen = ip_mask_to_plen_v6(&mask); 654 655 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, 656 ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 657 while (ire != NULL) { 658 /* We need to handle all in the same bucket */ 659 irb_increment_generation(ire->ire_bucket); 660 661 mask = ire->ire_mask_v6; 662 ASSERT(masklen > ip_mask_to_plen_v6(&mask)); 663 masklen = ip_mask_to_plen_v6(&mask); 664 ire_refrele(ire); 665 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, 666 NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 667 } 668 } 669 break; 670 } 671 rw_exit(&ipst->ips_ip6_ire_head_lock); 672 } 673 674 /* 675 * Matches the arguments passed with the values in the ire. 676 * 677 * Note: for match types that match using "ill" passed in, ill 678 * must be checked for non-NULL before calling this routine. 
*/
boolean_t
ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
    const ts_label_t *tsl, int match_flags)
{
	in6_addr_t masked_addr;
	in6_addr_t gw_addr_v6;
	ill_t *ire_ill = NULL, *dst_ill;
	ip_stack_t *ipst = ire->ire_ipst;

	ASSERT(ire->ire_ipversion == IPV6_VERSION);
	ASSERT(addr != NULL);
	ASSERT(mask != NULL);
	ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
	ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) ||
	    (ill != NULL && ill->ill_isv6));

	/*
	 * A hidden IRE (ire_testhidden) is only returned to callers that
	 * asked for it with MATCH_IRE_TESTHIDDEN. The final comparison
	 * below additionally requires the IRE to be hidden when that flag
	 * is still set, to ensure the caller gets the right one.
	 */
	if (ire->ire_testhidden) {
		if (!(match_flags & MATCH_IRE_TESTHIDDEN))
			return (B_FALSE);
	}

	if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
	    ire->ire_zoneid != ALL_ZONES) {
		/*
		 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
		 * does not match that of ire_zoneid, a failure to
		 * match is reported at this point. Otherwise, since some IREs
		 * that are available in the global zone can be used in local
		 * zones, additional checks need to be performed:
		 *
		 * IRE_LOOPBACK
		 *	entries should never be matched in this situation.
		 *	Each zone has its own IRE_LOOPBACK.
		 *
		 * IRE_LOCAL
		 *	We allow them for any zoneid. ire_route_recursive
		 *	does additional checks when
		 *	ip_restrict_interzone_loopback is set.
		 *
		 * If ill_usesrc_ifindex is set
		 *	Then we check if the zone has a valid source address
		 *	on the usesrc ill.
		 *
		 * If ire_ill is set, then check that the zone has an ipif
		 * on that ill.
		 *
		 * Outside of this function (in ire_round_robin) we check
		 * that any IRE_OFFLINK has a gateway that is reachable from
		 * the zone when we have multiple choices (ECMP).
		 */
		if (match_flags & MATCH_IRE_ZONEONLY)
			return (B_FALSE);
		if (ire->ire_type & IRE_LOOPBACK)
			return (B_FALSE);

		if (ire->ire_type & IRE_LOCAL)
			goto matchit;

		/*
		 * The normal case of IRE_ONLINK has a matching zoneid.
		 * Here we handle the case when shared-IP zones have been
		 * configured with IP addresses on vniN. In that case it
		 * is ok for traffic from a zone to use IRE_ONLINK routes
		 * if the ill has a usesrc pointing at vniN
		 * Applies to IRE_INTERFACE.
		 */
		dst_ill = ire->ire_ill;
		if (ire->ire_type & IRE_ONLINK) {
			uint_t	ifindex;

			/*
			 * Note there is no IRE_INTERFACE on vniN thus
			 * can't do an IRE lookup for a matching route.
			 *
			 * NOTE(review): dst_ill is dereferenced here without
			 * a NULL check; presumably every IRE_ONLINK that
			 * reaches this point has ire_ill set - confirm.
			 */
			ifindex = dst_ill->ill_usesrc_ifindex;
			if (ifindex == 0)
				return (B_FALSE);

			/*
			 * If there is a usable source address in the
			 * zone, then it's ok to return this IRE_INTERFACE
			 */
			if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
			    zoneid, ipst)) {
				ip3dbg(("ire_match_args: no usrsrc for zone"
				    " dst_ill %p\n", (void *)dst_ill));
				return (B_FALSE);
			}
		}
		/*
		 * For example, with
		 * route add 11.0.0.0 gw1 -ifp bge0
		 * route add 11.0.0.0 gw2 -ifp bge1
		 * this code would differentiate based on
		 * where the sending zone has addresses.
		 * Only if the zone has an address on bge0 can it use the first
		 * route. It isn't clear if this behavior is documented
		 * anywhere.
		 */
		if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
			ipif_t	*tipif;

			/*
			 * Look for an ipif on dst_ill that is up, not
			 * condemned, and belongs to the zone or ALL_ZONES.
			 */
			mutex_enter(&dst_ill->ill_lock);
			for (tipif = dst_ill->ill_ipif;
			    tipif != NULL; tipif = tipif->ipif_next) {
				if (!IPIF_IS_CONDEMNED(tipif) &&
				    (tipif->ipif_flags & IPIF_UP) &&
				    (tipif->ipif_zoneid == zoneid ||
				    tipif->ipif_zoneid == ALL_ZONES))
					break;
			}
			mutex_exit(&dst_ill->ill_lock);
			if (tipif == NULL)
				return (B_FALSE);
		}
	}

matchit:
	ire_ill = ire->ire_ill;
	if (match_flags & MATCH_IRE_GW) {
		/* Snapshot the gateway under ire_lock for the compare below */
		mutex_enter(&ire->ire_lock);
		gw_addr_v6 = ire->ire_gateway_addr_v6;
		mutex_exit(&ire->ire_lock);
	}
	if (match_flags & MATCH_IRE_ILL) {

		/*
		 * If asked to match an ill, we *must* match
		 * on the ire_ill for ipmp test addresses, or
		 * any of the ill in the group for data addresses.
		 * If we don't, we may as well fail.
		 * However, we need an exception for IRE_LOCALs to ensure
		 * we loopback packets even sent to test addresses on different
		 * interfaces in the group.
		 */
		if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
		    !(ire->ire_type & IRE_LOCAL)) {
			if (ire->ire_ill != ill)
				return (B_FALSE);
		} else {
			/*
			 * Clearing the flag here also disables the
			 * ire_testhidden requirement in the final compare.
			 */
			match_flags &= ~MATCH_IRE_TESTHIDDEN;
			/*
			 * We know that ill is not NULL, but ire_ill could be
			 * NULL
			 */
			if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
				return (B_FALSE);
		}
	}
	if (match_flags & MATCH_IRE_SRC_ILL) {
		if (ire_ill == NULL)
			return (B_FALSE);
		/*
		 * Not on the same LAN: still acceptable if ire_ill's usesrc
		 * ifindex is the ifindex of ill.
		 */
		if (!IS_ON_SAME_LAN(ill, ire_ill)) {
			if (ire_ill->ill_usesrc_ifindex == 0 ||
			    (ire_ill->ill_usesrc_ifindex !=
			    ill->ill_phyint->phyint_ifindex))
				return (B_FALSE);
		}
	}

	/* No ire_addr_v6 bits set past the mask */
	ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
	    ire->ire_addr_v6));
	/*
	 * NOTE(review): masked_addr is filled in here but is not read
	 * anywhere else in this function; the compare below uses
	 * V6_MASK_EQ on *addr and *mask directly.
	 */
	V6_MASK_COPY(*addr, *mask, masked_addr);
	if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
	    ((!(match_flags & MATCH_IRE_GW)) ||
	    IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
	    ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
	    ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
	    ((!(match_flags & MATCH_IRE_MASK)) ||
	    (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
	    ((!(match_flags & MATCH_IRE_SECATTR)) ||
	    (!is_system_labeled()) ||
	    (tsol_ire_match_gwattr(ire, tsl) == 0))) {
		/* We found the matched IRE */
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
 * gateway address. If ill is non-NULL we also match on it.
 * The caller must hold ips_ip6_ire_head_lock as reader if lock_held is set.
*/
boolean_t
ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
    const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
{
	ire_t	*ire;
	uint_t	match_flags;

	/* Take the head lock as reader unless the caller already holds it */
	if (lock_held)
		ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
	else
		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);

	match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
	if (ill != NULL)
		match_flags |= MATCH_IRE_ILL;

	ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
	    &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
	    ipst);

	if (!lock_held)
		rw_exit(&ipst->ips_ip6_ire_head_lock);
	/* Only existence matters; drop the hold taken by the lookup */
	if (ire != NULL) {
		ire_refrele(ire);
		return (B_TRUE);
	} else {
		return (B_FALSE);
	}
}

/*
 * Lookup a route in forwarding table.
 * specific lookup is indicated by passing the
 * required parameters and indicating the
 * match required in flag field.
 *
 * Supports link-local addresses by following the ipif/ill when recursing.
 *
 * Returns a held IRE, or NULL if nothing matched. When generationp is
 * non-NULL it receives the ire_generation of the selected IRE, read
 * before ips_ip6_ire_head_lock is dropped.
 */
ire_t *
ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, int flags,
    uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
	ire_t *ire = NULL;

	ASSERT(addr != NULL);
	ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
	ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
	ASSERT(ill == NULL || ill->ill_isv6);

	ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));

	/*
	 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
	 * or MATCH_IRE_SRC_ILL is set.
	 */
	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
		return (NULL);

	rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
	ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
	    tsl, flags, ipst);
	if (ire == NULL) {
		rw_exit(&ipst->ips_ip6_ire_head_lock);
		return (NULL);
	}

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 *
	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
	 * and the IRE_INTERFACEs are likely to be shorter matches.
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE_V6(ire))) {
			ire_t	*next_ire;
			ire_ftable_args_t margs;

			/* Package the lookup arguments for ire_round_robin */
			bzero(&margs, sizeof (margs));
			margs.ift_addr_v6 = *addr;
			if (mask != NULL)
				margs.ift_mask_v6 = *mask;
			if (gateway != NULL)
				margs.ift_gateway_v6 = *gateway;
			margs.ift_type = type;
			margs.ift_ill = ill;
			margs.ift_zoneid = zoneid;
			margs.ift_tsl = tsl;
			margs.ift_flags = flags;

			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			if (next_ire == NULL) {
				/* keep ire if next_ire is null */
				goto done;
			}
			ire_refrele(ire);
			ire = next_ire;
		}
	}

done:
	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	rw_exit(&ipst->ips_ip6_ire_head_lock);

	/*
	 * For shared-IP zones we need additional checks to what was
	 * done in ire_match_args to make sure IRE_LOCALs are handled.
	 *
	 * When ip_restrict_interzone_loopback is set, then
	 * we ensure that IRE_LOCAL are only used for loopback
	 * between zones when the logical "Ethernet" would
	 * have looped them back. That is, if in the absence of
	 * the IRE_LOCAL we would have sent the packet out the
	 * same ill.
	 */
	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
	    ipst->ips_ip_restrict_interzone_loopback) {
		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
		ASSERT(ire != NULL);
	}

	return (ire);
}

/*
 * Look up a single ire. The caller holds either the read or write lock.
 *
 * With MATCH_IRE_MASK only the bucket for that exact mask is searched.
 * Otherwise the table is searched from the longest candidate prefix length
 * down to zero; with MATCH_IRE_SHORTERMASK the search starts one bit
 * shorter than the supplied mask. Returns a held IRE or NULL.
 */
ire_t *
ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
    const in6_addr_t *gateway, int type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, int flags,
    ip_stack_t *ipst)
{
	irb_t *irb_ptr;
	ire_t	*ire = NULL;
	int i;

	ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));

	/*
	 * If the mask is known, the lookup
	 * is simple, if the mask is not known
	 * we need to search.
	 */
	if (flags & MATCH_IRE_MASK) {
		uint_t masklen;

		masklen = ip_mask_to_plen_v6(mask);
		if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
			return (NULL);
		}
		irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
		    IRE_ADDR_MASK_HASH_V6(*addr, *mask,
		    ipst->ips_ip6_ftable_hash_size)]);
		rw_enter(&irb_ptr->irb_lock, RW_READER);
		for (ire = irb_ptr->irb_ire; ire != NULL;
		    ire = ire->ire_next) {
			if (IRE_IS_CONDEMNED(ire))
				continue;
			if (ire_match_args_v6(ire, addr, mask, gateway, type,
			    ill, zoneid, tsl, flags))
				goto found_ire;
		}
		rw_exit(&irb_ptr->irb_lock);
	} else {
		uint_t masklen;

		/*
		 * In this case we don't know the mask, we need to
		 * search the table assuming different mask sizes.
		 */
		if (flags & MATCH_IRE_SHORTERMASK) {
			masklen = ip_mask_to_plen_v6(mask);
			if (masklen == 0) {
				/* Nothing shorter than zero */
				return (NULL);
			}
			masklen--;
		} else {
			masklen = IP6_MASK_TABLE_SIZE - 1;
		}

		for (i = masklen; i >= 0; i--) {
			in6_addr_t tmpmask;

			if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
				continue;
			/* Hash with the mask for this prefix length */
			(void) ip_plen_to_mask_v6(i, &tmpmask);
			irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][
			    IRE_ADDR_MASK_HASH_V6(*addr, tmpmask,
			    ipst->ips_ip6_ftable_hash_size)];
			rw_enter(&irb_ptr->irb_lock, RW_READER);
			for (ire = irb_ptr->irb_ire; ire != NULL;
			    ire = ire->ire_next) {
				if (IRE_IS_CONDEMNED(ire))
					continue;
				/* Each candidate is tested with its own mask */
				if (ire_match_args_v6(ire, addr,
				    &ire->ire_mask_v6, gateway, type, ill,
				    zoneid, tsl, flags))
					goto found_ire;
			}
			rw_exit(&irb_ptr->irb_lock);
		}
	}
	ASSERT(ire == NULL);
	ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
	return (NULL);

found_ire:
	/* Take the hold before the bucket lock is released */
	ire_refhold(ire);
	rw_exit(&irb_ptr->irb_lock);
	return (ire);
}


/*
 * This function
is called by 1100 * ip_input/ire_route_recursive when doing a route lookup on only the 1101 * destination address. 1102 * 1103 * The optimizations of this function over ire_ftable_lookup are: 1104 * o removing unnecessary flag matching 1105 * o doing longest prefix match instead of overloading it further 1106 * with the unnecessary "best_prefix_match" 1107 * 1108 * If no route is found we return IRE_NOROUTE. 1109 */ 1110 ire_t * 1111 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, 1112 ip_stack_t *ipst, uint_t *generationp) 1113 { 1114 ire_t *ire; 1115 1116 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, 1117 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); 1118 if (ire == NULL) { 1119 ire = ire_reject(ipst, B_TRUE); 1120 if (generationp != NULL) 1121 *generationp = IRE_GENERATION_VERIFY; 1122 } 1123 /* ftable_lookup did round robin */ 1124 return (ire); 1125 } 1126 1127 ire_t * 1128 ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src, 1129 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 1130 int *errorp, boolean_t *multirtp) 1131 { 1132 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); 1133 1134 return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp, 1135 multirtp)); 1136 } 1137 1138 /* 1139 * Recursively look for a route to the destination. Can also match on 1140 * the zoneid, ill, and label. Used for the data paths. See also 1141 * ire_route_recursive_dstonly. 1142 * 1143 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1144 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1145 * forwarding. 1146 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1147 * resolve the gateway. 1148 * 1149 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1150 * instead. 1151 * 1152 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1153 * is an error. 
1154 * Allow at most one RTF_INDIRECT. 1155 */ 1156 ire_t * 1157 ire_route_recursive_impl_v6(ire_t *ire, 1158 const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, 1159 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1160 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1161 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1162 { 1163 int i, j; 1164 in6_addr_t v6nexthop = *nexthop; 1165 ire_t *ires[MAX_IRE_RECURSION]; 1166 uint_t generation; 1167 uint_t generations[MAX_IRE_RECURSION]; 1168 boolean_t need_refrele = B_FALSE; 1169 boolean_t invalidate = B_FALSE; 1170 int prefs[MAX_IRE_RECURSION]; 1171 ill_t *ill = NULL; 1172 1173 if (setsrcp != NULL) 1174 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1175 if (gwattrp != NULL) 1176 ASSERT(*gwattrp == NULL); 1177 1178 /* 1179 * We iterate up to three times to resolve a route, even though 1180 * we have four slots in the array. The extra slot is for an 1181 * IRE_IF_CLONE we might need to create. 1182 */ 1183 i = 0; 1184 while (i < MAX_IRE_RECURSION - 1) { 1185 /* ire_ftable_lookup handles round-robin/ECMP */ 1186 if (ire == NULL) { 1187 ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, 1188 (ill != NULL ? ill : ill_arg), zoneid, tsl, 1189 match_args, xmit_hint, ipst, &generation); 1190 } else { 1191 /* Caller passed it; extra hold since we will rele */ 1192 ire_refhold(ire); 1193 if (generationp != NULL) 1194 generation = *generationp; 1195 else 1196 generation = IRE_GENERATION_VERIFY; 1197 } 1198 1199 if (ire == NULL) 1200 ire = ire_reject(ipst, B_TRUE); 1201 1202 /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1203 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1204 goto error; 1205 1206 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1207 1208 if (i != 0) { 1209 prefs[i] = ire_pref(ire); 1210 /* 1211 * Don't allow anything unusual past the first 1212 * iteration. 
1213 */ 1214 if ((ire->ire_type & 1215 (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 1216 prefs[i] <= prefs[i-1]) { 1217 ire_refrele(ire); 1218 if (irr_flags & IRR_INCOMPLETE) { 1219 ire = ires[0]; 1220 ire_refhold(ire); 1221 } else { 1222 ire = ire_reject(ipst, B_TRUE); 1223 } 1224 goto error; 1225 } 1226 } 1227 /* We have a usable IRE */ 1228 ires[i] = ire; 1229 generations[i] = generation; 1230 i++; 1231 1232 /* The first RTF_SETSRC address is passed back if setsrcp */ 1233 if ((ire->ire_flags & RTF_SETSRC) && 1234 setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { 1235 ASSERT(!IN6_IS_ADDR_UNSPECIFIED( 1236 &ire->ire_setsrc_addr_v6)); 1237 *setsrcp = ire->ire_setsrc_addr_v6; 1238 } 1239 1240 /* The first ire_gw_secattr is passed back if gwattrp */ 1241 if (ire->ire_gw_secattr != NULL && 1242 gwattrp != NULL && *gwattrp == NULL) 1243 *gwattrp = ire->ire_gw_secattr; 1244 1245 /* 1246 * Check if we have a short-cut pointer to an IRE for this 1247 * destination, and that the cached dependency isn't stale. 1248 * In that case we've rejoined an existing tree towards a 1249 * parent, thus we don't need to continue the loop to 1250 * discover the rest of the tree. 1251 */ 1252 mutex_enter(&ire->ire_lock); 1253 if (ire->ire_dep_parent != NULL && 1254 ire->ire_dep_parent->ire_generation == 1255 ire->ire_dep_parent_generation) { 1256 mutex_exit(&ire->ire_lock); 1257 ire = NULL; 1258 goto done; 1259 } 1260 mutex_exit(&ire->ire_lock); 1261 1262 /* 1263 * If this type should have an ire_nce_cache (even if it 1264 * doesn't yet have one) then we are done. Includes 1265 * IRE_INTERFACE with a full 128 bit mask. 
1266 */ 1267 if (ire->ire_nce_capable) { 1268 ire = NULL; 1269 goto done; 1270 } 1271 ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1272 /* 1273 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1274 * particular destination 1275 */ 1276 if (ire->ire_type & IRE_INTERFACE) { 1277 ire_t *clone; 1278 1279 ASSERT(ire->ire_masklen != IPV6_ABITS); 1280 1281 /* 1282 * In the case of ip_input and ILLF_FORWARDING not 1283 * being set, and in the case of RTM_GET, there is 1284 * no point in allocating an IRE_IF_CLONE. We return 1285 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can 1286 * result in a ire_dep_parent which is IRE_IF_* 1287 * without an IRE_IF_CLONE. 1288 * We recover from that when we need to send packets 1289 * by ensuring that the generations become 1290 * IRE_GENERATION_VERIFY in this case. 1291 */ 1292 if (!(irr_flags & IRR_ALLOCATE)) { 1293 invalidate = B_TRUE; 1294 ire = NULL; 1295 goto done; 1296 } 1297 1298 clone = ire_create_if_clone(ire, &v6nexthop, 1299 &generation); 1300 if (clone == NULL) { 1301 /* 1302 * Temporary failure - no memory. 1303 * Don't want caller to cache IRE_NOROUTE. 1304 */ 1305 invalidate = B_TRUE; 1306 ire = ire_blackhole(ipst, B_TRUE); 1307 goto error; 1308 } 1309 /* 1310 * Make clone next to last entry and the 1311 * IRE_INTERFACE the last in the dependency 1312 * chain since the clone depends on the 1313 * IRE_INTERFACE. 1314 */ 1315 ASSERT(i >= 1); 1316 ASSERT(i < MAX_IRE_RECURSION); 1317 1318 ires[i] = ires[i-1]; 1319 generations[i] = generations[i-1]; 1320 ires[i-1] = clone; 1321 generations[i-1] = generation; 1322 i++; 1323 1324 ire = NULL; 1325 goto done; 1326 } 1327 1328 /* 1329 * We only match on the type and optionally ILL when 1330 * recursing. The type match is used by some callers 1331 * to exclude certain types (such as IRE_IF_CLONE or 1332 * IRE_LOCAL|IRE_LOOPBACK). 
1333 * 1334 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof' 1335 * ire->ire_ill, and we want to find the IRE_INTERFACE for 1336 * ire_ill, so we set ill to the ire_ill 1337 */ 1338 match_args &= MATCH_IRE_TYPE; 1339 v6nexthop = ire->ire_gateway_addr_v6; 1340 if (ill == NULL && ire->ire_ill != NULL) { 1341 ill = ire->ire_ill; 1342 need_refrele = B_TRUE; 1343 ill_refhold(ill); 1344 match_args |= MATCH_IRE_ILL; 1345 } 1346 /* 1347 * We set the prefs[i] value above if i > 0. We've already 1348 * done i++ so i is one in the case of the first time around. 1349 */ 1350 if (i == 1) 1351 prefs[0] = ire_pref(ire); 1352 ire = NULL; 1353 } 1354 ASSERT(ire == NULL); 1355 ire = ire_reject(ipst, B_TRUE); 1356 1357 error: 1358 ASSERT(ire != NULL); 1359 if (need_refrele) 1360 ill_refrele(ill); 1361 1362 /* 1363 * In the case of MULTIRT we want to try a different IRE the next 1364 * time. We let the next packet retry in that case. 1365 */ 1366 if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1367 (void) ire_no_good(ires[0]); 1368 1369 cleanup: 1370 /* cleanup ires[i] */ 1371 ire_dep_unbuild(ires, i); 1372 for (j = 0; j < i; j++) 1373 ire_refrele(ires[j]); 1374 1375 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1376 (irr_flags & IRR_INCOMPLETE)); 1377 /* 1378 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1379 * ip_select_route since the reject or lack of memory might be gone. 1380 */ 1381 if (generationp != NULL) 1382 *generationp = IRE_GENERATION_VERIFY; 1383 return (ire); 1384 1385 done: 1386 ASSERT(ire == NULL); 1387 if (need_refrele) 1388 ill_refrele(ill); 1389 1390 /* Build dependencies */ 1391 if (i > 1 && !ire_dep_build(ires, generations, i)) { 1392 /* Something in chain was condemned; tear it apart */ 1393 ire = ire_blackhole(ipst, B_TRUE); 1394 goto cleanup; 1395 } 1396 1397 /* 1398 * Release all refholds except the one for ires[0] that we 1399 * will return to the caller. 
1400 */ 1401 for (j = 1; j < i; j++) 1402 ire_refrele(ires[j]); 1403 1404 if (invalidate) { 1405 /* 1406 * Since we needed to allocate but couldn't we need to make 1407 * sure that the dependency chain is rebuilt the next time. 1408 */ 1409 ire_dep_invalidate_generations(ires[0]); 1410 generation = IRE_GENERATION_VERIFY; 1411 } else { 1412 /* 1413 * IREs can have been added or deleted while we did the 1414 * recursive lookup and we can't catch those until we've built 1415 * the dependencies. We verify the stored 1416 * ire_dep_parent_generation to catch any such changes and 1417 * return IRE_GENERATION_VERIFY (which will cause 1418 * ip_select_route to be called again so we can redo the 1419 * recursive lookup next time we send a packet. 1420 */ 1421 if (ires[0]->ire_dep_parent == NULL) 1422 generation = ires[0]->ire_generation; 1423 else 1424 generation = ire_dep_validate_generations(ires[0]); 1425 if (generations[0] != ires[0]->ire_generation) { 1426 /* Something changed at the top */ 1427 generation = IRE_GENERATION_VERIFY; 1428 } 1429 } 1430 if (generationp != NULL) 1431 *generationp = generation; 1432 1433 return (ires[0]); 1434 } 1435 1436 ire_t * 1437 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, 1438 const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1439 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1440 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1441 { 1442 return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, 1443 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, 1444 gwattrp, generationp)); 1445 } 1446 1447 /* 1448 * Recursively look for a route to the destination. 1449 * We only handle a destination match here, yet we have the same arguments 1450 * as the full match to allow function pointers to select between the two. 1451 * 1452 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1453 * instead. 
1454 * 1455 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1456 * is an error. 1457 * Allow at most one RTF_INDIRECT. 1458 */ 1459 ire_t * 1460 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags, 1461 uint32_t xmit_hint, ip_stack_t *ipst) 1462 { 1463 ire_t *ire; 1464 ire_t *ire1; 1465 uint_t generation; 1466 1467 /* ire_ftable_lookup handles round-robin/ECMP */ 1468 ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, 1469 &generation); 1470 ASSERT(ire != NULL); 1471 1472 /* 1473 * If this type should have an ire_nce_cache (even if it 1474 * doesn't yet have one) then we are done. Includes 1475 * IRE_INTERFACE with a full 128 bit mask. 1476 */ 1477 if (ire->ire_nce_capable) 1478 return (ire); 1479 1480 /* 1481 * If the IRE has a current cached parent we know that the whole 1482 * parent chain is current, hence we don't need to discover and 1483 * build any dependencies by doing a recursive lookup. 1484 */ 1485 mutex_enter(&ire->ire_lock); 1486 if (ire->ire_dep_parent != NULL && 1487 ire->ire_dep_parent->ire_generation == 1488 ire->ire_dep_parent_generation) { 1489 mutex_exit(&ire->ire_lock); 1490 return (ire); 1491 } 1492 mutex_exit(&ire->ire_lock); 1493 1494 /* 1495 * Fallback to loop in the normal code starting with the ire 1496 * we found. Normally this would return the same ire. 1497 */ 1498 ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, 1499 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1500 &generation); 1501 ire_refrele(ire); 1502 return (ire1); 1503 } 1504