1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 1990 Mentat Inc. 27 */ 28 29 /* 30 * This file contains routines that manipulate Internet Routing Entries (IREs). 31 */ 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/stropts.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 38 #include <sys/systm.h> 39 #include <sys/param.h> 40 #include <sys/socket.h> 41 #include <net/if.h> 42 #include <net/route.h> 43 #include <netinet/in.h> 44 #include <net/if_dl.h> 45 #include <netinet/ip6.h> 46 #include <netinet/icmp6.h> 47 48 #include <inet/common.h> 49 #include <inet/mi.h> 50 #include <inet/ip.h> 51 #include <inet/ip6.h> 52 #include <inet/ip_ndp.h> 53 #include <inet/ip_if.h> 54 #include <inet/ip_ire.h> 55 #include <inet/ipclassifier.h> 56 #include <inet/nd.h> 57 #include <inet/tunables.h> 58 #include <sys/kmem.h> 59 #include <sys/zone.h> 60 61 #include <sys/tsol/label.h> 62 #include <sys/tsol/tnet.h> 63 64 #define IS_DEFAULT_ROUTE_V6(ire) \ 65 (((ire)->ire_type & IRE_DEFAULT) || \ 66 (((ire)->ire_type & IRE_INTERFACE) && \ 67 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) 68 69 static ire_t ire_null; 70 71 static ire_t * 72 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 73 const in6_addr_t *gateway, int type, const ill_t *ill, 74 zoneid_t zoneid, const ts_label_t *tsl, int flags, 75 ip_stack_t *ipst); 76 77 /* 78 * Initialize the ire that is specific to IPv6 part and call 79 * ire_init_common to finish it. 80 * Returns zero or errno. 81 */ 82 int 83 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, 84 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, 85 zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 86 { 87 int error; 88 89 /* 90 * Reject IRE security attmakeribute creation/initialization 91 * if system is not running in Trusted mode. 92 */ 93 if (gc != NULL && !is_system_labeled()) 94 return (EINVAL); 95 96 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); 97 if (v6addr != NULL) 98 ire->ire_addr_v6 = *v6addr; 99 if (v6gateway != NULL) 100 ire->ire_gateway_addr_v6 = *v6gateway; 101 102 /* Make sure we don't have stray values in some fields */ 103 switch (type) { 104 case IRE_LOOPBACK: 105 case IRE_HOST: 106 case IRE_LOCAL: 107 case IRE_IF_CLONE: 108 ire->ire_mask_v6 = ipv6_all_ones; 109 ire->ire_masklen = IPV6_ABITS; 110 break; 111 case IRE_PREFIX: 112 case IRE_DEFAULT: 113 case IRE_IF_RESOLVER: 114 case IRE_IF_NORESOLVER: 115 if (v6mask != NULL) { 116 ire->ire_mask_v6 = *v6mask; 117 ire->ire_masklen = 118 ip_mask_to_plen_v6(&ire->ire_mask_v6); 119 } 120 break; 121 case IRE_MULTICAST: 122 case IRE_NOROUTE: 123 ASSERT(v6mask == NULL); 124 break; 125 default: 126 ASSERT(0); 127 return (EINVAL); 128 } 129 130 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, 131 gc, ipst); 132 if (error != NULL) 133 return (error); 134 135 /* Determine which function pointers to use */ 136 ire->ire_postfragfn = ip_xmit; /* Common case */ 137 138 switch (ire->ire_type) { 139 case IRE_LOCAL: 140 ire->ire_sendfn = ire_send_local_v6; 141 ire->ire_recvfn = ire_recv_local_v6; 142 ASSERT(ire->ire_ill != NULL); 143 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 144 ire->ire_recvfn = ire_recv_noaccept_v6; 145 break; 146 case IRE_LOOPBACK: 147 ire->ire_sendfn = ire_send_local_v6; 148 ire->ire_recvfn = ire_recv_loopback_v6; 149 break; 150 case IRE_MULTICAST: 151 ire->ire_postfragfn = ip_postfrag_loopcheck; 152 ire->ire_sendfn = ire_send_multicast_v6; 153 ire->ire_recvfn = ire_recv_multicast_v6; 154 break; 155 default: 156 /* 157 * For IRE_IF_ALL and IRE_OFFLINK we forward received 158 * packets by default. 159 */ 160 ire->ire_sendfn = ire_send_wire_v6; 161 ire->ire_recvfn = ire_recv_forward_v6; 162 break; 163 } 164 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 165 ire->ire_sendfn = ire_send_noroute_v6; 166 ire->ire_recvfn = ire_recv_noroute_v6; 167 } else if (ire->ire_flags & RTF_MULTIRT) { 168 ire->ire_postfragfn = ip_postfrag_multirt_v6; 169 ire->ire_sendfn = ire_send_multirt_v6; 170 ire->ire_recvfn = ire_recv_multirt_v6; 171 } 172 ire->ire_nce_capable = ire_determine_nce_capable(ire); 173 return (0); 174 } 175 176 /* 177 * ire_create_v6 is called to allocate and initialize a new IRE. 178 * 179 * NOTE : This is called as writer sometimes though not required 180 * by this function. 181 */ 182 /* ARGSUSED */ 183 ire_t * 184 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 185 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, 186 uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 187 { 188 ire_t *ire; 189 int error; 190 191 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 192 193 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 194 if (ire == NULL) { 195 DTRACE_PROBE(kmem__cache__alloc); 196 return (NULL); 197 } 198 *ire = ire_null; 199 200 error = ire_init_v6(ire, v6addr, v6mask, v6gateway, 201 type, ill, zoneid, flags, gc, ipst); 202 203 if (error != 0) { 204 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); 205 kmem_cache_free(ire_cache, ire); 206 return (NULL); 207 } 208 return (ire); 209 } 210 211 /* 212 * Find the ill matching a multicast group. 213 * Allows different routes for multicast addresses 214 * in the unicast routing table (akin to FF::0/8 but could be more specific) 215 * which point at different interfaces. This is used when IPV6_MULTICAST_IF 216 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't 217 * specify the interface to join on. 218 * 219 * Supports link-local addresses by using ire_route_recursive which follows 220 * the ill when recursing. 221 * 222 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 223 * and the MULTIRT property can be different for different groups, we 224 * extract RTF_MULTIRT from the special unicast route added for a group 225 * with CGTP and pass that back in the multirtp argument. 226 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 227 * We have a setsrcp argument for the same reason. 228 */ 229 ill_t * 230 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, 231 ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) 232 { 233 ire_t *ire; 234 ill_t *ill; 235 236 ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, 237 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 238 ASSERT(ire != NULL); 239 240 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 241 ire_refrele(ire); 242 return (NULL); 243 } 244 245 if (multirtp != NULL) 246 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 247 248 ill = ire_nexthop_ill(ire); 249 ire_refrele(ire); 250 return (ill); 251 } 252 253 /* 254 * This function takes a mask and returns number of bits set in the 255 * mask (the represented prefix length). Assumes a contiguous mask. 256 */ 257 int 258 ip_mask_to_plen_v6(const in6_addr_t *v6mask) 259 { 260 int bits; 261 int plen = IPV6_ABITS; 262 int i; 263 264 for (i = 3; i >= 0; i--) { 265 if (v6mask->s6_addr32[i] == 0) { 266 plen -= 32; 267 continue; 268 } 269 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 270 if (bits == 0) 271 break; 272 plen -= bits; 273 } 274 275 return (plen); 276 } 277 278 /* 279 * Convert a prefix length to the mask for that prefix. 280 * Returns the argument bitmask. 281 */ 282 in6_addr_t * 283 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) 284 { 285 uint32_t *ptr; 286 287 if (plen < 0 || plen > IPV6_ABITS) 288 return (NULL); 289 *bitmask = ipv6_all_zeros; 290 if (plen == 0) 291 return (bitmask); 292 293 ptr = (uint32_t *)bitmask; 294 while (plen > 32) { 295 *ptr++ = 0xffffffffU; 296 plen -= 32; 297 } 298 *ptr = htonl(0xffffffffU << (32 - plen)); 299 return (bitmask); 300 } 301 302 /* 303 * Add a fully initialized IPv6 IRE to the forwarding table. 304 * This returns NULL on failure, or a held IRE on success. 305 * Normally the returned IRE is the same as the argument. But a different 306 * IRE will be returned if the added IRE is deemed identical to an existing 307 * one. In that case ire_identical_ref will be increased. 308 * The caller always needs to do an ire_refrele() on the returned IRE. 309 */ 310 ire_t * 311 ire_add_v6(ire_t *ire) 312 { 313 ire_t *ire1; 314 int mask_table_index; 315 irb_t *irb_ptr; 316 ire_t **irep; 317 int match_flags; 318 int error; 319 ip_stack_t *ipst = ire->ire_ipst; 320 321 ASSERT(ire->ire_ipversion == IPV6_VERSION); 322 323 /* Make sure the address is properly masked. */ 324 V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); 325 326 mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); 327 if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { 328 irb_t *ptr; 329 int i; 330 331 ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * 332 sizeof (irb_t))); 333 if (ptr == NULL) { 334 ire_delete(ire); 335 return (NULL); 336 } 337 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 338 rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); 339 ptr[i].irb_ipst = ipst; 340 } 341 mutex_enter(&ipst->ips_ire_ft_init_lock); 342 if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == 343 NULL) { 344 ipst->ips_ip_forwarding_table_v6[mask_table_index] = 345 ptr; 346 mutex_exit(&ipst->ips_ire_ft_init_lock); 347 } else { 348 /* 349 * Some other thread won the race in 350 * initializing the forwarding table at the 351 * same index. 352 */ 353 mutex_exit(&ipst->ips_ire_ft_init_lock); 354 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 355 rw_destroy(&ptr[i].irb_lock); 356 } 357 mi_free(ptr); 358 } 359 } 360 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ 361 IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, 362 ipst->ips_ip6_ftable_hash_size)]); 363 364 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 365 if (ire->ire_ill != NULL) 366 match_flags |= MATCH_IRE_ILL; 367 /* 368 * Start the atomic add of the ire. Grab the bucket lock and the 369 * ill lock. Check for condemned. 370 */ 371 error = ire_atomic_start(irb_ptr, ire); 372 if (error != 0) { 373 ire_delete(ire); 374 return (NULL); 375 } 376 377 /* 378 * If we are creating a hidden IRE, make sure we search for 379 * hidden IREs when searching for duplicates below. 380 * Otherwise, we might find an IRE on some other interface 381 * that's not marked hidden. 382 */ 383 if (ire->ire_testhidden) 384 match_flags |= MATCH_IRE_TESTHIDDEN; 385 386 /* 387 * Atomically check for duplicate and insert in the table. 388 */ 389 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 390 if (IRE_IS_CONDEMNED(ire1)) 391 continue; 392 /* 393 * Here we need an exact match on zoneid, i.e., 394 * ire_match_args doesn't fit. 395 */ 396 if (ire1->ire_zoneid != ire->ire_zoneid) 397 continue; 398 399 if (ire1->ire_type != ire->ire_type) 400 continue; 401 402 /* 403 * Note: We do not allow multiple routes that differ only 404 * in the gateway security attributes; such routes are 405 * considered duplicates. 406 * To change that we explicitly have to treat them as 407 * different here. 408 */ 409 if (ire_match_args_v6(ire1, &ire->ire_addr_v6, 410 &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, 411 ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, 412 match_flags)) { 413 /* 414 * Return the old ire after doing a REFHOLD. 415 * As most of the callers continue to use the IRE 416 * after adding, we return a held ire. This will 417 * avoid a lookup in the caller again. If the callers 418 * don't want to use it, they need to do a REFRELE. 419 */ 420 ip1dbg(("found dup ire existing %p new %p", 421 (void *)ire1, (void *)ire)); 422 ire_refhold(ire1); 423 atomic_add_32(&ire1->ire_identical_ref, 1); 424 ire_atomic_end(irb_ptr, ire); 425 ire_delete(ire); 426 return (ire1); 427 } 428 } 429 430 /* 431 * Normally we do head insertion since most things do not care about 432 * the order of the IREs in the bucket. 433 * However, due to shared-IP zones (and restrict_interzone_loopback) 434 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 435 * address. For that reason we do tail insertion for IRE_IF_CLONE. 436 */ 437 irep = (ire_t **)irb_ptr; 438 if (ire->ire_type & IRE_IF_CLONE) { 439 while ((ire1 = *irep) != NULL) 440 irep = &ire1->ire_next; 441 } 442 /* Insert at *irep */ 443 ire1 = *irep; 444 if (ire1 != NULL) 445 ire1->ire_ptpn = &ire->ire_next; 446 ire->ire_next = ire1; 447 /* Link the new one in. */ 448 ire->ire_ptpn = irep; 449 /* 450 * ire_walk routines de-reference ire_next without holding 451 * a lock. Before we point to the new ire, we want to make 452 * sure the store that sets the ire_next of the new ire 453 * reaches global visibility, so that ire_walk routines 454 * don't see a truncated list of ires i.e if the ire_next 455 * of the new ire gets set after we do "*irep = ire" due 456 * to re-ordering, the ire_walk thread will see a NULL 457 * once it accesses the ire_next of the new ire. 458 * membar_producer() makes sure that the following store 459 * happens *after* all of the above stores. 460 */ 461 membar_producer(); 462 *irep = ire; 463 ire->ire_bucket = irb_ptr; 464 /* 465 * We return a bumped up IRE above. Keep it symmetrical 466 * so that the callers will always have to release. This 467 * helps the callers of this function because they continue 468 * to use the IRE after adding and hence they don't have to 469 * lookup again after we return the IRE. 470 * 471 * NOTE : We don't have to use atomics as this is appearing 472 * in the list for the first time and no one else can bump 473 * up the reference count on this yet. 474 */ 475 ire_refhold_locked(ire); 476 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); 477 irb_ptr->irb_ire_cnt++; 478 479 if (ire->ire_ill != NULL) { 480 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, 481 (char *), "ire", (void *), ire); 482 ire->ire_ill->ill_ire_cnt++; 483 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 484 } 485 ire_atomic_end(irb_ptr, ire); 486 487 /* Make any caching of the IREs be notified or updated */ 488 ire_flush_cache_v6(ire, IRE_FLUSH_ADD); 489 490 return (ire); 491 } 492 493 /* 494 * Search for all HOST REDIRECT routes that are 495 * pointing at the specified gateway and 496 * delete them. This routine is called only 497 * when a default gateway is going away. 498 */ 499 static void 500 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) 501 { 502 irb_t *irb_ptr; 503 irb_t *irb; 504 ire_t *ire; 505 in6_addr_t gw_addr_v6; 506 int i; 507 508 /* get the hash table for HOST routes */ 509 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)]; 510 if (irb_ptr == NULL) 511 return; 512 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { 513 irb = &irb_ptr[i]; 514 irb_refhold(irb); 515 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 516 if (!(ire->ire_flags & RTF_DYNAMIC)) 517 continue; 518 mutex_enter(&ire->ire_lock); 519 gw_addr_v6 = ire->ire_gateway_addr_v6; 520 mutex_exit(&ire->ire_lock); 521 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) 522 ire_delete(ire); 523 } 524 irb_refrele(irb); 525 } 526 } 527 528 /* 529 * Delete the specified IRE. 530 * All calls should use ire_delete(). 531 * Sometimes called as writer though not required by this function. 532 * 533 * NOTE : This function is called only if the ire was added 534 * in the list. 535 */ 536 void 537 ire_delete_v6(ire_t *ire) 538 { 539 in6_addr_t gw_addr_v6; 540 ip_stack_t *ipst = ire->ire_ipst; 541 542 /* 543 * Make sure ire_generation increases from ire_flush_cache happen 544 * after any lookup/reader has read ire_generation. 545 * Since the rw_enter makes us wait until any lookup/reader has 546 * completed we can exit the lock immediately. 547 */ 548 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 549 rw_exit(&ipst->ips_ip6_ire_head_lock); 550 551 ASSERT(ire->ire_refcnt >= 1); 552 ASSERT(ire->ire_ipversion == IPV6_VERSION); 553 554 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); 555 556 if (ire->ire_type == IRE_DEFAULT) { 557 /* 558 * when a default gateway is going away 559 * delete all the host redirects pointing at that 560 * gateway. 561 */ 562 mutex_enter(&ire->ire_lock); 563 gw_addr_v6 = ire->ire_gateway_addr_v6; 564 mutex_exit(&ire->ire_lock); 565 ire_delete_host_redirects_v6(&gw_addr_v6, ipst); 566 } 567 568 /* 569 * If we are deleting an IRE_INTERFACE then we make sure we also 570 * delete any IRE_IF_CLONE that has been created from it. 571 * Those are always in ire_dep_children. 572 */ 573 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) 574 ire_dep_delete_if_clone(ire); 575 576 /* Remove from parent dependencies and child */ 577 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 578 if (ire->ire_dep_parent != NULL) { 579 ire_dep_remove(ire); 580 } 581 while (ire->ire_dep_children != NULL) 582 ire_dep_remove(ire->ire_dep_children); 583 rw_exit(&ipst->ips_ire_dep_lock); 584 } 585 586 /* 587 * When an IRE is added or deleted this routine is called to make sure 588 * any caching of IRE information is notified or updated. 589 * 590 * The flag argument indicates if the flush request is due to addition 591 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 592 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 593 */ 594 void 595 ire_flush_cache_v6(ire_t *ire, int flag) 596 { 597 ip_stack_t *ipst = ire->ire_ipst; 598 599 /* 600 * IRE_IF_CLONE ire's don't provide any new information 601 * than the parent from which they are cloned, so don't 602 * perturb the generation numbers. 603 */ 604 if (ire->ire_type & IRE_IF_CLONE) 605 return; 606 607 /* 608 * Ensure that an ire_add during a lookup serializes the updates of 609 * the generation numbers under ire_head_lock so that the lookup gets 610 * either the old ire and old generation number, or a new ire and new 611 * generation number. 612 */ 613 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 614 615 /* 616 * If a route was just added, we need to notify everybody that 617 * has cached an IRE_NOROUTE since there might now be a better 618 * route for them. 619 */ 620 if (flag == IRE_FLUSH_ADD) { 621 ire_increment_generation(ipst->ips_ire_reject_v6); 622 ire_increment_generation(ipst->ips_ire_blackhole_v6); 623 } 624 625 /* Adding a default can't otherwise provide a better route */ 626 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 627 rw_exit(&ipst->ips_ip6_ire_head_lock); 628 return; 629 } 630 631 switch (flag) { 632 case IRE_FLUSH_DELETE: 633 case IRE_FLUSH_GWCHANGE: 634 /* 635 * Update ire_generation for all ire_dep_children chains 636 * starting with this IRE 637 */ 638 ire_dep_incr_generation(ire); 639 break; 640 case IRE_FLUSH_ADD: { 641 in6_addr_t addr; 642 in6_addr_t mask; 643 ip_stack_t *ipst = ire->ire_ipst; 644 uint_t masklen; 645 646 /* 647 * Find an IRE which is a shorter match than the ire to be added 648 * For any such IRE (which we repeat) we update the 649 * ire_generation the same way as in the delete case. 650 */ 651 addr = ire->ire_addr_v6; 652 mask = ire->ire_mask_v6; 653 masklen = ip_mask_to_plen_v6(&mask); 654 655 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, 656 ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 657 while (ire != NULL) { 658 /* We need to handle all in the same bucket */ 659 irb_increment_generation(ire->ire_bucket); 660 661 mask = ire->ire_mask_v6; 662 ASSERT(masklen > ip_mask_to_plen_v6(&mask)); 663 masklen = ip_mask_to_plen_v6(&mask); 664 ire_refrele(ire); 665 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, 666 NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 667 } 668 } 669 break; 670 } 671 rw_exit(&ipst->ips_ip6_ire_head_lock); 672 } 673 674 /* 675 * Matches the arguments passed with the values in the ire. 676 * 677 * Note: for match types that match using "ill" passed in, ill 678 * must be checked for non-NULL before calling this routine. 679 */ 680 boolean_t 681 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, 682 const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, 683 const ts_label_t *tsl, int match_flags) 684 { 685 in6_addr_t masked_addr; 686 in6_addr_t gw_addr_v6; 687 ill_t *ire_ill = NULL, *dst_ill; 688 ip_stack_t *ipst = ire->ire_ipst; 689 690 ASSERT(ire->ire_ipversion == IPV6_VERSION); 691 ASSERT(addr != NULL); 692 ASSERT(mask != NULL); 693 ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); 694 ASSERT((!(match_flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL))) || 695 (ill != NULL && ill->ill_isv6)); 696 697 /* 698 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it 699 * is in fact hidden, to ensure the caller gets the right one. 700 */ 701 if (ire->ire_testhidden) { 702 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) 703 return (B_FALSE); 704 } 705 706 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 707 ire->ire_zoneid != ALL_ZONES) { 708 /* 709 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid 710 * does not match that of ire_zoneid, a failure to 711 * match is reported at this point. Otherwise, since some IREs 712 * that are available in the global zone can be used in local 713 * zones, additional checks need to be performed: 714 * 715 * IRE_LOOPBACK 716 * entries should never be matched in this situation. 717 * Each zone has its own IRE_LOOPBACK. 718 * 719 * IRE_LOCAL 720 * We allow them for any zoneid. ire_route_recursive 721 * does additional checks when 722 * ip_restrict_interzone_loopback is set. 723 * 724 * If ill_usesrc_ifindex is set 725 * Then we check if the zone has a valid source address 726 * on the usesrc ill. 727 * 728 * If ire_ill is set, then check that the zone has an ipif 729 * on that ill. 730 * 731 * Outside of this function (in ire_round_robin) we check 732 * that any IRE_OFFLINK has a gateway that reachable from the 733 * zone when we have multiple choices (ECMP). 734 */ 735 if (match_flags & MATCH_IRE_ZONEONLY) 736 return (B_FALSE); 737 if (ire->ire_type & IRE_LOOPBACK) 738 return (B_FALSE); 739 740 if (ire->ire_type & IRE_LOCAL) 741 goto matchit; 742 743 /* 744 * The normal case of IRE_ONLINK has a matching zoneid. 745 * Here we handle the case when shared-IP zones have been 746 * configured with IP addresses on vniN. In that case it 747 * is ok for traffic from a zone to use IRE_ONLINK routes 748 * if the ill has a usesrc pointing at vniN 749 * Applies to IRE_INTERFACE. 750 */ 751 dst_ill = ire->ire_ill; 752 if (ire->ire_type & IRE_ONLINK) { 753 uint_t ifindex; 754 755 /* 756 * Note there is no IRE_INTERFACE on vniN thus 757 * can't do an IRE lookup for a matching route. 758 */ 759 ifindex = dst_ill->ill_usesrc_ifindex; 760 if (ifindex == 0) 761 return (B_FALSE); 762 763 /* 764 * If there is a usable source address in the 765 * zone, then it's ok to return this IRE_INTERFACE 766 */ 767 if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 768 zoneid, ipst)) { 769 ip3dbg(("ire_match_args: no usrsrc for zone" 770 " dst_ill %p\n", (void *)dst_ill)); 771 return (B_FALSE); 772 } 773 } 774 /* 775 * For example, with 776 * route add 11.0.0.0 gw1 -ifp bge0 777 * route add 11.0.0.0 gw2 -ifp bge1 778 * this code would differentiate based on 779 * where the sending zone has addresses. 780 * Only if the zone has an address on bge0 can it use the first 781 * route. It isn't clear if this behavior is documented 782 * anywhere. 783 */ 784 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 785 ipif_t *tipif; 786 787 mutex_enter(&dst_ill->ill_lock); 788 for (tipif = dst_ill->ill_ipif; 789 tipif != NULL; tipif = tipif->ipif_next) { 790 if (!IPIF_IS_CONDEMNED(tipif) && 791 (tipif->ipif_flags & IPIF_UP) && 792 (tipif->ipif_zoneid == zoneid || 793 tipif->ipif_zoneid == ALL_ZONES)) 794 break; 795 } 796 mutex_exit(&dst_ill->ill_lock); 797 if (tipif == NULL) 798 return (B_FALSE); 799 } 800 } 801 802 matchit: 803 ire_ill = ire->ire_ill; 804 if (match_flags & MATCH_IRE_GW) { 805 mutex_enter(&ire->ire_lock); 806 gw_addr_v6 = ire->ire_gateway_addr_v6; 807 mutex_exit(&ire->ire_lock); 808 } 809 if (match_flags & MATCH_IRE_ILL) { 810 811 /* 812 * If asked to match an ill, we *must* match 813 * on the ire_ill for ipmp test addresses, or 814 * any of the ill in the group for data addresses. 815 * If we don't, we may as well fail. 816 * However, we need an exception for IRE_LOCALs to ensure 817 * we loopback packets even sent to test addresses on different 818 * interfaces in the group. 819 */ 820 if ((match_flags & MATCH_IRE_TESTHIDDEN) && 821 !(ire->ire_type & IRE_LOCAL)) { 822 if (ire->ire_ill != ill) 823 return (B_FALSE); 824 } else { 825 match_flags &= ~MATCH_IRE_TESTHIDDEN; 826 /* 827 * We know that ill is not NULL, but ire_ill could be 828 * NULL 829 */ 830 if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) 831 return (B_FALSE); 832 } 833 } 834 if (match_flags & MATCH_IRE_SRC_ILL) { 835 if (ire_ill == NULL) 836 return (B_FALSE); 837 if (!IS_ON_SAME_LAN(ill, ire_ill)) { 838 if (ire_ill->ill_usesrc_ifindex == 0 || 839 (ire_ill->ill_usesrc_ifindex != 840 ill->ill_phyint->phyint_ifindex)) 841 return (B_FALSE); 842 } 843 } 844 845 /* No ire_addr_v6 bits set past the mask */ 846 ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, 847 ire->ire_addr_v6)); 848 V6_MASK_COPY(*addr, *mask, masked_addr); 849 if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && 850 ((!(match_flags & MATCH_IRE_GW)) || 851 ((!(match_flags & MATCH_IRE_DIRECT)) || 852 !(ire->ire_flags & RTF_INDIRECT)) && 853 IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && 854 ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && 855 ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && 856 ((!(match_flags & MATCH_IRE_MASK)) || 857 (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && 858 ((!(match_flags & MATCH_IRE_SECATTR)) || 859 (!is_system_labeled()) || 860 (tsol_ire_match_gwattr(ire, tsl) == 0))) { 861 /* We found the matched IRE */ 862 return (B_TRUE); 863 } 864 return (B_FALSE); 865 } 866 867 /* 868 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified 869 * gateway address. If ill is non-NULL we also match on it. 870 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 871 */ 872 boolean_t 873 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, 874 const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) 875 { 876 ire_t *ire; 877 uint_t match_flags; 878 879 if (lock_held) 880 ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); 881 else 882 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 883 884 match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; 885 if (ill != NULL) 886 match_flags |= MATCH_IRE_ILL; 887 888 ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, 889 &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, 890 ipst); 891 892 if (!lock_held) 893 rw_exit(&ipst->ips_ip6_ire_head_lock); 894 if (ire != NULL) { 895 ire_refrele(ire); 896 return (B_TRUE); 897 } else { 898 return (B_FALSE); 899 } 900 } 901 902 /* 903 * Lookup a route in forwarding table. 904 * specific lookup is indicated by passing the 905 * required parameters and indicating the 906 * match required in flag field. 907 * 908 * Supports link-local addresses by following the ipif/ill when recursing. 909 */ 910 ire_t * 911 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, 912 const in6_addr_t *gateway, int type, const ill_t *ill, 913 zoneid_t zoneid, const ts_label_t *tsl, int flags, 914 uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 915 { 916 ire_t *ire = NULL; 917 918 ASSERT(addr != NULL); 919 ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); 920 ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); 921 ASSERT(ill == NULL || ill->ill_isv6); 922 923 ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); 924 925 /* 926 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL 927 * or MATCH_IRE_SRC_ILL is set. 928 */ 929 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) 930 return (NULL); 931 932 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 933 ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, 934 tsl, flags, ipst); 935 if (ire == NULL) { 936 rw_exit(&ipst->ips_ip6_ire_head_lock); 937 return (NULL); 938 } 939 940 /* 941 * round-robin only if we have more than one route in the bucket. 942 * ips_ip_ecmp_behavior controls when we do ECMP 943 * 2: always 944 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 945 * 0: never 946 * 947 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 948 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 949 * and the IRE_INTERFACESs are likely to be shorter matches. 950 */ 951 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 952 if (ipst->ips_ip_ecmp_behavior == 2 || 953 (ipst->ips_ip_ecmp_behavior == 1 && 954 IS_DEFAULT_ROUTE_V6(ire))) { 955 ire_t *next_ire; 956 ire_ftable_args_t margs; 957 958 bzero(&margs, sizeof (margs)); 959 margs.ift_addr_v6 = *addr; 960 if (mask != NULL) 961 margs.ift_mask_v6 = *mask; 962 if (gateway != NULL) 963 margs.ift_gateway_v6 = *gateway; 964 margs.ift_type = type; 965 margs.ift_ill = ill; 966 margs.ift_zoneid = zoneid; 967 margs.ift_tsl = tsl; 968 margs.ift_flags = flags; 969 970 next_ire = ire_round_robin(ire->ire_bucket, &margs, 971 xmit_hint, ire, ipst); 972 if (next_ire == NULL) { 973 /* keep ire if next_ire is null */ 974 goto done; 975 } 976 ire_refrele(ire); 977 ire = next_ire; 978 } 979 } 980 981 done: 982 /* Return generation before dropping lock */ 983 if (generationp != NULL) 984 *generationp = ire->ire_generation; 985 986 rw_exit(&ipst->ips_ip6_ire_head_lock); 987 988 /* 989 * For shared-IP zones we need additional checks to what was 990 * done in ire_match_args to make sure IRE_LOCALs are handled. 991 * 992 * When ip_restrict_interzone_loopback is set, then 993 * we ensure that IRE_LOCAL are only used for loopback 994 * between zones when the logical "Ethernet" would 995 * have looped them back. That is, if in the absense of 996 * the IRE_LOCAL we would have sent to packet out the 997 * same ill. 998 */ 999 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 1000 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 1001 ipst->ips_ip_restrict_interzone_loopback) { 1002 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 1003 ASSERT(ire != NULL); 1004 } 1005 1006 return (ire); 1007 } 1008 1009 /* 1010 * Look up a single ire. The caller holds either the read or write lock. 1011 */ 1012 ire_t * 1013 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 1014 const in6_addr_t *gateway, int type, const ill_t *ill, 1015 zoneid_t zoneid, const ts_label_t *tsl, int flags, 1016 ip_stack_t *ipst) 1017 { 1018 irb_t *irb_ptr; 1019 ire_t *ire = NULL; 1020 int i; 1021 1022 ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); 1023 1024 /* 1025 * If the mask is known, the lookup 1026 * is simple, if the mask is not known 1027 * we need to search. 1028 */ 1029 if (flags & MATCH_IRE_MASK) { 1030 uint_t masklen; 1031 1032 masklen = ip_mask_to_plen_v6(mask); 1033 if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { 1034 return (NULL); 1035 } 1036 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ 1037 IRE_ADDR_MASK_HASH_V6(*addr, *mask, 1038 ipst->ips_ip6_ftable_hash_size)]); 1039 rw_enter(&irb_ptr->irb_lock, RW_READER); 1040 for (ire = irb_ptr->irb_ire; ire != NULL; 1041 ire = ire->ire_next) { 1042 if (IRE_IS_CONDEMNED(ire)) 1043 continue; 1044 if (ire_match_args_v6(ire, addr, mask, gateway, type, 1045 ill, zoneid, tsl, flags)) 1046 goto found_ire; 1047 } 1048 rw_exit(&irb_ptr->irb_lock); 1049 } else { 1050 uint_t masklen; 1051 1052 /* 1053 * In this case we don't know the mask, we need to 1054 * search the table assuming different mask sizes. 1055 */ 1056 if (flags & MATCH_IRE_SHORTERMASK) { 1057 masklen = ip_mask_to_plen_v6(mask); 1058 if (masklen == 0) { 1059 /* Nothing shorter than zero */ 1060 return (NULL); 1061 } 1062 masklen--; 1063 } else { 1064 masklen = IP6_MASK_TABLE_SIZE - 1; 1065 } 1066 1067 for (i = masklen; i >= 0; i--) { 1068 in6_addr_t tmpmask; 1069 1070 if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) 1071 continue; 1072 (void) ip_plen_to_mask_v6(i, &tmpmask); 1073 irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][ 1074 IRE_ADDR_MASK_HASH_V6(*addr, tmpmask, 1075 ipst->ips_ip6_ftable_hash_size)]; 1076 rw_enter(&irb_ptr->irb_lock, RW_READER); 1077 for (ire = irb_ptr->irb_ire; ire != NULL; 1078 ire = ire->ire_next) { 1079 if (IRE_IS_CONDEMNED(ire)) 1080 continue; 1081 if (ire_match_args_v6(ire, addr, 1082 &ire->ire_mask_v6, gateway, type, ill, 1083 zoneid, tsl, flags)) 1084 goto found_ire; 1085 } 1086 rw_exit(&irb_ptr->irb_lock); 1087 } 1088 } 1089 ASSERT(ire == NULL); 1090 ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); 1091 return (NULL); 1092 1093 found_ire: 1094 ire_refhold(ire); 1095 rw_exit(&irb_ptr->irb_lock); 1096 return (ire); 1097 } 1098 1099 1100 /* 1101 * This function is called by 1102 * ip_input/ire_route_recursive when doing a route lookup on only the 1103 * destination address. 1104 * 1105 * The optimizations of this function over ire_ftable_lookup are: 1106 * o removing unnecessary flag matching 1107 * o doing longest prefix match instead of overloading it further 1108 * with the unnecessary "best_prefix_match" 1109 * 1110 * If no route is found we return IRE_NOROUTE. 1111 */ 1112 ire_t * 1113 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, 1114 ip_stack_t *ipst, uint_t *generationp) 1115 { 1116 ire_t *ire; 1117 1118 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, 1119 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); 1120 if (ire == NULL) { 1121 ire = ire_reject(ipst, B_TRUE); 1122 if (generationp != NULL) 1123 *generationp = IRE_GENERATION_VERIFY; 1124 } 1125 /* ftable_lookup did round robin */ 1126 return (ire); 1127 } 1128 1129 ire_t * 1130 ip_select_route_v6(const in6_addr_t *dst, const in6_addr_t src, 1131 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 1132 int *errorp, boolean_t *multirtp) 1133 { 1134 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); 1135 1136 return (ip_select_route(dst, src, ixa, generationp, setsrcp, errorp, 1137 multirtp)); 1138 } 1139 1140 /* 1141 * Recursively look for a route to the destination. Can also match on 1142 * the zoneid, ill, and label. Used for the data paths. See also 1143 * ire_route_recursive_dstonly. 1144 * 1145 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1146 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1147 * forwarding. 1148 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1149 * resolve the gateway. 1150 * 1151 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1152 * instead. 1153 * 1154 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1155 * is an error. 1156 * Allow at most one RTF_INDIRECT. 1157 */ 1158 ire_t * 1159 ire_route_recursive_impl_v6(ire_t *ire, 1160 const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, 1161 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1162 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1163 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1164 { 1165 int i, j; 1166 in6_addr_t v6nexthop = *nexthop; 1167 ire_t *ires[MAX_IRE_RECURSION]; 1168 uint_t generation; 1169 uint_t generations[MAX_IRE_RECURSION]; 1170 boolean_t need_refrele = B_FALSE; 1171 boolean_t invalidate = B_FALSE; 1172 ill_t *ill = NULL; 1173 uint_t maskoff = (IRE_LOCAL|IRE_LOOPBACK); 1174 1175 if (setsrcp != NULL) 1176 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1177 if (gwattrp != NULL) 1178 ASSERT(*gwattrp == NULL); 1179 1180 /* 1181 * We iterate up to three times to resolve a route, even though 1182 * we have four slots in the array. The extra slot is for an 1183 * IRE_IF_CLONE we might need to create. 1184 */ 1185 i = 0; 1186 while (i < MAX_IRE_RECURSION - 1) { 1187 /* ire_ftable_lookup handles round-robin/ECMP */ 1188 if (ire == NULL) { 1189 ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, 1190 (ill != NULL ? ill : ill_arg), zoneid, tsl, 1191 match_args, xmit_hint, ipst, &generation); 1192 } else { 1193 /* Caller passed it; extra hold since we will rele */ 1194 ire_refhold(ire); 1195 if (generationp != NULL) 1196 generation = *generationp; 1197 else 1198 generation = IRE_GENERATION_VERIFY; 1199 } 1200 1201 if (ire == NULL) { 1202 if (i > 0 && (irr_flags & IRR_INCOMPLETE)) { 1203 ire = ires[0]; 1204 ire_refhold(ire); 1205 } else { 1206 ire = ire_reject(ipst, B_TRUE); 1207 } 1208 goto error; 1209 } 1210 1211 /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1212 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1213 goto error; 1214 1215 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1216 1217 /* 1218 * Don't allow anything unusual past the first iteration. 1219 * After the first lookup, we should no longer look for 1220 * (IRE_LOCAL|IRE_LOOPBACK) or RTF_INDIRECT routes. 1221 * 1222 * In addition, after we have found a direct IRE_OFFLINK, 1223 * we should only look for interface or clone routes. 1224 */ 1225 match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */ 1226 if ((ire->ire_type & IRE_OFFLINK) && 1227 !(ire->ire_flags & RTF_INDIRECT)) { 1228 ire_type = IRE_IF_ALL; 1229 } else { 1230 if (!(match_args & MATCH_IRE_TYPE)) 1231 ire_type = (IRE_OFFLINK|IRE_ONLINK); 1232 ire_type &= ~maskoff; /* no more LOCAL, LOOPBACK */ 1233 } 1234 match_args |= MATCH_IRE_TYPE; 1235 /* We have a usable IRE */ 1236 ires[i] = ire; 1237 generations[i] = generation; 1238 i++; 1239 1240 /* The first RTF_SETSRC address is passed back if setsrcp */ 1241 if ((ire->ire_flags & RTF_SETSRC) && 1242 setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { 1243 ASSERT(!IN6_IS_ADDR_UNSPECIFIED( 1244 &ire->ire_setsrc_addr_v6)); 1245 *setsrcp = ire->ire_setsrc_addr_v6; 1246 } 1247 1248 /* The first ire_gw_secattr is passed back if gwattrp */ 1249 if (ire->ire_gw_secattr != NULL && 1250 gwattrp != NULL && *gwattrp == NULL) 1251 *gwattrp = ire->ire_gw_secattr; 1252 1253 /* 1254 * Check if we have a short-cut pointer to an IRE for this 1255 * destination, and that the cached dependency isn't stale. 1256 * In that case we've rejoined an existing tree towards a 1257 * parent, thus we don't need to continue the loop to 1258 * discover the rest of the tree. 1259 */ 1260 mutex_enter(&ire->ire_lock); 1261 if (ire->ire_dep_parent != NULL && 1262 ire->ire_dep_parent->ire_generation == 1263 ire->ire_dep_parent_generation) { 1264 mutex_exit(&ire->ire_lock); 1265 ire = NULL; 1266 goto done; 1267 } 1268 mutex_exit(&ire->ire_lock); 1269 1270 /* 1271 * If this type should have an ire_nce_cache (even if it 1272 * doesn't yet have one) then we are done. Includes 1273 * IRE_INTERFACE with a full 128 bit mask. 1274 */ 1275 if (ire->ire_nce_capable) { 1276 ire = NULL; 1277 goto done; 1278 } 1279 ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1280 /* 1281 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1282 * particular destination 1283 */ 1284 if (ire->ire_type & IRE_INTERFACE) { 1285 ire_t *clone; 1286 1287 ASSERT(ire->ire_masklen != IPV6_ABITS); 1288 1289 /* 1290 * In the case of ip_input and ILLF_FORWARDING not 1291 * being set, and in the case of RTM_GET, there is 1292 * no point in allocating an IRE_IF_CLONE. We return 1293 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can 1294 * result in a ire_dep_parent which is IRE_IF_* 1295 * without an IRE_IF_CLONE. 1296 * We recover from that when we need to send packets 1297 * by ensuring that the generations become 1298 * IRE_GENERATION_VERIFY in this case. 1299 */ 1300 if (!(irr_flags & IRR_ALLOCATE)) { 1301 invalidate = B_TRUE; 1302 ire = NULL; 1303 goto done; 1304 } 1305 1306 clone = ire_create_if_clone(ire, &v6nexthop, 1307 &generation); 1308 if (clone == NULL) { 1309 /* 1310 * Temporary failure - no memory. 1311 * Don't want caller to cache IRE_NOROUTE. 1312 */ 1313 invalidate = B_TRUE; 1314 ire = ire_blackhole(ipst, B_TRUE); 1315 goto error; 1316 } 1317 /* 1318 * Make clone next to last entry and the 1319 * IRE_INTERFACE the last in the dependency 1320 * chain since the clone depends on the 1321 * IRE_INTERFACE. 1322 */ 1323 ASSERT(i >= 1); 1324 ASSERT(i < MAX_IRE_RECURSION); 1325 1326 ires[i] = ires[i-1]; 1327 generations[i] = generations[i-1]; 1328 ires[i-1] = clone; 1329 generations[i-1] = generation; 1330 i++; 1331 1332 ire = NULL; 1333 goto done; 1334 } 1335 1336 /* 1337 * We only match on the type and optionally ILL when 1338 * recursing. The type match is used by some callers 1339 * to exclude certain types (such as IRE_IF_CLONE or 1340 * IRE_LOCAL|IRE_LOOPBACK). 1341 * 1342 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof' 1343 * ire->ire_ill, and we want to find the IRE_INTERFACE for 1344 * ire_ill, so we set ill to the ire_ill 1345 */ 1346 match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT); 1347 v6nexthop = ire->ire_gateway_addr_v6; 1348 if (ill == NULL && ire->ire_ill != NULL) { 1349 ill = ire->ire_ill; 1350 need_refrele = B_TRUE; 1351 ill_refhold(ill); 1352 match_args |= MATCH_IRE_ILL; 1353 } 1354 ire = NULL; 1355 } 1356 ASSERT(ire == NULL); 1357 ire = ire_reject(ipst, B_TRUE); 1358 1359 error: 1360 ASSERT(ire != NULL); 1361 if (need_refrele) 1362 ill_refrele(ill); 1363 1364 /* 1365 * In the case of MULTIRT we want to try a different IRE the next 1366 * time. We let the next packet retry in that case. 1367 */ 1368 if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1369 (void) ire_no_good(ires[0]); 1370 1371 cleanup: 1372 /* cleanup ires[i] */ 1373 ire_dep_unbuild(ires, i); 1374 for (j = 0; j < i; j++) 1375 ire_refrele(ires[j]); 1376 1377 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1378 (irr_flags & IRR_INCOMPLETE)); 1379 /* 1380 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1381 * ip_select_route since the reject or lack of memory might be gone. 1382 */ 1383 if (generationp != NULL) 1384 *generationp = IRE_GENERATION_VERIFY; 1385 return (ire); 1386 1387 done: 1388 ASSERT(ire == NULL); 1389 if (need_refrele) 1390 ill_refrele(ill); 1391 1392 /* Build dependencies */ 1393 if (i > 1 && !ire_dep_build(ires, generations, i)) { 1394 /* Something in chain was condemned; tear it apart */ 1395 ire = ire_blackhole(ipst, B_TRUE); 1396 goto cleanup; 1397 } 1398 1399 /* 1400 * Release all refholds except the one for ires[0] that we 1401 * will return to the caller. 1402 */ 1403 for (j = 1; j < i; j++) 1404 ire_refrele(ires[j]); 1405 1406 if (invalidate) { 1407 /* 1408 * Since we needed to allocate but couldn't we need to make 1409 * sure that the dependency chain is rebuilt the next time. 1410 */ 1411 ire_dep_invalidate_generations(ires[0]); 1412 generation = IRE_GENERATION_VERIFY; 1413 } else { 1414 /* 1415 * IREs can have been added or deleted while we did the 1416 * recursive lookup and we can't catch those until we've built 1417 * the dependencies. We verify the stored 1418 * ire_dep_parent_generation to catch any such changes and 1419 * return IRE_GENERATION_VERIFY (which will cause 1420 * ip_select_route to be called again so we can redo the 1421 * recursive lookup next time we send a packet. 1422 */ 1423 if (ires[0]->ire_dep_parent == NULL) 1424 generation = ires[0]->ire_generation; 1425 else 1426 generation = ire_dep_validate_generations(ires[0]); 1427 if (generations[0] != ires[0]->ire_generation) { 1428 /* Something changed at the top */ 1429 generation = IRE_GENERATION_VERIFY; 1430 } 1431 } 1432 if (generationp != NULL) 1433 *generationp = generation; 1434 1435 return (ires[0]); 1436 } 1437 1438 ire_t * 1439 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, 1440 const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1441 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1442 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1443 { 1444 return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, 1445 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, 1446 gwattrp, generationp)); 1447 } 1448 1449 /* 1450 * Recursively look for a route to the destination. 1451 * We only handle a destination match here, yet we have the same arguments 1452 * as the full match to allow function pointers to select between the two. 1453 * 1454 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1455 * instead. 1456 * 1457 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1458 * is an error. 1459 * Allow at most one RTF_INDIRECT. 1460 */ 1461 ire_t * 1462 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags, 1463 uint32_t xmit_hint, ip_stack_t *ipst) 1464 { 1465 ire_t *ire; 1466 ire_t *ire1; 1467 uint_t generation; 1468 1469 /* ire_ftable_lookup handles round-robin/ECMP */ 1470 ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, 1471 &generation); 1472 ASSERT(ire != NULL); 1473 1474 /* 1475 * If this type should have an ire_nce_cache (even if it 1476 * doesn't yet have one) then we are done. Includes 1477 * IRE_INTERFACE with a full 128 bit mask. 1478 */ 1479 if (ire->ire_nce_capable) 1480 return (ire); 1481 1482 /* 1483 * If the IRE has a current cached parent we know that the whole 1484 * parent chain is current, hence we don't need to discover and 1485 * build any dependencies by doing a recursive lookup. 1486 */ 1487 mutex_enter(&ire->ire_lock); 1488 if (ire->ire_dep_parent != NULL && 1489 ire->ire_dep_parent->ire_generation == 1490 ire->ire_dep_parent_generation) { 1491 mutex_exit(&ire->ire_lock); 1492 return (ire); 1493 } 1494 mutex_exit(&ire->ire_lock); 1495 1496 /* 1497 * Fallback to loop in the normal code starting with the ire 1498 * we found. Normally this would return the same ire. 1499 */ 1500 ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, 1501 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1502 &generation); 1503 ire_refrele(ire); 1504 return (ire1); 1505 } 1506