/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

/*
 * Routines for handling destination cache entries.
 * There is always one default entry (DCEF_DEFAULT) for each ip_stack_t,
 * created at init time. That entry holds both the IP ident value and the
 * dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to it, it is marked as condemned. The condemned mark is a designated
 * generation number which is never otherwise used, hence a single comparison
 * with the generation number captures that case as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 */
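
/*
 * A slightly fuller sketch of the caching pattern above (illustrative only;
 * "mystruct" and "default_mtu" are hypothetical, as in the example in the
 * block comment above). The old reference must be released before the cached
 * pointer is replaced, and dce_lookup_pkt() never returns NULL, so no error
 * path is needed on a miss:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		dce_refrele(mystruct->my_dce);
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *	}
 *	mtu = (mystruct->my_dce->dce_flags & DCEF_PMTU) ?
 *	    mystruct->my_dce->dce_pmtu : default_mtu;
 */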

/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;

/* Operates on a uint64_t */
#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of the DCEs in the bucket.
 * For now DCEs without DCEF_PMTU set are deleted with a higher probability.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction * 4;
	uint_t	hash;
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * Reclaim memory in one IP stack; called from the kmem_cache reclaim
 * callback ip_dce_reclaim() to free up memory.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references and drop any refholds
	 * they have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called directly by the memory allocator subsystem when the system is
 * running low on memory.
 */
/* ARGSUSED */
void
ip_dce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		ip_dce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}
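
/*
 * Worked example of the sampling in dcb_reclaim() (illustrative numbers;
 * ips_ip_dce_reclaim_fraction is a tunable): with fraction == 3,
 * "hash % 3 == 0" deletes roughly one third of the DCEs that lack a
 * current path MTU, while fraction_pmtu == 12 deletes only about one
 * twelfth of those with DCEF_PMTU set. RANDOM_HASH() XORs all four
 * 16-bit words of the dce_t's address together so that the value fed to
 * the modulus depends on the whole pointer rather than just its (heavily
 * aligned) low bits.
 */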

void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}

void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}

/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we use the IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t *dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}
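
/*
 * Illustrative call sequence (a sketch, not from the original file;
 * "my_mtu" stands in for whatever interface MTU the caller tracks). A
 * transmit path that needs the path MTU for a single packet could do:
 *
 *	uint_t gen;
 *	dce_t *dce;
 *
 *	dce = dce_lookup_pkt(mp, ixa, &gen);
 *	if (dce->dce_flags & DCEF_PMTU)
 *		my_mtu = MIN(my_mtu, dce->dce_pmtu);
 *	dce_refrele(dce);
 *
 * On a miss this returns the refheld default DCE, so the dce_refrele()
 * is always required.
 */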

/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 * The ifindex should only be set for link-local addresses.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
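
/*
 * Example of why dce_ifindex participates in the match (illustrative
 * addresses): fe80::1 on ifindex 2 and fe80::1 on ifindex 3 are distinct
 * destinations and get distinct entries, while a global address is
 * expected to be stored with dce_ifindex == 0, per the ifindex comments
 * above.
 */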

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one. If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
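
/*
 * The bucket list is doubly linked through dce_next and dce_ptpn:
 * dce_ptpn points at whichever pointer points at this entry, either
 * dcb_dce or the previous entry's dce_next. That is what lets
 * dce_delete_locked() unlink an entry without walking the bucket:
 *
 *	*dce->dce_ptpn = dce->dce_next;
 *	if (dce->dce_next != NULL)
 *		dce->dce_next->dce_ptpn = dce->dce_ptpn;
 */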

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one. If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * The ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* We should not create entries for link-locals without an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Set/update uinfo. The dce_update_uinfo*() wrappers below create a
 * per-destination DCE if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt + rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting the estimates to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively to 1.5 times the new value; otherwise
		 * average the old and new values.
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}
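
/*
 * Worked example of the smoothing above (illustrative numbers): a first
 * advise of iulp_rtt == 100 stores 100 + (100 >> 1) == 150, i.e., 1.5
 * times the new value. A later advise of 80 then stores
 * (150 + 80) >> 1 == 115, a simple running average. The same arithmetic
 * applies to iulp_rtt_sd and, for the averaging case, to iulp_ssthresh.
 */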

int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}

static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t *ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation number, avoiding the special condemned value.
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}
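
/*
 * Illustrative note (an observation, not new mechanism): since the
 * increment above skips the condemned sentinel and asserts it never
 * produces DCE_GENERATION_VERIFY, a caller can seed its cached
 * generation with DCE_GENERATION_VERIFY to guarantee a mismatch, and
 * hence a fresh lookup, on the first check:
 *
 *	mystruct->my_dce_generation = DCE_GENERATION_VERIFY;
 */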

/*
 * Increment the generation number on all DCEs in the given hash table,
 * as well as on the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * The caller needs to do a dce_refrele since we can't do the dce_refrele
 * under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}

static void
dce_inactive(dce_t *dce)
{
	ip_stack_t *ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}

void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet, hence the same as the functions above */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}
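
/*
 * Reference-count rules in brief (a summary of the code above, not a new
 * mechanism): each entry holds one reference for its hash bucket, dropped
 * by the caller of dce_delete_locked(), plus one per outstanding lookup.
 * The final dce_refrele() frees the entry via dce_inactive(), and the
 * default DCE's initial refcnt of 1 means it is only freed in
 * dce_stack_destroy().
 */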

/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}
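
/*
 * Note on the message flow above (a summary): each table is sent with its
 * own copy of the control message. The IPv4 table is qreply()ed first,
 * the first copy then carries the IPv6 table, and the second copy is
 * returned so the caller can append further MIB groups; a NULL return
 * means a copymsg() failed.
 */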

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale entries that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;
	dcb_t	*dcb;
	dce_t	*dce, *nextdce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);

		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
			nextdce = dce->dce_next;
			if (dce->dce_ifindex == ifindex) {
				dce_delete_locked(dcb, dce);
				dce_refrele(dce);
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
}
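
/*
 * Illustrative caller (an assumption, not from this file): interface
 * teardown would invoke
 *
 *	dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);
 *
 * once the ifindex is going away, so that later lookups for the departed
 * link-locals fall back to the default DCE instead of matching stale
 * entries.
 */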