1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/strsun.h> 30 #include <sys/zone.h> 31 #include <sys/ddi.h> 32 #include <sys/sunddi.h> 33 #include <sys/cmn_err.h> 34 #include <sys/debug.h> 35 #include <sys/atomic.h> 36 #define _SUN_TPI_VERSION 2 37 #include <sys/tihdr.h> 38 39 #include <inet/common.h> 40 #include <inet/mi.h> 41 #include <inet/mib2.h> 42 #include <inet/snmpcom.h> 43 44 #include <netinet/ip6.h> 45 #include <netinet/icmp6.h> 46 47 #include <inet/ip.h> 48 #include <inet/ip_impl.h> 49 #include <inet/ip6.h> 50 #include <inet/ip6_asp.h> 51 #include <inet/ip_multi.h> 52 #include <inet/ip_if.h> 53 #include <inet/ip_ire.h> 54 #include <inet/ip_ftable.h> 55 #include <inet/ip_rts.h> 56 #include <inet/ip_ndp.h> 57 #include <inet/ipclassifier.h> 58 #include <inet/ip_listutils.h> 59 60 #include <sys/sunddi.h> 61 62 /* 63 * Routines for handling destination cache entries. 64 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time. 
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 * if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *	The DCE has changed
 *	mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *	    &mystruct->my_dce_generation);
 *	Not needed in practice, since we have the default DCE:
 *	if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *		return failure;
 * }
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 */

/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;	/* protects dcb_dce list and count */
	uint32_t	dcb_cnt;	/* number of entries on dcb_dce */
	dce_t		*dcb_dce;	/* doubly-linked list of entries */
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

/* Global kmem cache for all dce_t allocations, including per-stack defaults */
static kmem_cache_t *dce_cache;


/* Operates on a uint64_t */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of dce's in the dcb.
 * For now we have a higher probability to delete DCEs without DCE_PMTU.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		/*
		 * Hash the entry's own address to pseudo-randomly pick
		 * roughly one in `fraction' entries for deletion; entries
		 * that still carry a path MTU use fraction_pmtu (4x
		 * fraction) and are thus deleted four times less often.
		 */
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * kmem_cache callback to free up memory.
 *
 * Reclaims a fraction of the entries in every v4 and v6 bucket of the
 * given stack, then walks the conns so they drop cached refholds and the
 * condemned entries can actually be freed.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.  Runs the per-stack reclaim for every netstack.
 */
/* ARGSUSED */
void
ip_dce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		ip_dce_reclaim_stack(ns->netstack_ip);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

/* Create the global dce cache; ip_dce_reclaim is its memory-reclaim callback */
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}

void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}


/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time = TICK_TO_SEC(lbolt64);
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

/* Tear down the hash tables and the default DCE created by dce_stack_init */
void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	/* Only the initial "never goes away" reference should remain */
	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t		*dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		/*
		 * Link-local destinations are not globally unique, so
		 * qualify the lookup with the outgoing ill's ifindex
		 * (taken from the nce) when one is available.
		 */
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}

/*
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
315 */ 316 dce_t * 317 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp) 318 { 319 uint_t hash; 320 dcb_t *dcb; 321 dce_t *dce; 322 323 /* Set *generationp before dropping the lock(s) that allow additions */ 324 if (generationp != NULL) 325 *generationp = ipst->ips_dce_default->dce_generation; 326 327 hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); 328 dcb = &ipst->ips_dce_hash_v4[hash]; 329 rw_enter(&dcb->dcb_lock, RW_READER); 330 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { 331 if (dce->dce_v4addr == dst) { 332 mutex_enter(&dce->dce_lock); 333 if (!DCE_IS_CONDEMNED(dce)) { 334 dce_refhold(dce); 335 if (generationp != NULL) 336 *generationp = dce->dce_generation; 337 mutex_exit(&dce->dce_lock); 338 rw_exit(&dcb->dcb_lock); 339 return (dce); 340 } 341 mutex_exit(&dce->dce_lock); 342 } 343 } 344 rw_exit(&dcb->dcb_lock); 345 /* Not found */ 346 dce = ipst->ips_dce_default; 347 dce_refhold(dce); 348 return (dce); 349 } 350 351 /* 352 * Used by callers that need to cache e.g., the datapath 353 * Returns the generation number in the last argument. 
 * ifindex should only be set for link-locals
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t	hash;
	dcb_t	*dcb;
	dce_t	*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		/* Both the address and the recorded ifindex must match */
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found; fall back to the always-present default DCE */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
398 */ 399 dce_t * 400 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) 401 { 402 uint_t hash; 403 dcb_t *dcb; 404 dce_t *dce; 405 406 hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); 407 dcb = &ipst->ips_dce_hash_v4[hash]; 408 rw_enter(&dcb->dcb_lock, RW_WRITER); 409 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { 410 if (dce->dce_v4addr == dst) { 411 mutex_enter(&dce->dce_lock); 412 if (!DCE_IS_CONDEMNED(dce)) { 413 dce_refhold(dce); 414 mutex_exit(&dce->dce_lock); 415 rw_exit(&dcb->dcb_lock); 416 return (dce); 417 } 418 mutex_exit(&dce->dce_lock); 419 } 420 } 421 dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP); 422 if (dce == NULL) { 423 rw_exit(&dcb->dcb_lock); 424 return (NULL); 425 } 426 bzero(dce, sizeof (dce_t)); 427 dce->dce_ipst = ipst; /* No netstack_hold */ 428 dce->dce_v4addr = dst; 429 dce->dce_generation = DCE_GENERATION_INITIAL; 430 dce->dce_ipversion = IPV4_VERSION; 431 dce->dce_last_change_time = TICK_TO_SEC(lbolt64); 432 dce_refhold(dce); /* For the hash list */ 433 434 /* Link into list */ 435 if (dcb->dcb_dce != NULL) 436 dcb->dcb_dce->dce_ptpn = &dce->dce_next; 437 dce->dce_next = dcb->dcb_dce; 438 dce->dce_ptpn = &dcb->dcb_dce; 439 dcb->dcb_dce = dce; 440 dce->dce_bucket = dcb; 441 dce_refhold(dce); /* For the caller */ 442 rw_exit(&dcb->dcb_lock); 443 444 /* Initialize dce_ident to be different than for the last packet */ 445 dce->dce_ident = ipst->ips_dce_default->dce_ident + 1; 446 447 dce_increment_generation(ipst->ips_dce_default); 448 return (dce); 449 } 450 451 /* 452 * Atomically looks for a non-default DCE, and if not found tries to create one. 453 * If there is no memory it returns NULL. 454 * When an entry is created we increase the generation number on 455 * the default DCE so that conn_ip_output will detect there is a new DCE. 456 * ifindex should only be used with link-local addresses. 
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	hash;
	dcb_t	*dcb;
	dce_t	*dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/* Hold as WRITER so lookup-then-insert is atomic for this bucket */
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	/* Let cached users of the default DCE notice the new entry */
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If we have an old cached value, average it with the new
		 * one; otherwise initialize conservatively to
		 * (1.5 * new value).
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		/* Same averaging/initialization for the rtt deviation */
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		/* Only ever shrink an existing path MTU */
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	}
	if (uinfo->iulp_ssthresh != 0) {
		/* Average with any previously cached slow start threshold */
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}

/*
 * Set/update uinfo for an IPv4 destination, creating the DCE if needed.
 * Returns ENOMEM if the entry could not be allocated, otherwise zero.
 */
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/*
 * Set/update uinfo for an IPv6 destination, creating the DCE if needed.
 * Returns ENOMEM if the entry could not be allocated, otherwise zero.
 */
int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}

/*
 * Mark the entry condemned by setting the designated generation number,
 * so holders of cached generations (and DCE_IS_CONDEMNED) notice.
 */
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t	generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		/* Skip the reserved condemned/verify sentinel values */
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}

/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			/* Skip entries already being deleted */
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	/* Unlink from the bucket's doubly-linked list */
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}

/* Last reference has gone away; free the entry */
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}

/* Drop a reference; frees the entry when the count reaches zero */
void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

/* Take a reference on the entry */
void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(lbolt64);

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			/* Only report a PMTU when one is actually cached */
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	/* Preserve a copy for the caller before we consume this one */
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			/* dce_ifindex is only recorded for link-locals */
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;
	dcb_t	*dcb;
	dce_t	*dce, *nextdce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);

		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
			/* Capture next before we unlink the current entry */
			nextdce = dce->dce_next;
			if (dce->dce_ifindex == ifindex) {
				dce_delete_locked(dcb, dce);
				dce_refrele(dce);
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
}