/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
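 *
 * A consumer of this cache typically initializes its saved generation to
 * DCE_GENERATION_VERIFY, a value that is never assigned to an actual DCE
 * (see the ASSERT in dce_increment_generation() below), so that the first
 * comparison is guaranteed to fail and force a fresh lookup.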
 */

/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;


/* Operates on a uint64_t */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of dce's in the dcb.
 * For now we have a higher probability to delete DCEs without DCE_PMTU.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * kmem_cache callback to free up memory.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
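 * It is registered as the reclaim callback for dce_cache in dce_g_init()
 * below, and walks the DCE hash tables of every netstack in turn.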
 */
/* ARGSUSED */
void
ip_dce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		ip_dce_reclaim_stack(ns->netstack_ip);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}

void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}


/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

void
dce_stack_destroy(ip_stack_t *ipst)
{
	int	i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t *dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache the result, e.g., in the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t	final_dst;
		ipha_t		*ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t	ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
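		 * (For a type 0 routing header the final destination is the
		 * last address listed; ip_get_dst_v6() handles the parsing.)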
298 */ 299 in6_addr_t final_dst; 300 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 301 302 final_dst = ip_get_dst_v6(ip6h, mp, NULL); 303 ifindex = 0; 304 if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) { 305 ifindex = ixa->ixa_nce->nce_common->ncec_ill-> 306 ill_phyint->phyint_ifindex; 307 } 308 return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst, 309 generationp)); 310 } 311 } 312 313 /* 314 * Used by callers that need to cache e.g., the datapath 315 * Returns the generation number in the last argument. 316 */ 317 dce_t * 318 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp) 319 { 320 uint_t hash; 321 dcb_t *dcb; 322 dce_t *dce; 323 324 /* Set *generationp before dropping the lock(s) that allow additions */ 325 if (generationp != NULL) 326 *generationp = ipst->ips_dce_default->dce_generation; 327 328 hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); 329 dcb = &ipst->ips_dce_hash_v4[hash]; 330 rw_enter(&dcb->dcb_lock, RW_READER); 331 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { 332 if (dce->dce_v4addr == dst) { 333 mutex_enter(&dce->dce_lock); 334 if (!DCE_IS_CONDEMNED(dce)) { 335 dce_refhold(dce); 336 if (generationp != NULL) 337 *generationp = dce->dce_generation; 338 mutex_exit(&dce->dce_lock); 339 rw_exit(&dcb->dcb_lock); 340 return (dce); 341 } 342 mutex_exit(&dce->dce_lock); 343 } 344 } 345 rw_exit(&dcb->dcb_lock); 346 /* Not found */ 347 dce = ipst->ips_dce_default; 348 dce_refhold(dce); 349 return (dce); 350 } 351 352 /* 353 * Used by callers that need to cache e.g., the datapath 354 * Returns the generation number in the last argument. 355 * ifindex should only be set for link-locals 356 */ 357 dce_t * 358 dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst, 359 uint_t *generationp) 360 { 361 uint_t hash; 362 dcb_t *dcb; 363 dce_t *dce; 364 365 /* Set *generationp before dropping the lock(s) that allow additions */ 366 if (generationp != NULL) 367 *generationp = ipst->ips_dce_default->dce_generation; 368 369 hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize); 370 dcb = &ipst->ips_dce_hash_v6[hash]; 371 rw_enter(&dcb->dcb_lock, RW_READER); 372 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { 373 if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) && 374 dce->dce_ifindex == ifindex) { 375 mutex_enter(&dce->dce_lock); 376 if (!DCE_IS_CONDEMNED(dce)) { 377 dce_refhold(dce); 378 if (generationp != NULL) 379 *generationp = dce->dce_generation; 380 mutex_exit(&dce->dce_lock); 381 rw_exit(&dcb->dcb_lock); 382 return (dce); 383 } 384 mutex_exit(&dce->dce_lock); 385 } 386 } 387 rw_exit(&dcb->dcb_lock); 388 /* Not found */ 389 dce = ipst->ips_dce_default; 390 dce_refhold(dce); 391 return (dce); 392 } 393 394 /* 395 * Atomically looks for a non-default DCE, and if not found tries to create one. 396 * If there is no memory it returns NULL. 397 * When an entry is created we increase the generation number on 398 * the default DCE so that conn_ip_output will detect there is a new DCE. 
399 */ 400 dce_t * 401 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) 402 { 403 uint_t hash; 404 dcb_t *dcb; 405 dce_t *dce; 406 407 hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); 408 dcb = &ipst->ips_dce_hash_v4[hash]; 409 rw_enter(&dcb->dcb_lock, RW_WRITER); 410 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { 411 if (dce->dce_v4addr == dst) { 412 mutex_enter(&dce->dce_lock); 413 if (!DCE_IS_CONDEMNED(dce)) { 414 dce_refhold(dce); 415 mutex_exit(&dce->dce_lock); 416 rw_exit(&dcb->dcb_lock); 417 return (dce); 418 } 419 mutex_exit(&dce->dce_lock); 420 } 421 } 422 dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP); 423 if (dce == NULL) { 424 rw_exit(&dcb->dcb_lock); 425 return (NULL); 426 } 427 bzero(dce, sizeof (dce_t)); 428 dce->dce_ipst = ipst; /* No netstack_hold */ 429 dce->dce_v4addr = dst; 430 dce->dce_generation = DCE_GENERATION_INITIAL; 431 dce->dce_ipversion = IPV4_VERSION; 432 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64()); 433 dce_refhold(dce); /* For the hash list */ 434 435 /* Link into list */ 436 if (dcb->dcb_dce != NULL) 437 dcb->dcb_dce->dce_ptpn = &dce->dce_next; 438 dce->dce_next = dcb->dcb_dce; 439 dce->dce_ptpn = &dcb->dcb_dce; 440 dcb->dcb_dce = dce; 441 dce->dce_bucket = dcb; 442 dce_refhold(dce); /* For the caller */ 443 rw_exit(&dcb->dcb_lock); 444 445 /* Initialize dce_ident to be different than for the last packet */ 446 dce->dce_ident = ipst->ips_dce_default->dce_ident + 1; 447 448 dce_increment_generation(ipst->ips_dce_default); 449 return (dce); 450 } 451 452 /* 453 * Atomically looks for a non-default DCE, and if not found tries to create one. 454 * If there is no memory it returns NULL. 455 * When an entry is created we increase the generation number on 456 * the default DCE so that conn_ip_output will detect there is a new DCE. 457 * ifindex should only be used with link-local addresses. 
458 */ 459 dce_t * 460 dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst) 461 { 462 uint_t hash; 463 dcb_t *dcb; 464 dce_t *dce; 465 466 /* We should not create entries for link-locals w/o an ifindex */ 467 ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0); 468 469 hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize); 470 dcb = &ipst->ips_dce_hash_v6[hash]; 471 rw_enter(&dcb->dcb_lock, RW_WRITER); 472 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { 473 if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) && 474 dce->dce_ifindex == ifindex) { 475 mutex_enter(&dce->dce_lock); 476 if (!DCE_IS_CONDEMNED(dce)) { 477 dce_refhold(dce); 478 mutex_exit(&dce->dce_lock); 479 rw_exit(&dcb->dcb_lock); 480 return (dce); 481 } 482 mutex_exit(&dce->dce_lock); 483 } 484 } 485 486 dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP); 487 if (dce == NULL) { 488 rw_exit(&dcb->dcb_lock); 489 return (NULL); 490 } 491 bzero(dce, sizeof (dce_t)); 492 dce->dce_ipst = ipst; /* No netstack_hold */ 493 dce->dce_v6addr = *dst; 494 dce->dce_ifindex = ifindex; 495 dce->dce_generation = DCE_GENERATION_INITIAL; 496 dce->dce_ipversion = IPV6_VERSION; 497 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64()); 498 dce_refhold(dce); /* For the hash list */ 499 500 /* Link into list */ 501 if (dcb->dcb_dce != NULL) 502 dcb->dcb_dce->dce_ptpn = &dce->dce_next; 503 dce->dce_next = dcb->dcb_dce; 504 dce->dce_ptpn = &dcb->dcb_dce; 505 dcb->dcb_dce = dce; 506 dce->dce_bucket = dcb; 507 atomic_add_32(&dcb->dcb_cnt, 1); 508 dce_refhold(dce); /* For the caller */ 509 rw_exit(&dcb->dcb_lock); 510 511 /* Initialize dce_ident to be different than for the last packet */ 512 dce->dce_ident = ipst->ips_dce_default->dce_ident + 1; 513 dce_increment_generation(ipst->ips_dce_default); 514 return (dce); 515 } 516 517 /* 518 * Set/update uinfo. Creates a per-destination dce if none exists. 519 * 520 * Note that we do not bump the generation number here. 521 * New connections will find the new uinfo. 522 * 523 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd. 524 */ 525 static void 526 dce_setuinfo(dce_t *dce, iulp_t *uinfo) 527 { 528 /* 529 * Update the round trip time estimate and/or the max frag size 530 * and/or the slow start threshold. 531 * 532 * We serialize multiple advises using dce_lock. 533 */ 534 mutex_enter(&dce->dce_lock); 535 /* Gard against setting to zero */ 536 if (uinfo->iulp_rtt != 0) { 537 /* 538 * If there is no old cached values, initialize them 539 * conservatively. Set them to be (1.5 * new value). 
540 */ 541 if (dce->dce_uinfo.iulp_rtt != 0) { 542 dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt + 543 uinfo->iulp_rtt) >> 1; 544 } else { 545 dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt + 546 (uinfo->iulp_rtt >> 1); 547 } 548 if (dce->dce_uinfo.iulp_rtt_sd != 0) { 549 dce->dce_uinfo.iulp_rtt_sd = 550 (dce->dce_uinfo.iulp_rtt_sd + 551 uinfo->iulp_rtt_sd) >> 1; 552 } else { 553 dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd + 554 (uinfo->iulp_rtt_sd >> 1); 555 } 556 } 557 if (uinfo->iulp_mtu != 0) { 558 if (dce->dce_flags & DCEF_PMTU) { 559 dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu); 560 } else { 561 dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET); 562 dce->dce_flags |= DCEF_PMTU; 563 } 564 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64()); 565 } 566 if (uinfo->iulp_ssthresh != 0) { 567 if (dce->dce_uinfo.iulp_ssthresh != 0) 568 dce->dce_uinfo.iulp_ssthresh = 569 (uinfo->iulp_ssthresh + 570 dce->dce_uinfo.iulp_ssthresh) >> 1; 571 else 572 dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh; 573 } 574 /* We have uinfo for sure */ 575 dce->dce_flags |= DCEF_UINFO; 576 mutex_exit(&dce->dce_lock); 577 } 578 579 580 int 581 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst) 582 { 583 dce_t *dce; 584 585 dce = dce_lookup_and_add_v4(dst, ipst); 586 if (dce == NULL) 587 return (ENOMEM); 588 589 dce_setuinfo(dce, uinfo); 590 dce_refrele(dce); 591 return (0); 592 } 593 594 int 595 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo, 596 ip_stack_t *ipst) 597 { 598 dce_t *dce; 599 600 dce = dce_lookup_and_add_v6(dst, ifindex, ipst); 601 if (dce == NULL) 602 return (ENOMEM); 603 604 dce_setuinfo(dce, uinfo); 605 dce_refrele(dce); 606 return (0); 607 } 608 609 /* Common routine for IPv4 and IPv6 */ 610 int 611 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo, 612 ip_stack_t *ipst) 613 { 614 ipaddr_t dst4; 615 616 if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) { 617 IN6_V4MAPPED_TO_IPADDR(dst, dst4); 618 return (dce_update_uinfo_v4(dst4, uinfo, ipst)); 619 } else { 620 return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst)); 621 } 622 } 623 624 static void 625 dce_make_condemned(dce_t *dce) 626 { 627 ip_stack_t *ipst = dce->dce_ipst; 628 629 mutex_enter(&dce->dce_lock); 630 ASSERT(!DCE_IS_CONDEMNED(dce)); 631 dce->dce_generation = DCE_GENERATION_CONDEMNED; 632 mutex_exit(&dce->dce_lock); 633 /* Count how many condemned dces for kmem_cache callback */ 634 atomic_add_32(&ipst->ips_num_dce_condemned, 1); 635 } 636 637 /* 638 * Increment the generation avoiding the special condemned value 639 */ 640 void 641 dce_increment_generation(dce_t *dce) 642 { 643 uint_t generation; 644 645 mutex_enter(&dce->dce_lock); 646 if (!DCE_IS_CONDEMNED(dce)) { 647 generation = dce->dce_generation + 1; 648 if (generation == DCE_GENERATION_CONDEMNED) 649 generation = DCE_GENERATION_INITIAL; 650 ASSERT(generation != DCE_GENERATION_VERIFY); 651 dce->dce_generation = generation; 652 } 653 mutex_exit(&dce->dce_lock); 654 } 655 656 /* 657 * Increment the generation number on all dces that have a path MTU and 658 * the default DCE. Used when ill_mtu changes. 
659 */ 660 void 661 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst) 662 { 663 int i; 664 dcb_t *dcb; 665 dce_t *dce; 666 667 for (i = 0; i < ipst->ips_dce_hashsize; i++) { 668 if (isv6) 669 dcb = &ipst->ips_dce_hash_v6[i]; 670 else 671 dcb = &ipst->ips_dce_hash_v4[i]; 672 rw_enter(&dcb->dcb_lock, RW_WRITER); 673 for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { 674 if (DCE_IS_CONDEMNED(dce)) 675 continue; 676 dce_increment_generation(dce); 677 } 678 rw_exit(&dcb->dcb_lock); 679 } 680 dce_increment_generation(ipst->ips_dce_default); 681 } 682 683 /* 684 * Caller needs to do a dce_refrele since we can't do the 685 * dce_refrele under dcb_lock. 686 */ 687 static void 688 dce_delete_locked(dcb_t *dcb, dce_t *dce) 689 { 690 dce->dce_bucket = NULL; 691 *dce->dce_ptpn = dce->dce_next; 692 if (dce->dce_next != NULL) 693 dce->dce_next->dce_ptpn = dce->dce_ptpn; 694 dce->dce_ptpn = NULL; 695 dce->dce_next = NULL; 696 atomic_add_32(&dcb->dcb_cnt, -1); 697 dce_make_condemned(dce); 698 } 699 700 static void 701 dce_inactive(dce_t *dce) 702 { 703 ip_stack_t *ipst = dce->dce_ipst; 704 705 ASSERT(!(dce->dce_flags & DCEF_DEFAULT)); 706 ASSERT(dce->dce_ptpn == NULL); 707 ASSERT(dce->dce_bucket == NULL); 708 709 /* Count how many condemned dces for kmem_cache callback */ 710 if (DCE_IS_CONDEMNED(dce)) 711 atomic_add_32(&ipst->ips_num_dce_condemned, -1); 712 713 kmem_cache_free(dce_cache, dce); 714 } 715 716 void 717 dce_refrele(dce_t *dce) 718 { 719 ASSERT(dce->dce_refcnt != 0); 720 if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0) 721 dce_inactive(dce); 722 } 723 724 void 725 dce_refhold(dce_t *dce) 726 { 727 atomic_add_32(&dce->dce_refcnt, 1); 728 ASSERT(dce->dce_refcnt != 0); 729 } 730 731 /* No tracing support yet hence the same as the above functions */ 732 void 733 dce_refrele_notr(dce_t *dce) 734 { 735 ASSERT(dce->dce_refcnt != 0); 736 if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0) 737 dce_inactive(dce); 738 } 739 740 void 741 dce_refhold_notr(dce_t *dce) 742 { 743 atomic_add_32(&dce->dce_refcnt, 1); 744 ASSERT(dce->dce_refcnt != 0); 745 } 746 747 /* Report both the IPv4 and IPv6 DCEs. 
 */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
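 * Only the IPv6 table needs this, since only IPv6 DCEs record an ifindex.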
853 */ 854 void 855 dce_cleanup(uint_t ifindex, ip_stack_t *ipst) 856 { 857 uint_t i; 858 dcb_t *dcb; 859 dce_t *dce, *nextdce; 860 861 for (i = 0; i < ipst->ips_dce_hashsize; i++) { 862 dcb = &ipst->ips_dce_hash_v6[i]; 863 rw_enter(&dcb->dcb_lock, RW_WRITER); 864 865 for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) { 866 nextdce = dce->dce_next; 867 if (dce->dce_ifindex == ifindex) { 868 dce_delete_locked(dcb, dce); 869 dce_refrele(dce); 870 } 871 } 872 rw_exit(&dcb->dcb_lock); 873 } 874 } 875