/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 */
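
/*
 * Lock ordering note (inferred from the code below, not from a formal
 * specification): the bucket lock (dcb_t.dcb_lock) is always acquired
 * before the per-entry dce_lock, never the other way around.
 */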

/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;

/* Operates on a uint64_t */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of the DCEs in the bucket.
 * For now we have a higher probability to delete DCEs without DCEF_PMTU:
 * with reclaim fraction f, roughly one in f entries without a path MTU
 * (and one in 4f with one) is deleted per pass.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * kmem_cache callback to free up memory.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_dce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		ip_dce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}

void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}

/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we use the IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t *dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
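
/*
 * A minimal caller sketch (illustrative, not lifted verbatim from an
 * actual caller): the transmit path refreshes a cached entry when its
 * generation number is stale, along the lines of the example at the
 * top of this file:
 *
 *	if (ixa->ixa_dce_generation != ixa->ixa_dce->dce_generation) {
 *		dce_refrele(ixa->ixa_dce);
 *		ixa->ixa_dce = dce_lookup_pkt(mp, ixa,
 *		    &ixa->ixa_dce_generation);
 *	}
 */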

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache the result, e.g., in the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}

/*
 * Used by callers that need to cache the result, e.g., in the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Used by callers that need to cache the result, e.g., in the datapath.
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}
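
/*
 * Note: unlike dce_lookup_and_add_*() below, the lookup functions
 * above never return NULL; on a miss they fall back to the default
 * DCE, with a reference held for the caller.
 */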

/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	/* Keep dcb_cnt in sync; dce_delete_locked() decrements it */
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
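
/*
 * Callers must be prepared for allocation failure from these
 * functions; for example, dce_update_uinfo_v4() below maps a NULL
 * return to ENOMEM.
 */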

/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different than for the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
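
/*
 * The smoothing in dce_setuinfo() below is a simple running average:
 * an existing estimate is averaged with the new sample
 * ((old + new) / 2, via a right shift), while a first sample is
 * inflated to 1.5 times its value as a conservative initial estimate.
 * For example, an old rtt of 100 and a new sample of 60 yield 80,
 * while a first sample of 60 is stored as 90.
 */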

/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp and sctp, using iulp_t) is to set rtt and rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively: set them to 1.5 times the new value.
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}

int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}

static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}
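
/*
 * Two generation values are reserved and never produced by the
 * increment above: DCE_GENERATION_CONDEMNED (skipped explicitly) and
 * DCE_GENERATION_VERIFY (asserted against), so callers can rely on
 * them as sentinels.
 */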

/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}

static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}

void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}
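
/*
 * Reference counting sketch: the hash bucket holds one reference
 * (taken in dce_lookup_and_add_*()) and each caller holds its own.
 * dce_inactive() runs only on the last dce_refrele(), so a condemned
 * DCE lingers until every cached reference has been dropped.
 */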

/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale entries that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;
	dcb_t	*dcb;
	dce_t	*dce, *nextdce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);

		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
			nextdce = dce->dce_next;
			if (dce->dce_ifindex == ifindex) {
				dce_delete_locked(dcb, dce);
				dce_refrele(dce);
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
}