/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

#include <sys/sunddi.h>

/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 */

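/*
 * Callers of dce_lookup_pkt(), dce_lookup_v4() and dce_lookup_v6() get back
 * a DCE with a reference held (the default DCE when no per-destination
 * entry exists) and must drop that reference with dce_refrele() when done.
 * A sketch of a hypothetical caller (default_mtu is an illustrative name,
 * not a real variable):
 *
 *	dce = dce_lookup_v4(dst, ipst, &generation);
 *	if (dce->dce_flags & DCEF_PMTU)
 *		mtu = dce->dce_pmtu;
 *	else
 *		mtu = default_mtu;
 *	dce_refrele(dce);
 */
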
/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;
static kthread_t *dce_reclaim_thread;
static kmutex_t dce_reclaim_lock;
static kcondvar_t dce_reclaim_cv;
static int dce_reclaim_shutdown;

/* Global so it can be tuned in /etc/system. This must be a power of two. */
uint_t ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t ip_dce_reclaim_interval = 60;

/* The factor of the DCE threshold at which to start hard reclaims */
uint_t ip_dce_reclaim_threshold_hard = 2;

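/*
 * For example, assuming the usual /etc/system syntax for setting a variable
 * in the "ip" module, the hash size could be doubled with a line such as:
 *
 *	set ip:ip_dce_hash_size = 2048
 *
 * (It must remain a power of two.)
 */
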
/* Operates on a uint64_t */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of the DCEs in the bucket.
 * For now, DCEs without DCEF_PMTU set are deleted with higher probability.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;
	hrtime_t seed = gethrtime();
	uint_t	retained = 0;
	uint_t	max = ipst->ips_ip_dce_reclaim_threshold;

	max *= ip_dce_reclaim_threshold_hard;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}

		if (max == 0 || retained < max) {
			hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));

			if (dce->dce_flags & DCEF_PMTU) {
				if (hash % fraction_pmtu != 0) {
					retained++;
					continue;
				}
			} else {
				if (hash % fraction != 0) {
					retained++;
					continue;
				}
			}
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * Free up memory for one IP stack by pruning its DCE hash buckets.
 * Called from ip_dce_reclaim() below.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by dce_reclaim_worker() below, and no one else. Typically this will
 * mean that the number of entries in the hash buckets has exceeded a tunable
 * threshold.
 */
static void
ip_dce_reclaim(void)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	ASSERT(curthread == dce_reclaim_thread);

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
			ip_dce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

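/*
 * Reclaim worker thread, started from dce_g_init(). It sleeps for
 * ip_dce_reclaim_interval seconds (or until cv_signal()ed at shutdown) and
 * then calls ip_dce_reclaim() to service any IP stack that has set
 * ips_dce_reclaim_needed.
 */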
/* ARGSUSED */
static void
dce_reclaim_worker(void *arg)
{
	callb_cpr_t	cprinfo;

	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
	    "dce_reclaim_worker");

	mutex_enter(&dce_reclaim_lock);
	while (!dce_reclaim_shutdown) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

		if (dce_reclaim_shutdown)
			break;

		mutex_exit(&dce_reclaim_lock);
		ip_dce_reclaim();
		mutex_enter(&dce_reclaim_lock);
	}

	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
	dce_reclaim_thread = NULL;
	dce_reclaim_shutdown = 0;
	cv_broadcast(&dce_reclaim_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops the lock */

	thread_exit();
}

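/*
 * Global (not per-netstack) initialization: create the dce kmem cache and
 * start the reclaim worker thread.
 */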
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);

	dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}

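/*
 * Global teardown: tell the reclaim worker thread to exit, wait until it
 * has done so, then destroy the synchronization objects and the kmem cache.
 */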
void
dce_g_destroy(void)
{
	mutex_enter(&dce_reclaim_lock);
	dce_reclaim_shutdown = 1;
	cv_signal(&dce_reclaim_cv);
	while (dce_reclaim_thread != NULL)
		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
	mutex_exit(&dce_reclaim_lock);

	cv_destroy(&dce_reclaim_cv);
	mutex_destroy(&dce_reclaim_lock);

	kmem_cache_destroy(dce_cache);
}

/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = ip_dce_hash_size;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t *dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}

/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one. If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one. If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively: set them to 1.5 times the new value.
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}

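/*
 * Update (creating the DCE if needed) the cached uinfo for an IPv4 or IPv6
 * destination. Thin wrappers around dce_lookup_and_add_*() and
 * dce_setuinfo(); they return ENOMEM if no DCE could be allocated.
 */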
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}

static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}

/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int i;
	dcb_t *dcb;
	dce_t *dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}

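/*
 * The last reference has gone away; free the entry. Never called for the
 * default DCE since that always retains a reference.
 */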
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}

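/*
 * Reference counting. dce_refrele() frees a (non-default) DCE via
 * dce_inactive() when the last reference is dropped.
 */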
void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet, hence these are the same as the functions above */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale entries that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;
	dcb_t	*dcb;
	dce_t	*dce, *nextdce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);

		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
			nextdce = dce->dce_next;
			if (dce->dce_ifindex == ifindex) {
				dce_delete_locked(dcb, dce);
				dce_refrele(dce);
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
}