/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright 2017, OmniTI Computer Consulting, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
 * That entry holds both the IP ident value and the dce generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 *
 * DCEs can remain in the cache for an arbitrarily long time; they are only
 * removed once memory pressure or an overly deep hash bucket (see
 * dce_lookup_and_add*()) prompts the reclaim thread to prune them.
 */
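/*
 * Concrete (hypothetical) version of the caching pattern sketched above,
 * left uncalled and shown for exposition only.  The my_cache_t structure
 * and my_cache_refresh() function are invented; dce_lookup_pkt(),
 * dce_refrele() and dce_generation are the real interfaces.
 */
typedef struct my_cache_s {
	dce_t	*mc_dce;		/* cached entry, held */
	uint_t	mc_dce_generation;	/* generation seen at lookup time */
} my_cache_t;

static dce_t *
my_cache_refresh(my_cache_t *mc, mblk_t *mp, ip_xmit_attr_t *ixa)
{
	/*
	 * A deleted entry carries DCE_GENERATION_CONDEMNED, which
	 * dce_increment_generation() never hands out, so this single
	 * comparison also detects condemned entries.
	 */
	if (mc->mc_dce == NULL ||
	    mc->mc_dce_generation != mc->mc_dce->dce_generation) {
		if (mc->mc_dce != NULL)
			dce_refrele(mc->mc_dce);
		mc->mc_dce = dce_lookup_pkt(mp, ixa,
		    &mc->mc_dce_generation);
	}
	return (mc->mc_dce);
}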
/*
 * Hash bucket structure for DCEs
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;
static kthread_t *dce_reclaim_thread;
static kmutex_t dce_reclaim_lock;
static kcondvar_t dce_reclaim_cv;
static int dce_reclaim_shutdown;

/* Global so it can be tuned in /etc/system. This must be a power of two. */
uint_t ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t ip_dce_reclaim_interval = 60;

/* The factor of the DCE threshold at which to start hard reclaims */
uint_t ip_dce_reclaim_threshold_hard = 2;

/* Operates on a uint64_t */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of the DCEs in the bucket.
 * For now, DCEs without DCEF_PMTU set are deleted with a higher
 * probability than those with a learned path MTU.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction * 4;
	uint_t	hash;
	dce_t	*dce, *nextdce;
	hrtime_t seed = gethrtime();
	uint_t	retained = 0;
	uint_t	max = ipst->ips_ip_dce_reclaim_threshold;

	max *= ip_dce_reclaim_threshold_hard;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) -
		    dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}

		if (max == 0 || retained < max) {
			hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));

			if (dce->dce_flags & DCEF_PMTU) {
				if (hash % fraction_pmtu != 0) {
					retained++;
					continue;
				}
			} else {
				if (hash % fraction != 0) {
					retained++;
					continue;
				}
			}
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}
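/*
 * Worked example (editorial, illustrative values): with
 * ips_ip_dce_reclaim_fraction == 8, one pass through dcb_reclaim() deletes
 * roughly 1 in 8 of the entries without DCEF_PMTU (hash % 8 == 0) but only
 * 1 in 32 of those with a learned path MTU, since fraction_pmtu is
 * fraction * 4.  The probabilistic retention only applies while fewer than
 * ips_ip_dce_reclaim_threshold * ip_dce_reclaim_threshold_hard entries
 * have been retained; past that point every remaining entry in the bucket
 * is deleted unconditionally.
 */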
/*
 * Free up memory by reclaiming DCEs from one IP stack.  Called from
 * ip_dce_reclaim() below once the stack has flagged that a reclaim pass
 * is needed.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by dce_reclaim_worker() below, and no one else.  Typically this
 * means that the number of entries in one of the hash buckets has exceeded
 * a tunable threshold.
 */
static void
ip_dce_reclaim(void)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	ASSERT(curthread == dce_reclaim_thread);

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
			ip_dce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

/* ARGSUSED */
static void
dce_reclaim_worker(void *arg)
{
	callb_cpr_t	cprinfo;

	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
	    "dce_reclaim_worker");

	mutex_enter(&dce_reclaim_lock);
	while (!dce_reclaim_shutdown) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

		if (dce_reclaim_shutdown)
			break;

		mutex_exit(&dce_reclaim_lock);
		ip_dce_reclaim();
		mutex_enter(&dce_reclaim_lock);
	}

	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
	dce_reclaim_thread = NULL;
	dce_reclaim_shutdown = 0;
	cv_broadcast(&dce_reclaim_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops the lock */

	thread_exit();
}

void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);

	dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
	    NULL, 0, &p0, TS_RUN, minclsyspri);
}

void
dce_g_destroy(void)
{
	mutex_enter(&dce_reclaim_lock);
	dce_reclaim_shutdown = 1;
	cv_signal(&dce_reclaim_cv);
	while (dce_reclaim_thread != NULL)
		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
	mutex_exit(&dce_reclaim_lock);

	cv_destroy(&dce_reclaim_cv);
	mutex_destroy(&dce_reclaim_lock);

	kmem_cache_destroy(dce_cache);
}

/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time =
	    TICK_TO_SEC(ddi_get_lbolt64());
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we use the IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = ip_dce_hash_size;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}
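/*
 * Tuning example (editorial): ip_dce_hash_size above can be overridden
 * from /etc/system before the ip module loads, e.g.
 *
 *	set ip:ip_dce_hash_size = 2048
 *
 * The value must remain a power of two, as required by the IRE_ADDR_HASH()
 * macros used by the lookup routines below.
 */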
/*
 * Given a DCE hash bucket, unlink DCE entries from it.  Some callers need
 * ifindex-specific matching, others don't.  Rather than overloading ifindex
 * to indicate specificity, callers state it explicitly with the
 * specific_ifindex argument.
 */
static void
dce_bucket_clean(dcb_t *dcb, boolean_t specific_ifindex, uint_t ifindex)
{
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);

	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		if ((!specific_ifindex) || dce->dce_ifindex == ifindex) {
			dce_delete_locked(dcb, dce);
			dce_refrele(dce);
		}
	}

	rw_exit(&dcb->dcb_lock);
}

void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dce_bucket_clean(&ipst->ips_dce_hash_v4[i], B_FALSE, 0);
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_FALSE, 0);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t *dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}
/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t	hash;
	dcb_t	*dcb;
	dce_t	*dce;

	/*
	 * Set *generationp before dropping the lock(s) that allow additions.
	 * If an entry for dst is added after this point, the addition bumps
	 * the default DCE's generation, so a caller caching this value will
	 * notice and look up again.
	 */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Used by callers that need to cache the result, e.g., the datapath.
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t	hash;
	dcb_t	*dcb;
	dce_t	*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one.  If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t	hash;
	dcb_t	*dcb;
	dce_t	*dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_inc_32(&dcb->dcb_cnt);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}
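/*
 * Editorial note on the "Link into list" idiom above: dce_ptpn always
 * points at whatever pointer currently points to this DCE -- either the
 * bucket head (dcb_dce) or the previous entry's dce_next.  Keeping that
 * pointer-to-pointer around lets dce_delete_locked() unlink an entry with
 * a single *dce->dce_ptpn = dce->dce_next, without walking the bucket to
 * find the predecessor.
 */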
/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one.  If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	hash;
	dcb_t	*dcb;
	dce_t	*dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/*
	 * Assuming that we get fairly even distribution across all of the
	 * buckets, once one bucket is overly full, prune the whole cache.
	 */
	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_inc_32(&dcb->dcb_cnt);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Set/update uinfo.  Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively: set them to 1.5 times the new value.
		 * Otherwise average the old and new values.
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}
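/*
 * Worked example for dce_setuinfo() (editorial, illustrative values): a
 * first advise of iulp_rtt == 100 finds no cached value and stores
 * 100 + (100 >> 1) == 150, i.e. a conservative 1.5x the sample.  A later
 * advise of 80 is then averaged in as (150 + 80) >> 1 == 115.
 * iulp_rtt_sd is smoothed the same way; iulp_ssthresh is also averaged,
 * but its first sample is stored as-is.
 */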
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}
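/*
 * Illustrative sketch (not part of the original file, left uncalled): how
 * a ULP such as tcp might persist its RTT estimate for a peer via
 * dce_update_uinfo().  The function name is invented, and conn_faddr_v6 is
 * assumed to hold the (possibly v4-mapped) foreign address.
 */
static void
example_record_rtt(conn_t *connp, uint_t rtt, uint_t rtt_sd, ip_stack_t *ipst)
{
	iulp_t uinfo;

	bzero(&uinfo, sizeof (uinfo));
	uinfo.iulp_rtt = rtt;
	uinfo.iulp_rtt_sd = rtt_sd;
	/* ifindex 0: the destination is assumed not to be link-local */
	(void) dce_update_uinfo(&connp->conn_faddr_v6, 0, &uinfo, ipst);
}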
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_dce_condemned);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t	generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}

/*
 * Increment the generation number on all dces that have a path MTU and
 * on the default DCE.  Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int	i;
	dcb_t	*dcb;
	dce_t	*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_dec_32(&dcb->dcb_cnt);
	dce_make_condemned(dce);
}

static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_dec_32(&ipst->ips_num_dce_condemned);

	kmem_cache_free(dce_cache, dce);
}

void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_inc_32(&dce->dce_refcnt);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet, hence the same as the functions above */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_inc_32(&dce->dce_refcnt);
	ASSERT(dce->dce_refcnt != 0);
}
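/*
 * Illustrative sketch (not part of the original file, left uncalled): the
 * hold/release discipline expected of lookup callers.  Every dce_lookup_*
 * and dce_get_default() call returns a held entry, so each must be
 * balanced by a dce_refrele() once the caller is done with the pointer.
 */
static void
example_hold_release(ip_stack_t *ipst)
{
	dce_t *dce = dce_get_default(ipst);	/* returned held */

	/* ... read-only use of dce fields goes here ... */
	dce_refrele(dce);	/* frees the entry if this was the last hold */
}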
/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(ddi_get_lbolt64());

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale entries that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;

	for (i = 0; i < ipst->ips_dce_hashsize; i++)
		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_TRUE, ifindex);
}