xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_dce.c (revision 202ca9ae460faf1825ede303c46abd4e1f6cee28)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25  * Copyright 2017, OmniTI Computer Consulting, Inc. All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/zone.h>
32 #include <sys/ddi.h>
33 #include <sys/disp.h>
34 #include <sys/sunddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/atomic.h>
38 #include <sys/callb.h>
39 #define	_SUN_TPI_VERSION 2
40 #include <sys/tihdr.h>
41 
42 #include <inet/common.h>
43 #include <inet/mi.h>
44 #include <inet/mib2.h>
45 #include <inet/snmpcom.h>
46 
47 #include <netinet/ip6.h>
48 #include <netinet/icmp6.h>
49 
50 #include <inet/ip.h>
51 #include <inet/ip_impl.h>
52 #include <inet/ip6.h>
53 #include <inet/ip6_asp.h>
54 #include <inet/ip_multi.h>
55 #include <inet/ip_if.h>
56 #include <inet/ip_ire.h>
57 #include <inet/ip_ftable.h>
58 #include <inet/ip_rts.h>
59 #include <inet/ip_ndp.h>
60 #include <inet/ipclassifier.h>
61 #include <inet/ip_listutils.h>
62 
63 #include <sys/sunddi.h>
64 
65 /*
66  * Routines for handling destination cache entries.
67  * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
68  * That entry holds both the IP ident value and the dce generation number.
69  *
70  * Any time a DCE is changed significantly (different path MTU, but NOT
71  * different ULP info!), the dce_generation number is increased.
72  * Also, when a new DCE is created, the dce_generation number in the default
73  * DCE is bumped. That allows the dce_t information to be cached efficiently
74  * as long as the entity caching the dce_t also caches the dce_generation,
75  * and compares the cached generation to detect any changes.
76  * Furthermore, when a DCE is deleted, if there are any outstanding references
77  * to the DCE it will be marked as condemned. The condemned mark is
78  * a designated generation number which is never otherwise used, hence
79  * the single comparison with the generation number captures that as well.
80  *
81  * An example of code which caches is as follows:
82  *
83  *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
84  *		The DCE has changed
85  *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
86  *		    &mystruct->my_dce_generation);
87  *		Not needed in practice, since we have the default DCE:
88  *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
89  *			return failure;
90  *	}
91  *
92  * Note that for IPv6 link-local addresses we record the ifindex since the
93  * link-locals are not globally unique.
94  *
95  * DCEs can remain for an arbitrarily long time, until memory pressure or
96  * too-deep hash buckets (see dce_lookup_and_add*()) enable the reclaim thread
97  * to actually remove DCEs from the cache.
98  */
99 
100 /*
101  * Hash bucket structure for DCEs
102  */
103 typedef struct dcb_s {
104 	krwlock_t	dcb_lock;
105 	uint32_t	dcb_cnt;
106 	dce_t		*dcb_dce;
107 } dcb_t;
108 
109 static void	dce_delete_locked(dcb_t *, dce_t *);
110 static void	dce_make_condemned(dce_t *);
111 
112 static kmem_cache_t *dce_cache;
113 static kthread_t *dce_reclaim_thread;
114 static kmutex_t dce_reclaim_lock;
115 static kcondvar_t dce_reclaim_cv;
116 static int dce_reclaim_shutdown;
117 
118 /* Global so it can be tuned in /etc/system. This must be a power of two. */
119 uint_t ip_dce_hash_size = 1024;
120 
121 /* The time in seconds between executions of the IP DCE reclaim worker. */
122 uint_t ip_dce_reclaim_interval = 60;
123 
124 /* The factor of the DCE threshold at which to start hard reclaims */
125 uint_t ip_dce_reclaim_threshold_hard = 2;
126 
/*
 * XOR a uint64_t with itself shifted right by 16, 32 and 48 bits, so that
 * the low 16 bits of the result depend on every 16-bit quarter of the input.
 * The argument is evaluated several times and must have no side effects.
 */
#define	RANDOM_HASH(p)	\
	((p) ^ ((p) >> 16) ^ ((p) >> 32) ^ ((p) >> 48))
129 
130 /*
131  * Reclaim a fraction of dce's in the dcb.
132  * For now we have a higher probability to delete DCEs without DCE_PMTU.
133  */
134 static void
135 dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
136 {
137 	uint_t	fraction_pmtu = fraction*4;
138 	uint_t	hash;
139 	dce_t	*dce, *nextdce;
140 	hrtime_t seed = gethrtime();
141 	uint_t	retained = 0;
142 	uint_t	max = ipst->ips_ip_dce_reclaim_threshold;
143 
144 	max *= ip_dce_reclaim_threshold_hard;
145 
146 	rw_enter(&dcb->dcb_lock, RW_WRITER);
147 	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
148 		nextdce = dce->dce_next;
149 		/* Clear DCEF_PMTU if the pmtu is too old */
150 		mutex_enter(&dce->dce_lock);
151 		if ((dce->dce_flags & DCEF_PMTU) &&
152 		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
153 		    ipst->ips_ip_pathmtu_interval) {
154 			dce->dce_flags &= ~DCEF_PMTU;
155 			mutex_exit(&dce->dce_lock);
156 			dce_increment_generation(dce);
157 		} else {
158 			mutex_exit(&dce->dce_lock);
159 		}
160 
161 		if (max == 0 || retained < max) {
162 			hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));
163 
164 			if (dce->dce_flags & DCEF_PMTU) {
165 				if (hash % fraction_pmtu != 0) {
166 					retained++;
167 					continue;
168 				}
169 			} else {
170 				if (hash % fraction != 0) {
171 					retained++;
172 					continue;
173 				}
174 			}
175 		}
176 
177 		IP_STAT(ipst, ip_dce_reclaim_deleted);
178 		dce_delete_locked(dcb, dce);
179 		dce_refrele(dce);
180 	}
181 	rw_exit(&dcb->dcb_lock);
182 }
183 
184 /*
185  * kmem_cache callback to free up memory.
186  *
187  */
188 static void
189 ip_dce_reclaim_stack(ip_stack_t *ipst)
190 {
191 	int	i;
192 
193 	IP_STAT(ipst, ip_dce_reclaim_calls);
194 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
195 		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
196 		    ipst->ips_ip_dce_reclaim_fraction);
197 
198 		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
199 		    ipst->ips_ip_dce_reclaim_fraction);
200 	}
201 
202 	/*
203 	 * Walk all CONNs that can have a reference on an ire, nce or dce.
204 	 * Get them to update any stale references to drop any refholds they
205 	 * have.
206 	 */
207 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
208 }
209 
210 /*
211  * Called by dce_reclaim_worker() below, and no one else.  Typically this will
212  * mean that the number of entries in the hash buckets has exceeded a tunable
213  * threshold.
214  */
215 static void
216 ip_dce_reclaim(void)
217 {
218 	netstack_handle_t nh;
219 	netstack_t *ns;
220 	ip_stack_t *ipst;
221 
222 	ASSERT(curthread == dce_reclaim_thread);
223 
224 	netstack_next_init(&nh);
225 	while ((ns = netstack_next(&nh)) != NULL) {
226 		/*
227 		 * netstack_next() can return a netstack_t with a NULL
228 		 * netstack_ip at boot time.
229 		 */
230 		if ((ipst = ns->netstack_ip) == NULL) {
231 			netstack_rele(ns);
232 			continue;
233 		}
234 		if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
235 			ip_dce_reclaim_stack(ipst);
236 		netstack_rele(ns);
237 	}
238 	netstack_next_fini(&nh);
239 }
240 
241 /* ARGSUSED */
242 static void
243 dce_reclaim_worker(void *arg)
244 {
245 	callb_cpr_t	cprinfo;
246 
247 	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
248 	    "dce_reclaim_worker");
249 
250 	mutex_enter(&dce_reclaim_lock);
251 	while (!dce_reclaim_shutdown) {
252 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
253 		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
254 		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
255 		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);
256 
257 		if (dce_reclaim_shutdown)
258 			break;
259 
260 		mutex_exit(&dce_reclaim_lock);
261 		ip_dce_reclaim();
262 		mutex_enter(&dce_reclaim_lock);
263 	}
264 
265 	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
266 	dce_reclaim_thread = NULL;
267 	dce_reclaim_shutdown = 0;
268 	cv_broadcast(&dce_reclaim_cv);
269 	CALLB_CPR_EXIT(&cprinfo);	/* drops the lock */
270 
271 	thread_exit();
272 }
273 
274 void
275 dce_g_init(void)
276 {
277 	dce_cache = kmem_cache_create("dce_cache",
278 	    sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
279 
280 	mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
281 	cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);
282 
283 	dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
284 	    NULL, 0, &p0, TS_RUN, minclsyspri);
285 }
286 
287 void
288 dce_g_destroy(void)
289 {
290 	mutex_enter(&dce_reclaim_lock);
291 	dce_reclaim_shutdown = 1;
292 	cv_signal(&dce_reclaim_cv);
293 	while (dce_reclaim_thread != NULL)
294 		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
295 	mutex_exit(&dce_reclaim_lock);
296 
297 	cv_destroy(&dce_reclaim_cv);
298 	mutex_destroy(&dce_reclaim_lock);
299 
300 	kmem_cache_destroy(dce_cache);
301 }
302 
303 /*
304  * Allocate a default DCE and a hash table for per-IP address DCEs
305  */
306 void
307 dce_stack_init(ip_stack_t *ipst)
308 {
309 	int	i;
310 
311 	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
312 	bzero(ipst->ips_dce_default, sizeof (dce_t));
313 	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
314 	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
315 	ipst->ips_dce_default->dce_last_change_time =
316 	    TICK_TO_SEC(ddi_get_lbolt64());
317 	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
318 	ipst->ips_dce_default->dce_ipst = ipst;
319 
320 	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
321 	ipst->ips_dce_hashsize = ip_dce_hash_size;
322 	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
323 	    sizeof (dcb_t), KM_SLEEP);
324 	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
325 	    sizeof (dcb_t), KM_SLEEP);
326 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
327 		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
328 		    NULL);
329 		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
330 		    NULL);
331 	}
332 }
333 
334 /*
335  * Given a DCE hash bucket, unlink DCE entries from it. Some callers need
336  * ifindex-specific matching, others don't. Don't overload ifindex to indicate
337  * specificity, just indicate so explicitly.
338  */
339 static void
340 dce_bucket_clean(dcb_t *dcb, boolean_t specific_ifindex, uint_t ifindex)
341 {
342 	dce_t	*dce, *nextdce;
343 
344 	rw_enter(&dcb->dcb_lock, RW_WRITER);
345 
346 	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
347 		nextdce = dce->dce_next;
348 		if ((!specific_ifindex) || dce->dce_ifindex == ifindex) {
349 			dce_delete_locked(dcb, dce);
350 			dce_refrele(dce);
351 		}
352 	}
353 
354 	rw_exit(&dcb->dcb_lock);
355 }
356 
357 void
358 dce_stack_destroy(ip_stack_t *ipst)
359 {
360 	int i;
361 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
362 		dce_bucket_clean(&ipst->ips_dce_hash_v4[i], B_FALSE, 0);
363 		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
364 		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_FALSE, 0);
365 		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
366 	}
367 	kmem_free(ipst->ips_dce_hash_v4,
368 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
369 	ipst->ips_dce_hash_v4 = NULL;
370 	kmem_free(ipst->ips_dce_hash_v6,
371 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
372 	ipst->ips_dce_hash_v6 = NULL;
373 	ipst->ips_dce_hashsize = 0;
374 
375 	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
376 	kmem_cache_free(dce_cache, ipst->ips_dce_default);
377 	ipst->ips_dce_default = NULL;
378 }
379 
380 /* When any DCE is good enough */
381 dce_t *
382 dce_get_default(ip_stack_t *ipst)
383 {
384 	dce_t		*dce;
385 
386 	dce = ipst->ips_dce_default;
387 	dce_refhold(dce);
388 	return (dce);
389 }
390 
391 /*
392  * Generic for IPv4 and IPv6.
393  *
394  * Used by callers that need to cache e.g., the datapath
395  * Returns the generation number in the last argument.
396  */
397 dce_t *
398 dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
399 {
400 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
401 		/*
402 		 * If we have a source route we need to look for the final
403 		 * destination in the source route option.
404 		 */
405 		ipaddr_t final_dst;
406 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
407 
408 		final_dst = ip_get_dst(ipha);
409 		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
410 	} else {
411 		uint_t ifindex;
412 		/*
413 		 * If we have a routing header we need to look for the final
414 		 * destination in the routing extension header.
415 		 */
416 		in6_addr_t final_dst;
417 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
418 
419 		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
420 		ifindex = 0;
421 		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
422 			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
423 			    ill_phyint->phyint_ifindex;
424 		}
425 		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
426 		    generationp));
427 	}
428 }
429 
430 /*
431  * Used by callers that need to cache e.g., the datapath
432  * Returns the generation number in the last argument.
433  */
434 dce_t *
435 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
436 {
437 	uint_t		hash;
438 	dcb_t		*dcb;
439 	dce_t		*dce;
440 
441 	/* Set *generationp before dropping the lock(s) that allow additions */
442 	if (generationp != NULL)
443 		*generationp = ipst->ips_dce_default->dce_generation;
444 
445 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
446 	dcb = &ipst->ips_dce_hash_v4[hash];
447 	rw_enter(&dcb->dcb_lock, RW_READER);
448 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
449 		if (dce->dce_v4addr == dst) {
450 			mutex_enter(&dce->dce_lock);
451 			if (!DCE_IS_CONDEMNED(dce)) {
452 				dce_refhold(dce);
453 				if (generationp != NULL)
454 					*generationp = dce->dce_generation;
455 				mutex_exit(&dce->dce_lock);
456 				rw_exit(&dcb->dcb_lock);
457 				return (dce);
458 			}
459 			mutex_exit(&dce->dce_lock);
460 		}
461 	}
462 	rw_exit(&dcb->dcb_lock);
463 	/* Not found */
464 	dce = ipst->ips_dce_default;
465 	dce_refhold(dce);
466 	return (dce);
467 }
468 
469 /*
470  * Used by callers that need to cache e.g., the datapath
471  * Returns the generation number in the last argument.
472  * ifindex should only be set for link-locals
473  */
474 dce_t *
475 dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
476     uint_t *generationp)
477 {
478 	uint_t		hash;
479 	dcb_t		*dcb;
480 	dce_t		*dce;
481 
482 	/* Set *generationp before dropping the lock(s) that allow additions */
483 	if (generationp != NULL)
484 		*generationp = ipst->ips_dce_default->dce_generation;
485 
486 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
487 	dcb = &ipst->ips_dce_hash_v6[hash];
488 	rw_enter(&dcb->dcb_lock, RW_READER);
489 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
490 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
491 		    dce->dce_ifindex == ifindex) {
492 			mutex_enter(&dce->dce_lock);
493 			if (!DCE_IS_CONDEMNED(dce)) {
494 				dce_refhold(dce);
495 				if (generationp != NULL)
496 					*generationp = dce->dce_generation;
497 				mutex_exit(&dce->dce_lock);
498 				rw_exit(&dcb->dcb_lock);
499 				return (dce);
500 			}
501 			mutex_exit(&dce->dce_lock);
502 		}
503 	}
504 	rw_exit(&dcb->dcb_lock);
505 	/* Not found */
506 	dce = ipst->ips_dce_default;
507 	dce_refhold(dce);
508 	return (dce);
509 }
510 
511 /*
512  * Atomically looks for a non-default DCE, and if not found tries to create one.
513  * If there is no memory it returns NULL.
514  * When an entry is created we increase the generation number on
515  * the default DCE so that conn_ip_output will detect there is a new DCE.
516  */
517 dce_t *
518 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
519 {
520 	uint_t		hash;
521 	dcb_t		*dcb;
522 	dce_t		*dce;
523 
524 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
525 	dcb = &ipst->ips_dce_hash_v4[hash];
526 	/*
527 	 * Assuming that we get fairly even distribution across all of the
528 	 * buckets, once one bucket is overly full, prune the whole cache.
529 	 */
530 	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
531 		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
532 	rw_enter(&dcb->dcb_lock, RW_WRITER);
533 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
534 		if (dce->dce_v4addr == dst) {
535 			mutex_enter(&dce->dce_lock);
536 			if (!DCE_IS_CONDEMNED(dce)) {
537 				dce_refhold(dce);
538 				mutex_exit(&dce->dce_lock);
539 				rw_exit(&dcb->dcb_lock);
540 				return (dce);
541 			}
542 			mutex_exit(&dce->dce_lock);
543 		}
544 	}
545 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
546 	if (dce == NULL) {
547 		rw_exit(&dcb->dcb_lock);
548 		return (NULL);
549 	}
550 	bzero(dce, sizeof (dce_t));
551 	dce->dce_ipst = ipst;	/* No netstack_hold */
552 	dce->dce_v4addr = dst;
553 	dce->dce_generation = DCE_GENERATION_INITIAL;
554 	dce->dce_ipversion = IPV4_VERSION;
555 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
556 	dce_refhold(dce);	/* For the hash list */
557 
558 	/* Link into list */
559 	if (dcb->dcb_dce != NULL)
560 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
561 	dce->dce_next = dcb->dcb_dce;
562 	dce->dce_ptpn = &dcb->dcb_dce;
563 	dcb->dcb_dce = dce;
564 	dce->dce_bucket = dcb;
565 	atomic_inc_32(&dcb->dcb_cnt);
566 	dce_refhold(dce);	/* For the caller */
567 	rw_exit(&dcb->dcb_lock);
568 
569 	/* Initialize dce_ident to be different than for the last packet */
570 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
571 
572 	dce_increment_generation(ipst->ips_dce_default);
573 	return (dce);
574 }
575 
576 /*
577  * Atomically looks for a non-default DCE, and if not found tries to create one.
578  * If there is no memory it returns NULL.
579  * When an entry is created we increase the generation number on
580  * the default DCE so that conn_ip_output will detect there is a new DCE.
581  * ifindex should only be used with link-local addresses.
582  */
583 dce_t *
584 dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
585 {
586 	uint_t		hash;
587 	dcb_t		*dcb;
588 	dce_t		*dce;
589 
590 	/* We should not create entries for link-locals w/o an ifindex */
591 	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
592 
593 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
594 	dcb = &ipst->ips_dce_hash_v6[hash];
595 	/*
596 	 * Assuming that we get fairly even distribution across all of the
597 	 * buckets, once one bucket is overly full, prune the whole cache.
598 	 */
599 	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
600 		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
601 	rw_enter(&dcb->dcb_lock, RW_WRITER);
602 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
603 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
604 		    dce->dce_ifindex == ifindex) {
605 			mutex_enter(&dce->dce_lock);
606 			if (!DCE_IS_CONDEMNED(dce)) {
607 				dce_refhold(dce);
608 				mutex_exit(&dce->dce_lock);
609 				rw_exit(&dcb->dcb_lock);
610 				return (dce);
611 			}
612 			mutex_exit(&dce->dce_lock);
613 		}
614 	}
615 
616 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
617 	if (dce == NULL) {
618 		rw_exit(&dcb->dcb_lock);
619 		return (NULL);
620 	}
621 	bzero(dce, sizeof (dce_t));
622 	dce->dce_ipst = ipst;	/* No netstack_hold */
623 	dce->dce_v6addr = *dst;
624 	dce->dce_ifindex = ifindex;
625 	dce->dce_generation = DCE_GENERATION_INITIAL;
626 	dce->dce_ipversion = IPV6_VERSION;
627 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
628 	dce_refhold(dce);	/* For the hash list */
629 
630 	/* Link into list */
631 	if (dcb->dcb_dce != NULL)
632 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
633 	dce->dce_next = dcb->dcb_dce;
634 	dce->dce_ptpn = &dcb->dcb_dce;
635 	dcb->dcb_dce = dce;
636 	dce->dce_bucket = dcb;
637 	atomic_inc_32(&dcb->dcb_cnt);
638 	dce_refhold(dce);	/* For the caller */
639 	rw_exit(&dcb->dcb_lock);
640 
641 	/* Initialize dce_ident to be different than for the last packet */
642 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
643 	dce_increment_generation(ipst->ips_dce_default);
644 	return (dce);
645 }
646 
647 /*
648  * Set/update uinfo. Creates a per-destination dce if none exists.
649  *
650  * Note that we do not bump the generation number here.
651  * New connections will find the new uinfo.
652  *
653  * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
654  */
655 static void
656 dce_setuinfo(dce_t *dce, iulp_t *uinfo)
657 {
658 	/*
659 	 * Update the round trip time estimate and/or the max frag size
660 	 * and/or the slow start threshold.
661 	 *
662 	 * We serialize multiple advises using dce_lock.
663 	 */
664 	mutex_enter(&dce->dce_lock);
665 	/* Gard against setting to zero */
666 	if (uinfo->iulp_rtt != 0) {
667 		/*
668 		 * If there is no old cached values, initialize them
669 		 * conservatively.  Set them to be (1.5 * new value).
670 		 */
671 		if (dce->dce_uinfo.iulp_rtt != 0) {
672 			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
673 			    uinfo->iulp_rtt) >> 1;
674 		} else {
675 			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
676 			    (uinfo->iulp_rtt >> 1);
677 		}
678 		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
679 			dce->dce_uinfo.iulp_rtt_sd =
680 			    (dce->dce_uinfo.iulp_rtt_sd +
681 			    uinfo->iulp_rtt_sd) >> 1;
682 		} else {
683 			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
684 			    (uinfo->iulp_rtt_sd >> 1);
685 		}
686 	}
687 	if (uinfo->iulp_mtu != 0) {
688 		if (dce->dce_flags & DCEF_PMTU) {
689 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
690 		} else {
691 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
692 			dce->dce_flags |= DCEF_PMTU;
693 		}
694 		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
695 	}
696 	if (uinfo->iulp_ssthresh != 0) {
697 		if (dce->dce_uinfo.iulp_ssthresh != 0)
698 			dce->dce_uinfo.iulp_ssthresh =
699 			    (uinfo->iulp_ssthresh +
700 			    dce->dce_uinfo.iulp_ssthresh) >> 1;
701 		else
702 			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
703 	}
704 	/* We have uinfo for sure */
705 	dce->dce_flags |= DCEF_UINFO;
706 	mutex_exit(&dce->dce_lock);
707 }
708 
709 
710 int
711 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
712 {
713 	dce_t *dce;
714 
715 	dce = dce_lookup_and_add_v4(dst, ipst);
716 	if (dce == NULL)
717 		return (ENOMEM);
718 
719 	dce_setuinfo(dce, uinfo);
720 	dce_refrele(dce);
721 	return (0);
722 }
723 
724 int
725 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
726     ip_stack_t *ipst)
727 {
728 	dce_t *dce;
729 
730 	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
731 	if (dce == NULL)
732 		return (ENOMEM);
733 
734 	dce_setuinfo(dce, uinfo);
735 	dce_refrele(dce);
736 	return (0);
737 }
738 
739 /* Common routine for IPv4 and IPv6 */
740 int
741 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
742     ip_stack_t *ipst)
743 {
744 	ipaddr_t dst4;
745 
746 	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
747 		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
748 		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
749 	} else {
750 		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
751 	}
752 }
753 
754 static void
755 dce_make_condemned(dce_t *dce)
756 {
757 	ip_stack_t	*ipst = dce->dce_ipst;
758 
759 	mutex_enter(&dce->dce_lock);
760 	ASSERT(!DCE_IS_CONDEMNED(dce));
761 	dce->dce_generation = DCE_GENERATION_CONDEMNED;
762 	mutex_exit(&dce->dce_lock);
763 	/* Count how many condemned dces for kmem_cache callback */
764 	atomic_inc_32(&ipst->ips_num_dce_condemned);
765 }
766 
767 /*
768  * Increment the generation avoiding the special condemned value
769  */
770 void
771 dce_increment_generation(dce_t *dce)
772 {
773 	uint_t generation;
774 
775 	mutex_enter(&dce->dce_lock);
776 	if (!DCE_IS_CONDEMNED(dce)) {
777 		generation = dce->dce_generation + 1;
778 		if (generation == DCE_GENERATION_CONDEMNED)
779 			generation = DCE_GENERATION_INITIAL;
780 		ASSERT(generation != DCE_GENERATION_VERIFY);
781 		dce->dce_generation = generation;
782 	}
783 	mutex_exit(&dce->dce_lock);
784 }
785 
786 /*
787  * Increment the generation number on all dces that have a path MTU and
788  * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
789  */
790 void
791 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
792 {
793 	int		i;
794 	dcb_t		*dcb;
795 	dce_t		*dce;
796 
797 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
798 		if (isv6)
799 			dcb = &ipst->ips_dce_hash_v6[i];
800 		else
801 			dcb = &ipst->ips_dce_hash_v4[i];
802 		rw_enter(&dcb->dcb_lock, RW_WRITER);
803 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
804 			if (DCE_IS_CONDEMNED(dce))
805 				continue;
806 			dce_increment_generation(dce);
807 		}
808 		rw_exit(&dcb->dcb_lock);
809 	}
810 	dce_increment_generation(ipst->ips_dce_default);
811 }
812 
813 /*
814  * Caller needs to do a dce_refrele since we can't do the
815  * dce_refrele under dcb_lock.
816  */
817 static void
818 dce_delete_locked(dcb_t *dcb, dce_t *dce)
819 {
820 	dce->dce_bucket = NULL;
821 	*dce->dce_ptpn = dce->dce_next;
822 	if (dce->dce_next != NULL)
823 		dce->dce_next->dce_ptpn = dce->dce_ptpn;
824 	dce->dce_ptpn = NULL;
825 	dce->dce_next = NULL;
826 	atomic_dec_32(&dcb->dcb_cnt);
827 	dce_make_condemned(dce);
828 }
829 
830 static void
831 dce_inactive(dce_t *dce)
832 {
833 	ip_stack_t	*ipst = dce->dce_ipst;
834 
835 	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
836 	ASSERT(dce->dce_ptpn == NULL);
837 	ASSERT(dce->dce_bucket == NULL);
838 
839 	/* Count how many condemned dces for kmem_cache callback */
840 	if (DCE_IS_CONDEMNED(dce))
841 		atomic_dec_32(&ipst->ips_num_dce_condemned);
842 
843 	kmem_cache_free(dce_cache, dce);
844 }
845 
846 void
847 dce_refrele(dce_t *dce)
848 {
849 	ASSERT(dce->dce_refcnt != 0);
850 	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
851 		dce_inactive(dce);
852 }
853 
854 void
855 dce_refhold(dce_t *dce)
856 {
857 	atomic_inc_32(&dce->dce_refcnt);
858 	ASSERT(dce->dce_refcnt != 0);
859 }
860 
861 /* No tracing support yet hence the same as the above functions */
862 void
863 dce_refrele_notr(dce_t *dce)
864 {
865 	ASSERT(dce->dce_refcnt != 0);
866 	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
867 		dce_inactive(dce);
868 }
869 
870 void
871 dce_refhold_notr(dce_t *dce)
872 {
873 	atomic_inc_32(&dce->dce_refcnt);
874 	ASSERT(dce->dce_refcnt != 0);
875 }
876 
877 /* Report both the IPv4 and IPv6 DCEs. */
878 mblk_t *
879 ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
880 {
881 	struct opthdr		*optp;
882 	mblk_t			*mp2ctl;
883 	dest_cache_entry_t	dest_cache;
884 	mblk_t			*mp_tail = NULL;
885 	dce_t			*dce;
886 	dcb_t			*dcb;
887 	int			i;
888 	uint64_t		current_time;
889 
890 	current_time = TICK_TO_SEC(ddi_get_lbolt64());
891 
892 	/*
893 	 * make a copy of the original message
894 	 */
895 	mp2ctl = copymsg(mpctl);
896 
897 	/* First we do IPv4 entries */
898 	optp = (struct opthdr *)&mpctl->b_rptr[
899 	    sizeof (struct T_optmgmt_ack)];
900 	optp->level = MIB2_IP;
901 	optp->name = EXPER_IP_DCE;
902 
903 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
904 		dcb = &ipst->ips_dce_hash_v4[i];
905 		rw_enter(&dcb->dcb_lock, RW_READER);
906 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
907 			dest_cache.DestIpv4Address = dce->dce_v4addr;
908 			dest_cache.DestFlags = dce->dce_flags;
909 			if (dce->dce_flags & DCEF_PMTU)
910 				dest_cache.DestPmtu = dce->dce_pmtu;
911 			else
912 				dest_cache.DestPmtu = 0;
913 			dest_cache.DestIdent = dce->dce_ident;
914 			dest_cache.DestIfindex = 0;
915 			dest_cache.DestAge = current_time -
916 			    dce->dce_last_change_time;
917 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
918 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
919 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
920 				    "failed to allocate %u bytes\n",
921 				    (uint_t)sizeof (dest_cache)));
922 			}
923 		}
924 		rw_exit(&dcb->dcb_lock);
925 	}
926 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
927 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
928 	    (int)optp->level, (int)optp->name, (int)optp->len));
929 	qreply(q, mpctl);
930 
931 	if (mp2ctl == NULL) {
932 		/* Copymsg failed above */
933 		return (NULL);
934 	}
935 
936 	/* Now for IPv6 */
937 	mpctl = mp2ctl;
938 	mp_tail = NULL;
939 	mp2ctl = copymsg(mpctl);
940 	optp = (struct opthdr *)&mpctl->b_rptr[
941 	    sizeof (struct T_optmgmt_ack)];
942 	optp->level = MIB2_IP6;
943 	optp->name = EXPER_IP_DCE;
944 
945 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
946 		dcb = &ipst->ips_dce_hash_v6[i];
947 		rw_enter(&dcb->dcb_lock, RW_READER);
948 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
949 			dest_cache.DestIpv6Address = dce->dce_v6addr;
950 			dest_cache.DestFlags = dce->dce_flags;
951 			if (dce->dce_flags & DCEF_PMTU)
952 				dest_cache.DestPmtu = dce->dce_pmtu;
953 			else
954 				dest_cache.DestPmtu = 0;
955 			dest_cache.DestIdent = dce->dce_ident;
956 			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
957 				dest_cache.DestIfindex = dce->dce_ifindex;
958 			else
959 				dest_cache.DestIfindex = 0;
960 			dest_cache.DestAge = current_time -
961 			    dce->dce_last_change_time;
962 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
963 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
964 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
965 				    "failed to allocate %u bytes\n",
966 				    (uint_t)sizeof (dest_cache)));
967 			}
968 		}
969 		rw_exit(&dcb->dcb_lock);
970 	}
971 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
972 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
973 	    (int)optp->level, (int)optp->name, (int)optp->len));
974 	qreply(q, mpctl);
975 
976 	return (mp2ctl);
977 }
978 
979 /*
980  * Remove IPv6 DCEs which refer to an ifindex that is going away.
981  * This is not required for correctness, but it avoids netstat -d
982  * showing stale stuff that will never be used.
983  */
984 void
985 dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
986 {
987 	uint_t	i;
988 
989 	for (i = 0; i < ipst->ips_dce_hashsize; i++)
990 		dce_bucket_clean(&ipst->ips_dce_hash_v6[i], B_TRUE, ifindex);
991 }
992