xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_dce.c (revision 4b5c8e93cab28d3c65ba9d407fd8f46e3be1db1c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/zone.h>
31 #include <sys/ddi.h>
32 #include <sys/disp.h>
33 #include <sys/sunddi.h>
34 #include <sys/cmn_err.h>
35 #include <sys/debug.h>
36 #include <sys/atomic.h>
37 #include <sys/callb.h>
38 #define	_SUN_TPI_VERSION 2
39 #include <sys/tihdr.h>
40 
41 #include <inet/common.h>
42 #include <inet/mi.h>
43 #include <inet/mib2.h>
44 #include <inet/snmpcom.h>
45 
46 #include <netinet/ip6.h>
47 #include <netinet/icmp6.h>
48 
49 #include <inet/ip.h>
50 #include <inet/ip_impl.h>
51 #include <inet/ip6.h>
52 #include <inet/ip6_asp.h>
53 #include <inet/ip_multi.h>
54 #include <inet/ip_if.h>
55 #include <inet/ip_ire.h>
56 #include <inet/ip_ftable.h>
57 #include <inet/ip_rts.h>
58 #include <inet/ip_ndp.h>
59 #include <inet/ipclassifier.h>
60 #include <inet/ip_listutils.h>
61 
62 #include <sys/sunddi.h>
63 
64 /*
65  * Routines for handling destination cache entries.
66  * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
67  * That entry holds both the IP ident value and the dce generation number.
68  *
69  * Any time a DCE is changed significantly (different path MTU, but NOT
70  * different ULP info!), the dce_generation number is increased.
71  * Also, when a new DCE is created, the dce_generation number in the default
72  * DCE is bumped. That allows the dce_t information to be cached efficiently
73  * as long as the entity caching the dce_t also caches the dce_generation,
74  * and compares the cached generation to detect any changes.
75  * Furthermore, when a DCE is deleted, if there are any outstanding references
76  * to the DCE it will be marked as condemned. The condemned mark is
77  * a designated generation number which is never otherwise used, hence
78  * the single comparison with the generation number captures that as well.
79  *
80  * An example of code which caches is as follows:
81  *
82  *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
83  *		The DCE has changed
84  *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
85  *		    &mystruct->my_dce_generation);
86  *		Not needed in practice, since we have the default DCE:
87  *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
88  *			return failure;
89  *	}
90  *
91  * Note that for IPv6 link-local addresses we record the ifindex since the
92  * link-locals are not globally unique.
93  */
94 
95 /*
96  * Hash bucket structure for DCEs
97  */
98 typedef struct dcb_s {
99 	krwlock_t	dcb_lock;	/* Protects the bucket's list and count */
100 	uint32_t	dcb_cnt;	/* Number of entries in the bucket */
101 	dce_t		*dcb_dce;	/* Head of the bucket's list of DCEs */
102 } dcb_t;
103 
104 static void	dce_delete_locked(dcb_t *, dce_t *);
105 static void	dce_make_condemned(dce_t *);
106 
107 static kmem_cache_t *dce_cache;
108 static kthread_t *dce_reclaim_thread;
109 static kmutex_t dce_reclaim_lock;
110 static kcondvar_t dce_reclaim_cv;
111 static int dce_reclaim_shutdown;
112 
113 /* Global so it can be tuned in /etc/system. This must be a power of two. */
114 uint_t ip_dce_hash_size = 1024;
115 
116 /* The time in seconds between executions of the IP DCE reclaim worker. */
117 uint_t ip_dce_reclaim_interval = 60;
118 
119 /* The factor of the DCE threshold at which to start hard reclaims */
120 uint_t ip_dce_reclaim_threshold_hard = 2;
121 
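/*
 * As an illustrative (hypothetical) example, the hash size could be doubled
 * by adding the following line to /etc/system and rebooting:
 *
 *	set ip:ip_dce_hash_size = 2048
 */
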
122 /* Operates on a uint64_t; XORs in shifted copies to mix all of the bits */
123 #define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
124 
125 /*
126  * Reclaim a fraction of the DCEs in the dcb.
127  * For now, DCEs without DCEF_PMTU set are more likely to be deleted.
128  */
129 static void
130 dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
131 {
132 	uint_t	fraction_pmtu = fraction*4;
133 	uint_t	hash;
134 	dce_t	*dce, *nextdce;
135 	hrtime_t seed = gethrtime();
136 	uint_t	retained = 0;
137 	uint_t	max = ipst->ips_ip_dce_reclaim_threshold;
138 
139 	max *= ip_dce_reclaim_threshold_hard;
140 
141 	rw_enter(&dcb->dcb_lock, RW_WRITER);
142 	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
143 		nextdce = dce->dce_next;
144 		/* Clear DCEF_PMTU if the pmtu is too old */
145 		mutex_enter(&dce->dce_lock);
146 		if ((dce->dce_flags & DCEF_PMTU) &&
147 		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
148 		    ipst->ips_ip_pathmtu_interval) {
149 			dce->dce_flags &= ~DCEF_PMTU;
150 			mutex_exit(&dce->dce_lock);
151 			dce_increment_generation(dce);
152 		} else {
153 			mutex_exit(&dce->dce_lock);
154 		}
155 
156 		if (max == 0 || retained < max) {
157 			hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed));
158 
159 			if (dce->dce_flags & DCEF_PMTU) {
160 				if (hash % fraction_pmtu != 0) {
161 					retained++;
162 					continue;
163 				}
164 			} else {
165 				if (hash % fraction != 0) {
166 					retained++;
167 					continue;
168 				}
169 			}
170 		}
171 
172 		IP_STAT(ipst, ip_dce_reclaim_deleted);
173 		dce_delete_locked(dcb, dce);
174 		dce_refrele(dce);
175 	}
176 	rw_exit(&dcb->dcb_lock);
177 }
178 
179 /*
180  * Reclaim memory from one IP stack's DCE caches.  Called from
181  * ip_dce_reclaim() when this stack has been flagged as needing a reclaim.
182  */
183 static void
184 ip_dce_reclaim_stack(ip_stack_t *ipst)
185 {
186 	int	i;
187 
188 	IP_STAT(ipst, ip_dce_reclaim_calls);
189 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
190 		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
191 		    ipst->ips_ip_dce_reclaim_fraction);
192 
193 		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
194 		    ipst->ips_ip_dce_reclaim_fraction);
195 	}
196 
197 	/*
198 	 * Walk all CONNs that can have a reference on an ire, nce or dce.
199 	 * Get them to update any stale references to drop any refholds they
200 	 * have.
201 	 */
202 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
203 }
204 
205 /*
206  * Called by dce_reclaim_worker() below, and no one else.  A reclaim is
207  * typically needed because the number of entries in a hash bucket has
208  * exceeded a tunable threshold.
209  */
210 static void
211 ip_dce_reclaim(void)
212 {
213 	netstack_handle_t nh;
214 	netstack_t *ns;
215 	ip_stack_t *ipst;
216 
217 	ASSERT(curthread == dce_reclaim_thread);
218 
219 	netstack_next_init(&nh);
220 	while ((ns = netstack_next(&nh)) != NULL) {
221 		/*
222 		 * netstack_next() can return a netstack_t with a NULL
223 		 * netstack_ip at boot time.
224 		 */
225 		if ((ipst = ns->netstack_ip) == NULL) {
226 			netstack_rele(ns);
227 			continue;
228 		}
229 		if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
230 			ip_dce_reclaim_stack(ipst);
231 		netstack_rele(ns);
232 	}
233 	netstack_next_fini(&nh);
234 }
235 
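/*
 * Reclaim worker thread.  Wakes up every ip_dce_reclaim_interval seconds
 * (or when signalled during shutdown) and calls ip_dce_reclaim() to prune
 * any netstacks that have flagged ips_dce_reclaim_needed.
 */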
236 /* ARGSUSED */
237 static void
238 dce_reclaim_worker(void *arg)
239 {
240 	callb_cpr_t	cprinfo;
241 
242 	CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
243 	    "dce_reclaim_worker");
244 
245 	mutex_enter(&dce_reclaim_lock);
246 	while (!dce_reclaim_shutdown) {
247 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
248 		(void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
249 		    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
250 		CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);
251 
252 		if (dce_reclaim_shutdown)
253 			break;
254 
255 		mutex_exit(&dce_reclaim_lock);
256 		ip_dce_reclaim();
257 		mutex_enter(&dce_reclaim_lock);
258 	}
259 
260 	ASSERT(MUTEX_HELD(&dce_reclaim_lock));
261 	dce_reclaim_thread = NULL;
262 	dce_reclaim_shutdown = 0;
263 	cv_broadcast(&dce_reclaim_cv);
264 	CALLB_CPR_EXIT(&cprinfo);	/* drops the lock */
265 
266 	thread_exit();
267 }
268 
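/*
 * Module initialization: create the dce kmem cache and start the global
 * reclaim worker thread.
 */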
269 void
270 dce_g_init(void)
271 {
272 	dce_cache = kmem_cache_create("dce_cache",
273 	    sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
274 
275 	mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
276 	cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);
277 
278 	dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
279 	    NULL, 0, &p0, TS_RUN, minclsyspri);
280 }
281 
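/*
 * Module teardown: tell the reclaim worker to exit, wait for it to do so,
 * and then destroy the global locks and the dce kmem cache.
 */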
282 void
283 dce_g_destroy(void)
284 {
285 	mutex_enter(&dce_reclaim_lock);
286 	dce_reclaim_shutdown = 1;
287 	cv_signal(&dce_reclaim_cv);
288 	while (dce_reclaim_thread != NULL)
289 		cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
290 	mutex_exit(&dce_reclaim_lock);
291 
292 	cv_destroy(&dce_reclaim_cv);
293 	mutex_destroy(&dce_reclaim_lock);
294 
295 	kmem_cache_destroy(dce_cache);
296 }
297 
298 /*
299  * Allocate a default DCE and a hash table for per-IP address DCEs
300  */
301 void
302 dce_stack_init(ip_stack_t *ipst)
303 {
304 	int	i;
305 
306 	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
307 	bzero(ipst->ips_dce_default, sizeof (dce_t));
308 	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
309 	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
310 	ipst->ips_dce_default->dce_last_change_time =
311 	    TICK_TO_SEC(ddi_get_lbolt64());
312 	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
313 	ipst->ips_dce_default->dce_ipst = ipst;
314 
315 	/* This must be a power of two since we use the IRE_ADDR_HASH macro */
316 	ipst->ips_dce_hashsize = ip_dce_hash_size;
317 	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
318 	    sizeof (dcb_t), KM_SLEEP);
319 	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
320 	    sizeof (dcb_t), KM_SLEEP);
321 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
322 		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
323 		    NULL);
324 		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
325 		    NULL);
326 	}
327 }
328 
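/*
 * Tear down the per-stack state: the DCE hash tables and the default DCE.
 */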
329 void
330 dce_stack_destroy(ip_stack_t *ipst)
331 {
332 	int i;
333 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
334 		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
335 		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
336 	}
337 	kmem_free(ipst->ips_dce_hash_v4,
338 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
339 	ipst->ips_dce_hash_v4 = NULL;
340 	kmem_free(ipst->ips_dce_hash_v6,
341 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
342 	ipst->ips_dce_hash_v6 = NULL;
343 	ipst->ips_dce_hashsize = 0;
344 
345 	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
346 	kmem_cache_free(dce_cache, ipst->ips_dce_default);
347 	ipst->ips_dce_default = NULL;
348 }
349 
350 /* When any DCE is good enough */
351 dce_t *
352 dce_get_default(ip_stack_t *ipst)
353 {
354 	dce_t		*dce;
355 
356 	dce = ipst->ips_dce_default;
357 	dce_refhold(dce);
358 	return (dce);
359 }
360 
361 /*
362  * Generic for IPv4 and IPv6.
363  *
364  * Used by callers that need to cache the result, such as the datapath.
365  * Returns the generation number in the last argument.
366  */
367 dce_t *
368 dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
369 {
370 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
371 		/*
372 		 * If we have a source route we need to look for the final
373 		 * destination in the source route option.
374 		 */
375 		ipaddr_t final_dst;
376 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
377 
378 		final_dst = ip_get_dst(ipha);
379 		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
380 	} else {
381 		uint_t ifindex;
382 		/*
383 		 * If we have a routing header we need to look for the final
384 		 * destination in the routing extension header.
385 		 */
386 		in6_addr_t final_dst;
387 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
388 
389 		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
390 		ifindex = 0;
391 		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
392 			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
393 			    ill_phyint->phyint_ifindex;
394 		}
395 		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
396 		    generationp));
397 	}
398 }
399 
400 /*
401  * Used by callers that need to cache the result, such as the datapath.
402  * Returns the generation number in the last argument.
403  */
404 dce_t *
405 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
406 {
407 	uint_t		hash;
408 	dcb_t		*dcb;
409 	dce_t		*dce;
410 
411 	/* Set *generationp before dropping the lock(s) that allow additions */
412 	if (generationp != NULL)
413 		*generationp = ipst->ips_dce_default->dce_generation;
414 
415 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
416 	dcb = &ipst->ips_dce_hash_v4[hash];
417 	rw_enter(&dcb->dcb_lock, RW_READER);
418 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
419 		if (dce->dce_v4addr == dst) {
420 			mutex_enter(&dce->dce_lock);
421 			if (!DCE_IS_CONDEMNED(dce)) {
422 				dce_refhold(dce);
423 				if (generationp != NULL)
424 					*generationp = dce->dce_generation;
425 				mutex_exit(&dce->dce_lock);
426 				rw_exit(&dcb->dcb_lock);
427 				return (dce);
428 			}
429 			mutex_exit(&dce->dce_lock);
430 		}
431 	}
432 	rw_exit(&dcb->dcb_lock);
433 	/* Not found */
434 	dce = ipst->ips_dce_default;
435 	dce_refhold(dce);
436 	return (dce);
437 }
438 
439 /*
440  * Used by callers that need to cache the result, such as the datapath.
441  * Returns the generation number in the last argument.
442  * ifindex should only be set for link-local addresses.
443  */
444 dce_t *
445 dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
446     uint_t *generationp)
447 {
448 	uint_t		hash;
449 	dcb_t		*dcb;
450 	dce_t		*dce;
451 
452 	/* Set *generationp before dropping the lock(s) that allow additions */
453 	if (generationp != NULL)
454 		*generationp = ipst->ips_dce_default->dce_generation;
455 
456 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
457 	dcb = &ipst->ips_dce_hash_v6[hash];
458 	rw_enter(&dcb->dcb_lock, RW_READER);
459 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
460 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
461 		    dce->dce_ifindex == ifindex) {
462 			mutex_enter(&dce->dce_lock);
463 			if (!DCE_IS_CONDEMNED(dce)) {
464 				dce_refhold(dce);
465 				if (generationp != NULL)
466 					*generationp = dce->dce_generation;
467 				mutex_exit(&dce->dce_lock);
468 				rw_exit(&dcb->dcb_lock);
469 				return (dce);
470 			}
471 			mutex_exit(&dce->dce_lock);
472 		}
473 	}
474 	rw_exit(&dcb->dcb_lock);
475 	/* Not found */
476 	dce = ipst->ips_dce_default;
477 	dce_refhold(dce);
478 	return (dce);
479 }
480 
481 /*
482  * Atomically looks for a non-default DCE, and if not found tries to create one.
483  * If there is no memory it returns NULL.
484  * When an entry is created we increase the generation number on
485  * the default DCE so that conn_ip_output will detect there is a new DCE.
486  */
487 dce_t *
488 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
489 {
490 	uint_t		hash;
491 	dcb_t		*dcb;
492 	dce_t		*dce;
493 
494 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
495 	dcb = &ipst->ips_dce_hash_v4[hash];
496 	/*
497 	 * Assuming that we get fairly even distribution across all of the
498 	 * buckets, once one bucket is overly full, prune the whole cache.
499 	 */
500 	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
501 		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
502 	rw_enter(&dcb->dcb_lock, RW_WRITER);
503 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
504 		if (dce->dce_v4addr == dst) {
505 			mutex_enter(&dce->dce_lock);
506 			if (!DCE_IS_CONDEMNED(dce)) {
507 				dce_refhold(dce);
508 				mutex_exit(&dce->dce_lock);
509 				rw_exit(&dcb->dcb_lock);
510 				return (dce);
511 			}
512 			mutex_exit(&dce->dce_lock);
513 		}
514 	}
515 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
516 	if (dce == NULL) {
517 		rw_exit(&dcb->dcb_lock);
518 		return (NULL);
519 	}
520 	bzero(dce, sizeof (dce_t));
521 	dce->dce_ipst = ipst;	/* No netstack_hold */
522 	dce->dce_v4addr = dst;
523 	dce->dce_generation = DCE_GENERATION_INITIAL;
524 	dce->dce_ipversion = IPV4_VERSION;
525 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
526 	dce_refhold(dce);	/* For the hash list */
527 
528 	/* Link into list */
529 	if (dcb->dcb_dce != NULL)
530 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
531 	dce->dce_next = dcb->dcb_dce;
532 	dce->dce_ptpn = &dcb->dcb_dce;
533 	dcb->dcb_dce = dce;
534 	dce->dce_bucket = dcb;
535 	atomic_inc_32(&dcb->dcb_cnt);
536 	dce_refhold(dce);	/* For the caller */
537 	rw_exit(&dcb->dcb_lock);
538 
539 	/* Initialize dce_ident to be different than for the last packet */
540 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
541 
542 	dce_increment_generation(ipst->ips_dce_default);
543 	return (dce);
544 }
545 
546 /*
547  * Atomically looks for a non-default DCE, and if not found tries to create one.
548  * If there is no memory it returns NULL.
549  * When an entry is created we increase the generation number on
550  * the default DCE so that conn_ip_output will detect there is a new DCE.
551  * ifindex should only be used with link-local addresses.
552  */
553 dce_t *
554 dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
555 {
556 	uint_t		hash;
557 	dcb_t		*dcb;
558 	dce_t		*dce;
559 
560 	/* We should not create entries for link-locals w/o an ifindex */
561 	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
562 
563 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
564 	dcb = &ipst->ips_dce_hash_v6[hash];
565 	/*
566 	 * Assuming that we get fairly even distribution across all of the
567 	 * buckets, once one bucket is overly full, prune the whole cache.
568 	 */
569 	if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
570 		atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);
571 	rw_enter(&dcb->dcb_lock, RW_WRITER);
572 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
573 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
574 		    dce->dce_ifindex == ifindex) {
575 			mutex_enter(&dce->dce_lock);
576 			if (!DCE_IS_CONDEMNED(dce)) {
577 				dce_refhold(dce);
578 				mutex_exit(&dce->dce_lock);
579 				rw_exit(&dcb->dcb_lock);
580 				return (dce);
581 			}
582 			mutex_exit(&dce->dce_lock);
583 		}
584 	}
585 
586 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
587 	if (dce == NULL) {
588 		rw_exit(&dcb->dcb_lock);
589 		return (NULL);
590 	}
591 	bzero(dce, sizeof (dce_t));
592 	dce->dce_ipst = ipst;	/* No netstack_hold */
593 	dce->dce_v6addr = *dst;
594 	dce->dce_ifindex = ifindex;
595 	dce->dce_generation = DCE_GENERATION_INITIAL;
596 	dce->dce_ipversion = IPV6_VERSION;
597 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
598 	dce_refhold(dce);	/* For the hash list */
599 
600 	/* Link into list */
601 	if (dcb->dcb_dce != NULL)
602 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
603 	dce->dce_next = dcb->dcb_dce;
604 	dce->dce_ptpn = &dcb->dcb_dce;
605 	dcb->dcb_dce = dce;
606 	dce->dce_bucket = dcb;
607 	atomic_inc_32(&dcb->dcb_cnt);
608 	dce_refhold(dce);	/* For the caller */
609 	rw_exit(&dcb->dcb_lock);
610 
611 	/* Initialize dce_ident to be different than for the last packet */
612 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
613 	dce_increment_generation(ipst->ips_dce_default);
614 	return (dce);
615 }
616 
617 /*
618  * Set/update uinfo. Creates a per-destination dce if none exists.
619  *
620  * Note that we do not bump the generation number here.
621  * New connections will find the new uinfo.
622  *
623  * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
624  */
625 static void
626 dce_setuinfo(dce_t *dce, iulp_t *uinfo)
627 {
628 	/*
629 	 * Update the round trip time estimate and/or the max frag size
630 	 * and/or the slow start threshold.
631 	 *
632 	 * We serialize multiple advises using dce_lock.
633 	 */
634 	mutex_enter(&dce->dce_lock);
635 	/* Guard against setting to zero */
636 	if (uinfo->iulp_rtt != 0) {
637 		/*
638 		 * If there are no old cached values, initialize them
639 		 * conservatively to (1.5 * new value); otherwise average them.
640 		 */
641 		if (dce->dce_uinfo.iulp_rtt != 0) {
642 			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
643 			    uinfo->iulp_rtt) >> 1;
644 		} else {
645 			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
646 			    (uinfo->iulp_rtt >> 1);
647 		}
648 		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
649 			dce->dce_uinfo.iulp_rtt_sd =
650 			    (dce->dce_uinfo.iulp_rtt_sd +
651 			    uinfo->iulp_rtt_sd) >> 1;
652 		} else {
653 			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
654 			    (uinfo->iulp_rtt_sd >> 1);
655 		}
656 	}
657 	if (uinfo->iulp_mtu != 0) {
658 		if (dce->dce_flags & DCEF_PMTU) {
659 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
660 		} else {
661 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
662 			dce->dce_flags |= DCEF_PMTU;
663 		}
664 		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
665 	}
666 	if (uinfo->iulp_ssthresh != 0) {
667 		if (dce->dce_uinfo.iulp_ssthresh != 0)
668 			dce->dce_uinfo.iulp_ssthresh =
669 			    (uinfo->iulp_ssthresh +
670 			    dce->dce_uinfo.iulp_ssthresh) >> 1;
671 		else
672 			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
673 	}
674 	/* We have uinfo for sure */
675 	dce->dce_flags |= DCEF_UINFO;
676 	mutex_exit(&dce->dce_lock);
677 }
678 
679 
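/*
 * Set/update uinfo for an IPv4 destination, creating the DCE if needed.
 */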
680 int
681 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
682 {
683 	dce_t *dce;
684 
685 	dce = dce_lookup_and_add_v4(dst, ipst);
686 	if (dce == NULL)
687 		return (ENOMEM);
688 
689 	dce_setuinfo(dce, uinfo);
690 	dce_refrele(dce);
691 	return (0);
692 }
693 
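/*
 * Set/update uinfo for an IPv6 destination, creating the DCE if needed.
 * ifindex should only be set for link-local addresses.
 */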
694 int
695 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
696     ip_stack_t *ipst)
697 {
698 	dce_t *dce;
699 
700 	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
701 	if (dce == NULL)
702 		return (ENOMEM);
703 
704 	dce_setuinfo(dce, uinfo);
705 	dce_refrele(dce);
706 	return (0);
707 }
708 
709 /* Common routine for IPv4 and IPv6 */
710 int
711 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
712     ip_stack_t *ipst)
713 {
714 	ipaddr_t dst4;
715 
716 	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
717 		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
718 		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
719 	} else {
720 		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
721 	}
722 }
723 
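/*
 * Mark a DCE as condemned by storing the reserved generation value.
 * Holders of cached references notice the change on their next
 * generation comparison.
 */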
724 static void
725 dce_make_condemned(dce_t *dce)
726 {
727 	ip_stack_t	*ipst = dce->dce_ipst;
728 
729 	mutex_enter(&dce->dce_lock);
730 	ASSERT(!DCE_IS_CONDEMNED(dce));
731 	dce->dce_generation = DCE_GENERATION_CONDEMNED;
732 	mutex_exit(&dce->dce_lock);
733 	/* Count how many condemned dces for kmem_cache callback */
734 	atomic_inc_32(&ipst->ips_num_dce_condemned);
735 }
736 
737 /*
738  * Increment the generation, avoiding the special condemned value.
739  */
740 void
741 dce_increment_generation(dce_t *dce)
742 {
743 	uint_t generation;
744 
745 	mutex_enter(&dce->dce_lock);
746 	if (!DCE_IS_CONDEMNED(dce)) {
747 		generation = dce->dce_generation + 1;
748 		if (generation == DCE_GENERATION_CONDEMNED)
749 			generation = DCE_GENERATION_INITIAL;
750 		ASSERT(generation != DCE_GENERATION_VERIFY);
751 		dce->dce_generation = generation;
752 	}
753 	mutex_exit(&dce->dce_lock);
754 }
755 
756 /*
757  * Increment the generation number on all dces that have a path MTU and
758  * the default DCE. Used when ill_mtu or ill_mc_mtu changes.
759  */
760 void
761 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
762 {
763 	int		i;
764 	dcb_t		*dcb;
765 	dce_t		*dce;
766 
767 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
768 		if (isv6)
769 			dcb = &ipst->ips_dce_hash_v6[i];
770 		else
771 			dcb = &ipst->ips_dce_hash_v4[i];
772 		rw_enter(&dcb->dcb_lock, RW_WRITER);
773 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
774 			if (DCE_IS_CONDEMNED(dce))
775 				continue;
776 			dce_increment_generation(dce);
777 		}
778 		rw_exit(&dcb->dcb_lock);
779 	}
780 	dce_increment_generation(ipst->ips_dce_default);
781 }
782 
783 /*
784  * Caller needs to do a dce_refrele since we can't do the
785  * dce_refrele under dcb_lock.
786  */
787 static void
788 dce_delete_locked(dcb_t *dcb, dce_t *dce)
789 {
790 	dce->dce_bucket = NULL;
791 	*dce->dce_ptpn = dce->dce_next;
792 	if (dce->dce_next != NULL)
793 		dce->dce_next->dce_ptpn = dce->dce_ptpn;
794 	dce->dce_ptpn = NULL;
795 	dce->dce_next = NULL;
796 	atomic_dec_32(&dcb->dcb_cnt);
797 	dce_make_condemned(dce);
798 }
799 
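/*
 * Called when the last reference is dropped; frees a non-default DCE
 * back to the kmem cache.
 */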
800 static void
801 dce_inactive(dce_t *dce)
802 {
803 	ip_stack_t	*ipst = dce->dce_ipst;
804 
805 	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
806 	ASSERT(dce->dce_ptpn == NULL);
807 	ASSERT(dce->dce_bucket == NULL);
808 
809 	/* Count how many condemned dces for kmem_cache callback */
810 	if (DCE_IS_CONDEMNED(dce))
811 		atomic_dec_32(&ipst->ips_num_dce_condemned);
812 
813 	kmem_cache_free(dce_cache, dce);
814 }
815 
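/*
 * Reference counting.  When the last reference on a DCE is dropped the
 * entry is freed via dce_inactive().
 */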
816 void
817 dce_refrele(dce_t *dce)
818 {
819 	ASSERT(dce->dce_refcnt != 0);
820 	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
821 		dce_inactive(dce);
822 }
823 
824 void
825 dce_refhold(dce_t *dce)
826 {
827 	atomic_inc_32(&dce->dce_refcnt);
828 	ASSERT(dce->dce_refcnt != 0);
829 }
830 
831 /* No tracing support yet, hence the same as the functions above */
832 void
833 dce_refrele_notr(dce_t *dce)
834 {
835 	ASSERT(dce->dce_refcnt != 0);
836 	if (atomic_dec_32_nv(&dce->dce_refcnt) == 0)
837 		dce_inactive(dce);
838 }
839 
840 void
841 dce_refhold_notr(dce_t *dce)
842 {
843 	atomic_inc_32(&dce->dce_refcnt);
844 	ASSERT(dce->dce_refcnt != 0);
845 }
846 
847 /* Report both the IPv4 and IPv6 DCEs. */
848 mblk_t *
849 ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
850 {
851 	struct opthdr		*optp;
852 	mblk_t			*mp2ctl;
853 	dest_cache_entry_t	dest_cache;
854 	mblk_t			*mp_tail = NULL;
855 	dce_t			*dce;
856 	dcb_t			*dcb;
857 	int			i;
858 	uint64_t		current_time;
859 
860 	current_time = TICK_TO_SEC(ddi_get_lbolt64());
861 
862 	/*
863 	 * make a copy of the original message
864 	 */
865 	mp2ctl = copymsg(mpctl);
866 
867 	/* First we do IPv4 entries */
868 	optp = (struct opthdr *)&mpctl->b_rptr[
869 	    sizeof (struct T_optmgmt_ack)];
870 	optp->level = MIB2_IP;
871 	optp->name = EXPER_IP_DCE;
872 
873 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
874 		dcb = &ipst->ips_dce_hash_v4[i];
875 		rw_enter(&dcb->dcb_lock, RW_READER);
876 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
877 			dest_cache.DestIpv4Address = dce->dce_v4addr;
878 			dest_cache.DestFlags = dce->dce_flags;
879 			if (dce->dce_flags & DCEF_PMTU)
880 				dest_cache.DestPmtu = dce->dce_pmtu;
881 			else
882 				dest_cache.DestPmtu = 0;
883 			dest_cache.DestIdent = dce->dce_ident;
884 			dest_cache.DestIfindex = 0;
885 			dest_cache.DestAge = current_time -
886 			    dce->dce_last_change_time;
887 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
888 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
889 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
890 				    "failed to allocate %u bytes\n",
891 				    (uint_t)sizeof (dest_cache)));
892 			}
893 		}
894 		rw_exit(&dcb->dcb_lock);
895 	}
896 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
897 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
898 	    (int)optp->level, (int)optp->name, (int)optp->len));
899 	qreply(q, mpctl);
900 
901 	if (mp2ctl == NULL) {
902 		/* Copymsg failed above */
903 		return (NULL);
904 	}
905 
906 	/* Now for IPv6 */
907 	mpctl = mp2ctl;
908 	mp_tail = NULL;
909 	mp2ctl = copymsg(mpctl);
910 	optp = (struct opthdr *)&mpctl->b_rptr[
911 	    sizeof (struct T_optmgmt_ack)];
912 	optp->level = MIB2_IP6;
913 	optp->name = EXPER_IP_DCE;
914 
915 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
916 		dcb = &ipst->ips_dce_hash_v6[i];
917 		rw_enter(&dcb->dcb_lock, RW_READER);
918 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
919 			dest_cache.DestIpv6Address = dce->dce_v6addr;
920 			dest_cache.DestFlags = dce->dce_flags;
921 			if (dce->dce_flags & DCEF_PMTU)
922 				dest_cache.DestPmtu = dce->dce_pmtu;
923 			else
924 				dest_cache.DestPmtu = 0;
925 			dest_cache.DestIdent = dce->dce_ident;
926 			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
927 				dest_cache.DestIfindex = dce->dce_ifindex;
928 			else
929 				dest_cache.DestIfindex = 0;
930 			dest_cache.DestAge = current_time -
931 			    dce->dce_last_change_time;
932 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
933 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
934 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
935 				    "failed to allocate %u bytes\n",
936 				    (uint_t)sizeof (dest_cache)));
937 			}
938 		}
939 		rw_exit(&dcb->dcb_lock);
940 	}
941 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
942 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
943 	    (int)optp->level, (int)optp->name, (int)optp->len));
944 	qreply(q, mpctl);
945 
946 	return (mp2ctl);
947 }
948 
949 /*
950  * Remove IPv6 DCEs which refer to an ifindex that is going away.
951  * This is not required for correctness, but it avoids netstat -d
952  * showing stale entries that will never be used.
953  */
954 void
955 dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
956 {
957 	uint_t	i;
958 	dcb_t	*dcb;
959 	dce_t	*dce, *nextdce;
960 
961 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
962 		dcb = &ipst->ips_dce_hash_v6[i];
963 		rw_enter(&dcb->dcb_lock, RW_WRITER);
964 
965 		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
966 			nextdce = dce->dce_next;
967 			if (dce->dce_ifindex == ifindex) {
968 				dce_delete_locked(dcb, dce);
969 				dce_refrele(dce);
970 			}
971 		}
972 		rw_exit(&dcb->dcb_lock);
973 	}
974 }
975