xref: /titanic_44/usr/src/uts/common/inet/ip/ip_dce.c (revision 0db3240d392634cfff2f95fb6da34b56b8dc574f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/zone.h>
31 #include <sys/ddi.h>
32 #include <sys/sunddi.h>
33 #include <sys/cmn_err.h>
34 #include <sys/debug.h>
35 #include <sys/atomic.h>
36 #define	_SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 
39 #include <inet/common.h>
40 #include <inet/mi.h>
41 #include <inet/mib2.h>
42 #include <inet/snmpcom.h>
43 
44 #include <netinet/ip6.h>
45 #include <netinet/icmp6.h>
46 
47 #include <inet/ip.h>
48 #include <inet/ip_impl.h>
49 #include <inet/ip6.h>
50 #include <inet/ip6_asp.h>
51 #include <inet/ip_multi.h>
52 #include <inet/ip_if.h>
53 #include <inet/ip_ire.h>
54 #include <inet/ip_ftable.h>
55 #include <inet/ip_rts.h>
56 #include <inet/ip_ndp.h>
57 #include <inet/ipclassifier.h>
58 #include <inet/ip_listutils.h>
59 
60 #include <sys/sunddi.h>
61 
62 /*
63  * Routines for handling destination cache entries.
64  * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
65  * That entry holds both the IP ident value and the dce generation number.
66  *
67  * Any time a DCE is changed significantly (different path MTU, but NOT
68  * different ULP info!), the dce_generation number is increased.
69  * Also, when a new DCE is created, the dce_generation number in the default
70  * DCE is bumped. That allows the dce_t information to be cached efficiently
71  * as long as the entity caching the dce_t also caches the dce_generation,
72  * and compares the cached generation to detect any changes.
73  * Furthermore, when a DCE is deleted, if there are any outstanding references
74  * to the DCE it will be marked as condemned. The condemned mark is
75  * a designated generation number which is never otherwise used, hence
76  * the single comparison with the generation number captures that as well.
77  *
78  * An example of code which caches is as follows:
79  *
80  *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
81  *		The DCE has changed
82  *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
83  *		    &mystruct->my_dce_generation);
84  *		Not needed in practice, since we have the default DCE:
85  *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
86  *			return failure;
87  *	}
88  *
89  * Note that for IPv6 link-local addresses we record the ifindex since the
90  * link-locals are not globally unique.
91  */
92 
/*
 * Hash bucket structure for DCEs
 *
 * Each ip_stack_t has one array of these for IPv4 and one for IPv6
 * (ips_dce_hash_v4 / ips_dce_hash_v6), indexed by a hash of the destination
 * address.
 */
typedef struct dcb_s {
	/* Held as reader by lookups and as writer when the chain is changed */
	krwlock_t	dcb_lock;
	/* Entry counter, adjusted with atomic_add_32() on link/unlink */
	uint32_t	dcb_cnt;
	/* Head of the chain of DCEs (linked via dce_next); NULL when empty */
	dce_t		*dcb_dce;
} dcb_t;
101 
102 static void	dce_delete_locked(dcb_t *, dce_t *);
103 static void	dce_make_condemned(dce_t *);
104 
105 static kmem_cache_t *dce_cache;
106 
107 
/*
 * Fold a uint64_t by XORing it with itself shifted right by 16, 32 and 48
 * bits.  The argument is evaluated several times, so it must not have side
 * effects.
 */
#define	RANDOM_HASH(p)	\
	((p) ^ ((p) >> 16) ^ ((p) >> 32) ^ ((p) >> 48))
110 
111 /*
112  * Reclaim a fraction of dce's in the dcb.
113  * For now we have a higher probability to delete DCEs without DCE_PMTU.
114  */
115 static void
116 dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
117 {
118 	uint_t	fraction_pmtu = fraction*4;
119 	uint_t	hash;
120 	dce_t	*dce, *nextdce;
121 
122 	rw_enter(&dcb->dcb_lock, RW_WRITER);
123 	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
124 		nextdce = dce->dce_next;
125 		/* Clear DCEF_PMTU if the pmtu is too old */
126 		mutex_enter(&dce->dce_lock);
127 		if ((dce->dce_flags & DCEF_PMTU) &&
128 		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
129 		    ipst->ips_ip_pathmtu_interval) {
130 			dce->dce_flags &= ~DCEF_PMTU;
131 			mutex_exit(&dce->dce_lock);
132 			dce_increment_generation(dce);
133 		} else {
134 			mutex_exit(&dce->dce_lock);
135 		}
136 		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
137 		if (dce->dce_flags & DCEF_PMTU) {
138 			if (hash % fraction_pmtu != 0)
139 				continue;
140 		} else {
141 			if (hash % fraction != 0)
142 				continue;
143 		}
144 
145 		IP_STAT(ipst, ip_dce_reclaim_deleted);
146 		dce_delete_locked(dcb, dce);
147 		dce_refrele(dce);
148 	}
149 	rw_exit(&dcb->dcb_lock);
150 }
151 
152 /*
153  * kmem_cache callback to free up memory.
154  *
155  */
156 static void
157 ip_dce_reclaim_stack(ip_stack_t *ipst)
158 {
159 	int	i;
160 
161 	IP_STAT(ipst, ip_dce_reclaim_calls);
162 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
163 		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
164 		    ipst->ips_ip_dce_reclaim_fraction);
165 
166 		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
167 		    ipst->ips_ip_dce_reclaim_fraction);
168 	}
169 
170 	/*
171 	 * Walk all CONNs that can have a reference on an ire, nce or dce.
172 	 * Get them to update any stale references to drop any refholds they
173 	 * have.
174 	 */
175 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
176 }
177 
178 /*
179  * Called by the memory allocator subsystem directly, when the system
180  * is running low on memory.
181  */
182 /* ARGSUSED */
183 void
184 ip_dce_reclaim(void *args)
185 {
186 	netstack_handle_t nh;
187 	netstack_t *ns;
188 	ip_stack_t *ipst;
189 
190 	netstack_next_init(&nh);
191 	while ((ns = netstack_next(&nh)) != NULL) {
192 		/*
193 		 * netstack_next() can return a netstack_t with a NULL
194 		 * netstack_ip at boot time.
195 		 */
196 		if ((ipst = ns->netstack_ip) == NULL) {
197 			netstack_rele(ns);
198 			continue;
199 		}
200 		ip_dce_reclaim_stack(ipst);
201 		netstack_rele(ns);
202 	}
203 	netstack_next_fini(&nh);
204 }
205 
206 void
207 dce_g_init(void)
208 {
209 	dce_cache = kmem_cache_create("dce_cache",
210 	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
211 }
212 
/* Global teardown: destroy the dce_t kmem cache created by dce_g_init(). */
void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}
218 
219 
220 /*
221  * Allocate a default DCE and a hash table for per-IP address DCEs
222  */
223 void
224 dce_stack_init(ip_stack_t *ipst)
225 {
226 	int	i;
227 
228 	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
229 	bzero(ipst->ips_dce_default, sizeof (dce_t));
230 	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
231 	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
232 	ipst->ips_dce_default->dce_last_change_time =
233 	    TICK_TO_SEC(ddi_get_lbolt64());
234 	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
235 	ipst->ips_dce_default->dce_ipst = ipst;
236 
237 	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
238 	ipst->ips_dce_hashsize = 256;
239 	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
240 	    sizeof (dcb_t), KM_SLEEP);
241 	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
242 	    sizeof (dcb_t), KM_SLEEP);
243 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
244 		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
245 		    NULL);
246 		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
247 		    NULL);
248 	}
249 }
250 
251 void
252 dce_stack_destroy(ip_stack_t *ipst)
253 {
254 	int i;
255 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
256 		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
257 		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
258 	}
259 	kmem_free(ipst->ips_dce_hash_v4,
260 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
261 	ipst->ips_dce_hash_v4 = NULL;
262 	kmem_free(ipst->ips_dce_hash_v6,
263 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
264 	ipst->ips_dce_hash_v6 = NULL;
265 	ipst->ips_dce_hashsize = 0;
266 
267 	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
268 	kmem_cache_free(dce_cache, ipst->ips_dce_default);
269 	ipst->ips_dce_default = NULL;
270 }
271 
272 /* When any DCE is good enough */
273 dce_t *
274 dce_get_default(ip_stack_t *ipst)
275 {
276 	dce_t		*dce;
277 
278 	dce = ipst->ips_dce_default;
279 	dce_refhold(dce);
280 	return (dce);
281 }
282 
283 /*
284  * Generic for IPv4 and IPv6.
285  *
286  * Used by callers that need to cache e.g., the datapath
287  * Returns the generation number in the last argument.
288  */
289 dce_t *
290 dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
291 {
292 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
293 		/*
294 		 * If we have a source route we need to look for the final
295 		 * destination in the source route option.
296 		 */
297 		ipaddr_t final_dst;
298 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
299 
300 		final_dst = ip_get_dst(ipha);
301 		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
302 	} else {
303 		uint_t ifindex;
304 		/*
305 		 * If we have a routing header we need to look for the final
306 		 * destination in the routing extension header.
307 		 */
308 		in6_addr_t final_dst;
309 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
310 
311 		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
312 		ifindex = 0;
313 		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
314 			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
315 			    ill_phyint->phyint_ifindex;
316 		}
317 		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
318 		    generationp));
319 	}
320 }
321 
322 /*
323  * Used by callers that need to cache e.g., the datapath
324  * Returns the generation number in the last argument.
325  */
326 dce_t *
327 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
328 {
329 	uint_t		hash;
330 	dcb_t		*dcb;
331 	dce_t		*dce;
332 
333 	/* Set *generationp before dropping the lock(s) that allow additions */
334 	if (generationp != NULL)
335 		*generationp = ipst->ips_dce_default->dce_generation;
336 
337 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
338 	dcb = &ipst->ips_dce_hash_v4[hash];
339 	rw_enter(&dcb->dcb_lock, RW_READER);
340 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
341 		if (dce->dce_v4addr == dst) {
342 			mutex_enter(&dce->dce_lock);
343 			if (!DCE_IS_CONDEMNED(dce)) {
344 				dce_refhold(dce);
345 				if (generationp != NULL)
346 					*generationp = dce->dce_generation;
347 				mutex_exit(&dce->dce_lock);
348 				rw_exit(&dcb->dcb_lock);
349 				return (dce);
350 			}
351 			mutex_exit(&dce->dce_lock);
352 		}
353 	}
354 	rw_exit(&dcb->dcb_lock);
355 	/* Not found */
356 	dce = ipst->ips_dce_default;
357 	dce_refhold(dce);
358 	return (dce);
359 }
360 
361 /*
362  * Used by callers that need to cache e.g., the datapath
363  * Returns the generation number in the last argument.
364  * ifindex should only be set for link-locals
365  */
366 dce_t *
367 dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
368     uint_t *generationp)
369 {
370 	uint_t		hash;
371 	dcb_t		*dcb;
372 	dce_t		*dce;
373 
374 	/* Set *generationp before dropping the lock(s) that allow additions */
375 	if (generationp != NULL)
376 		*generationp = ipst->ips_dce_default->dce_generation;
377 
378 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
379 	dcb = &ipst->ips_dce_hash_v6[hash];
380 	rw_enter(&dcb->dcb_lock, RW_READER);
381 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
382 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
383 		    dce->dce_ifindex == ifindex) {
384 			mutex_enter(&dce->dce_lock);
385 			if (!DCE_IS_CONDEMNED(dce)) {
386 				dce_refhold(dce);
387 				if (generationp != NULL)
388 					*generationp = dce->dce_generation;
389 				mutex_exit(&dce->dce_lock);
390 				rw_exit(&dcb->dcb_lock);
391 				return (dce);
392 			}
393 			mutex_exit(&dce->dce_lock);
394 		}
395 	}
396 	rw_exit(&dcb->dcb_lock);
397 	/* Not found */
398 	dce = ipst->ips_dce_default;
399 	dce_refhold(dce);
400 	return (dce);
401 }
402 
403 /*
404  * Atomically looks for a non-default DCE, and if not found tries to create one.
405  * If there is no memory it returns NULL.
406  * When an entry is created we increase the generation number on
407  * the default DCE so that conn_ip_output will detect there is a new DCE.
408  */
409 dce_t *
410 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
411 {
412 	uint_t		hash;
413 	dcb_t		*dcb;
414 	dce_t		*dce;
415 
416 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
417 	dcb = &ipst->ips_dce_hash_v4[hash];
418 	rw_enter(&dcb->dcb_lock, RW_WRITER);
419 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
420 		if (dce->dce_v4addr == dst) {
421 			mutex_enter(&dce->dce_lock);
422 			if (!DCE_IS_CONDEMNED(dce)) {
423 				dce_refhold(dce);
424 				mutex_exit(&dce->dce_lock);
425 				rw_exit(&dcb->dcb_lock);
426 				return (dce);
427 			}
428 			mutex_exit(&dce->dce_lock);
429 		}
430 	}
431 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
432 	if (dce == NULL) {
433 		rw_exit(&dcb->dcb_lock);
434 		return (NULL);
435 	}
436 	bzero(dce, sizeof (dce_t));
437 	dce->dce_ipst = ipst;	/* No netstack_hold */
438 	dce->dce_v4addr = dst;
439 	dce->dce_generation = DCE_GENERATION_INITIAL;
440 	dce->dce_ipversion = IPV4_VERSION;
441 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
442 	dce_refhold(dce);	/* For the hash list */
443 
444 	/* Link into list */
445 	if (dcb->dcb_dce != NULL)
446 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
447 	dce->dce_next = dcb->dcb_dce;
448 	dce->dce_ptpn = &dcb->dcb_dce;
449 	dcb->dcb_dce = dce;
450 	dce->dce_bucket = dcb;
451 	dce_refhold(dce);	/* For the caller */
452 	rw_exit(&dcb->dcb_lock);
453 
454 	/* Initialize dce_ident to be different than for the last packet */
455 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
456 
457 	dce_increment_generation(ipst->ips_dce_default);
458 	return (dce);
459 }
460 
461 /*
462  * Atomically looks for a non-default DCE, and if not found tries to create one.
463  * If there is no memory it returns NULL.
464  * When an entry is created we increase the generation number on
465  * the default DCE so that conn_ip_output will detect there is a new DCE.
466  * ifindex should only be used with link-local addresses.
467  */
468 dce_t *
469 dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
470 {
471 	uint_t		hash;
472 	dcb_t		*dcb;
473 	dce_t		*dce;
474 
475 	/* We should not create entries for link-locals w/o an ifindex */
476 	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
477 
478 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
479 	dcb = &ipst->ips_dce_hash_v6[hash];
480 	rw_enter(&dcb->dcb_lock, RW_WRITER);
481 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
482 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
483 		    dce->dce_ifindex == ifindex) {
484 			mutex_enter(&dce->dce_lock);
485 			if (!DCE_IS_CONDEMNED(dce)) {
486 				dce_refhold(dce);
487 				mutex_exit(&dce->dce_lock);
488 				rw_exit(&dcb->dcb_lock);
489 				return (dce);
490 			}
491 			mutex_exit(&dce->dce_lock);
492 		}
493 	}
494 
495 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
496 	if (dce == NULL) {
497 		rw_exit(&dcb->dcb_lock);
498 		return (NULL);
499 	}
500 	bzero(dce, sizeof (dce_t));
501 	dce->dce_ipst = ipst;	/* No netstack_hold */
502 	dce->dce_v6addr = *dst;
503 	dce->dce_ifindex = ifindex;
504 	dce->dce_generation = DCE_GENERATION_INITIAL;
505 	dce->dce_ipversion = IPV6_VERSION;
506 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
507 	dce_refhold(dce);	/* For the hash list */
508 
509 	/* Link into list */
510 	if (dcb->dcb_dce != NULL)
511 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
512 	dce->dce_next = dcb->dcb_dce;
513 	dce->dce_ptpn = &dcb->dcb_dce;
514 	dcb->dcb_dce = dce;
515 	dce->dce_bucket = dcb;
516 	atomic_add_32(&dcb->dcb_cnt, 1);
517 	dce_refhold(dce);	/* For the caller */
518 	rw_exit(&dcb->dcb_lock);
519 
520 	/* Initialize dce_ident to be different than for the last packet */
521 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
522 	dce_increment_generation(ipst->ips_dce_default);
523 	return (dce);
524 }
525 
526 /*
527  * Set/update uinfo. Creates a per-destination dce if none exists.
528  *
529  * Note that we do not bump the generation number here.
530  * New connections will find the new uinfo.
531  *
532  * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
533  */
534 static void
535 dce_setuinfo(dce_t *dce, iulp_t *uinfo)
536 {
537 	/*
538 	 * Update the round trip time estimate and/or the max frag size
539 	 * and/or the slow start threshold.
540 	 *
541 	 * We serialize multiple advises using dce_lock.
542 	 */
543 	mutex_enter(&dce->dce_lock);
544 	/* Gard against setting to zero */
545 	if (uinfo->iulp_rtt != 0) {
546 		/*
547 		 * If there is no old cached values, initialize them
548 		 * conservatively.  Set them to be (1.5 * new value).
549 		 */
550 		if (dce->dce_uinfo.iulp_rtt != 0) {
551 			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
552 			    uinfo->iulp_rtt) >> 1;
553 		} else {
554 			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
555 			    (uinfo->iulp_rtt >> 1);
556 		}
557 		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
558 			dce->dce_uinfo.iulp_rtt_sd =
559 			    (dce->dce_uinfo.iulp_rtt_sd +
560 			    uinfo->iulp_rtt_sd) >> 1;
561 		} else {
562 			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
563 			    (uinfo->iulp_rtt_sd >> 1);
564 		}
565 	}
566 	if (uinfo->iulp_mtu != 0) {
567 		if (dce->dce_flags & DCEF_PMTU) {
568 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
569 		} else {
570 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
571 			dce->dce_flags |= DCEF_PMTU;
572 		}
573 		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
574 	}
575 	if (uinfo->iulp_ssthresh != 0) {
576 		if (dce->dce_uinfo.iulp_ssthresh != 0)
577 			dce->dce_uinfo.iulp_ssthresh =
578 			    (uinfo->iulp_ssthresh +
579 			    dce->dce_uinfo.iulp_ssthresh) >> 1;
580 		else
581 			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
582 	}
583 	/* We have uinfo for sure */
584 	dce->dce_flags |= DCEF_UINFO;
585 	mutex_exit(&dce->dce_lock);
586 }
587 
588 
589 int
590 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
591 {
592 	dce_t *dce;
593 
594 	dce = dce_lookup_and_add_v4(dst, ipst);
595 	if (dce == NULL)
596 		return (ENOMEM);
597 
598 	dce_setuinfo(dce, uinfo);
599 	dce_refrele(dce);
600 	return (0);
601 }
602 
603 int
604 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
605     ip_stack_t *ipst)
606 {
607 	dce_t *dce;
608 
609 	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
610 	if (dce == NULL)
611 		return (ENOMEM);
612 
613 	dce_setuinfo(dce, uinfo);
614 	dce_refrele(dce);
615 	return (0);
616 }
617 
618 /* Common routine for IPv4 and IPv6 */
619 int
620 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
621     ip_stack_t *ipst)
622 {
623 	ipaddr_t dst4;
624 
625 	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
626 		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
627 		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
628 	} else {
629 		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
630 	}
631 }
632 
633 static void
634 dce_make_condemned(dce_t *dce)
635 {
636 	ip_stack_t	*ipst = dce->dce_ipst;
637 
638 	mutex_enter(&dce->dce_lock);
639 	ASSERT(!DCE_IS_CONDEMNED(dce));
640 	dce->dce_generation = DCE_GENERATION_CONDEMNED;
641 	mutex_exit(&dce->dce_lock);
642 	/* Count how many condemned dces for kmem_cache callback */
643 	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
644 }
645 
646 /*
647  * Increment the generation avoiding the special condemned value
648  */
649 void
650 dce_increment_generation(dce_t *dce)
651 {
652 	uint_t generation;
653 
654 	mutex_enter(&dce->dce_lock);
655 	if (!DCE_IS_CONDEMNED(dce)) {
656 		generation = dce->dce_generation + 1;
657 		if (generation == DCE_GENERATION_CONDEMNED)
658 			generation = DCE_GENERATION_INITIAL;
659 		ASSERT(generation != DCE_GENERATION_VERIFY);
660 		dce->dce_generation = generation;
661 	}
662 	mutex_exit(&dce->dce_lock);
663 }
664 
665 /*
666  * Increment the generation number on all dces that have a path MTU and
667  * the default DCE. Used when ill_mtu changes.
668  */
669 void
670 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
671 {
672 	int		i;
673 	dcb_t		*dcb;
674 	dce_t		*dce;
675 
676 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
677 		if (isv6)
678 			dcb = &ipst->ips_dce_hash_v6[i];
679 		else
680 			dcb = &ipst->ips_dce_hash_v4[i];
681 		rw_enter(&dcb->dcb_lock, RW_WRITER);
682 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
683 			if (DCE_IS_CONDEMNED(dce))
684 				continue;
685 			dce_increment_generation(dce);
686 		}
687 		rw_exit(&dcb->dcb_lock);
688 	}
689 	dce_increment_generation(ipst->ips_dce_default);
690 }
691 
692 /*
693  * Caller needs to do a dce_refrele since we can't do the
694  * dce_refrele under dcb_lock.
695  */
696 static void
697 dce_delete_locked(dcb_t *dcb, dce_t *dce)
698 {
699 	dce->dce_bucket = NULL;
700 	*dce->dce_ptpn = dce->dce_next;
701 	if (dce->dce_next != NULL)
702 		dce->dce_next->dce_ptpn = dce->dce_ptpn;
703 	dce->dce_ptpn = NULL;
704 	dce->dce_next = NULL;
705 	atomic_add_32(&dcb->dcb_cnt, -1);
706 	dce_make_condemned(dce);
707 }
708 
709 static void
710 dce_inactive(dce_t *dce)
711 {
712 	ip_stack_t	*ipst = dce->dce_ipst;
713 
714 	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
715 	ASSERT(dce->dce_ptpn == NULL);
716 	ASSERT(dce->dce_bucket == NULL);
717 
718 	/* Count how many condemned dces for kmem_cache callback */
719 	if (DCE_IS_CONDEMNED(dce))
720 		atomic_add_32(&ipst->ips_num_dce_condemned, -1);
721 
722 	kmem_cache_free(dce_cache, dce);
723 }
724 
725 void
726 dce_refrele(dce_t *dce)
727 {
728 	ASSERT(dce->dce_refcnt != 0);
729 	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
730 		dce_inactive(dce);
731 }
732 
/* Take one additional reference on `dce' by atomically bumping dce_refcnt. */
void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	/* A zero count after the increment means the counter wrapped */
	ASSERT(dce->dce_refcnt != 0);
}
739 
740 /* No tracing support yet hence the same as the above functions */
741 void
742 dce_refrele_notr(dce_t *dce)
743 {
744 	ASSERT(dce->dce_refcnt != 0);
745 	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
746 		dce_inactive(dce);
747 }
748 
/*
 * Take one additional reference on `dce'.  The body is the same as
 * dce_refhold().
 */
void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	/* A zero count after the increment means the counter wrapped */
	ASSERT(dce->dce_refcnt != 0);
}
755 
756 /* Report both the IPv4 and IPv6 DCEs. */
757 mblk_t *
758 ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
759 {
760 	struct opthdr		*optp;
761 	mblk_t			*mp2ctl;
762 	dest_cache_entry_t	dest_cache;
763 	mblk_t			*mp_tail = NULL;
764 	dce_t			*dce;
765 	dcb_t			*dcb;
766 	int			i;
767 	uint64_t		current_time;
768 
769 	current_time = TICK_TO_SEC(ddi_get_lbolt64());
770 
771 	/*
772 	 * make a copy of the original message
773 	 */
774 	mp2ctl = copymsg(mpctl);
775 
776 	/* First we do IPv4 entries */
777 	optp = (struct opthdr *)&mpctl->b_rptr[
778 	    sizeof (struct T_optmgmt_ack)];
779 	optp->level = MIB2_IP;
780 	optp->name = EXPER_IP_DCE;
781 
782 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
783 		dcb = &ipst->ips_dce_hash_v4[i];
784 		rw_enter(&dcb->dcb_lock, RW_READER);
785 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
786 			dest_cache.DestIpv4Address = dce->dce_v4addr;
787 			dest_cache.DestFlags = dce->dce_flags;
788 			if (dce->dce_flags & DCEF_PMTU)
789 				dest_cache.DestPmtu = dce->dce_pmtu;
790 			else
791 				dest_cache.DestPmtu = 0;
792 			dest_cache.DestIdent = dce->dce_ident;
793 			dest_cache.DestIfindex = 0;
794 			dest_cache.DestAge = current_time -
795 			    dce->dce_last_change_time;
796 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
797 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
798 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
799 				    "failed to allocate %u bytes\n",
800 				    (uint_t)sizeof (dest_cache)));
801 			}
802 		}
803 		rw_exit(&dcb->dcb_lock);
804 	}
805 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
806 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
807 	    (int)optp->level, (int)optp->name, (int)optp->len));
808 	qreply(q, mpctl);
809 
810 	if (mp2ctl == NULL) {
811 		/* Copymsg failed above */
812 		return (NULL);
813 	}
814 
815 	/* Now for IPv6 */
816 	mpctl = mp2ctl;
817 	mp_tail = NULL;
818 	mp2ctl = copymsg(mpctl);
819 	optp = (struct opthdr *)&mpctl->b_rptr[
820 	    sizeof (struct T_optmgmt_ack)];
821 	optp->level = MIB2_IP6;
822 	optp->name = EXPER_IP_DCE;
823 
824 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
825 		dcb = &ipst->ips_dce_hash_v6[i];
826 		rw_enter(&dcb->dcb_lock, RW_READER);
827 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
828 			dest_cache.DestIpv6Address = dce->dce_v6addr;
829 			dest_cache.DestFlags = dce->dce_flags;
830 			if (dce->dce_flags & DCEF_PMTU)
831 				dest_cache.DestPmtu = dce->dce_pmtu;
832 			else
833 				dest_cache.DestPmtu = 0;
834 			dest_cache.DestIdent = dce->dce_ident;
835 			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
836 				dest_cache.DestIfindex = dce->dce_ifindex;
837 			else
838 				dest_cache.DestIfindex = 0;
839 			dest_cache.DestAge = current_time -
840 			    dce->dce_last_change_time;
841 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
842 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
843 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
844 				    "failed to allocate %u bytes\n",
845 				    (uint_t)sizeof (dest_cache)));
846 			}
847 		}
848 		rw_exit(&dcb->dcb_lock);
849 	}
850 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
851 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
852 	    (int)optp->level, (int)optp->name, (int)optp->len));
853 	qreply(q, mpctl);
854 
855 	return (mp2ctl);
856 }
857 
858 /*
859  * Remove IPv6 DCEs which refer to an ifindex that is going away.
860  * This is not required for correctness, but it avoids netstat -d
861  * showing stale stuff that will never be used.
862  */
863 void
864 dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
865 {
866 	uint_t	i;
867 	dcb_t	*dcb;
868 	dce_t	*dce, *nextdce;
869 
870 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
871 		dcb = &ipst->ips_dce_hash_v6[i];
872 		rw_enter(&dcb->dcb_lock, RW_WRITER);
873 
874 		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
875 			nextdce = dce->dce_next;
876 			if (dce->dce_ifindex == ifindex) {
877 				dce_delete_locked(dcb, dce);
878 				dce_refrele(dce);
879 			}
880 		}
881 		rw_exit(&dcb->dcb_lock);
882 	}
883 }
884