xref: /titanic_51/usr/src/uts/common/inet/ip/ip_dce.c (revision 0c240c64cf90f44c2fdf3439010f6e8b33d85e7d)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT entry for each ip_stack_t, created at
 * init time. That entry holds both the IP ident value and the dce
 * generation number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * a single comparison with the generation number captures that case as well.
 *
 * An example of code which caches is as follows:
 *
 *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *		The DCE has changed
 *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *		    &mystruct->my_dce_generation);
 *		Not needed in practice, since we have the default DCE:
 *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *			return failure;
 *	}
 *
 * Note that for IPv6 link-local addresses we record the ifindex since
 * link-locals are not globally unique.
 */
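
/*
 * A minimal sketch of the full pattern a datapath consumer might use
 * (illustrative only; mystruct is hypothetical, as in the example above):
 *
 *	uint_t gen;
 *	dce_t *dce = dce_lookup_v4(dst, ipst, &gen);
 *
 *	consult dce->dce_pmtu if (dce->dce_flags & DCEF_PMTU) is set
 *	mystruct->my_dce = dce;			cache the held reference
 *	mystruct->my_dce_generation = gen;
 *	...
 *	dce_refrele(dce) once the cached pointer is dropped
 */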

/*
 * Hash bucket structure for DCEs; dcb_lock protects the bucket's list
 * and dcb_cnt is the number of entries on it.
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;
	uint32_t	dcb_cnt;
	dce_t		*dcb_dce;
} dcb_t;

static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

static kmem_cache_t *dce_cache;

/* Operates on a uint64_t; XORs all four 16-bit chunks into the low bits */
#define	RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Reclaim a fraction of the DCEs in a bucket.
 * DCEs with learned path MTU state (DCEF_PMTU) are four times less likely
 * to be deleted: with fraction = 8, for example, a non-PMTU DCE is deleted
 * with probability 1/8 and a PMTU DCE with probability 1/32.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction * 4;
	uint_t	hash;
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * Per-netstack part of the kmem_cache reclaim callback: reclaim a fraction
 * of the DCEs in each hash bucket and have conns drop any stale references
 * they hold.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int	i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_dce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		ip_dce_reclaim_stack(ns->netstack_ip);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

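/*
 * Create the global kmem cache for DCEs; ip_dce_reclaim above is registered
 * as the cache's reclaim callback, so DCEs are trimmed when the system runs
 * low on memory.
 */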
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}

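/* Undo dce_g_init: destroy the global DCE kmem cache */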
void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}

/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int	i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time = TICK_TO_SEC(lbolt64);
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we use the IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

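/* Free the per-stack DCE state created by dce_stack_init */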
void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t		*dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache, e.g., in the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}

/*
 * Used by callers that need to cache, e.g., in the datapath.
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Used by callers that need to cache, e.g., in the datapath.
 * Returns the generation number in the last argument.
 * ifindex should only be set for link-locals.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one. If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v4[hash];
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (dce->dce_v4addr == dst) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v4addr = dst;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV4_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	/* Balances the decrement in dce_delete_locked; matches v6 below */
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create
 * one. If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
 * ifindex should only be used with link-local addresses.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t		hash;
	dcb_t		*dcb;
	dce_t		*dce;

	/* We should not create entries for link-locals without an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/* Initialize dce_ident to be different from that of the last packet */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (by TCP and SCTP, via iulp_t) is to set rtt and
 * rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		/*
		 * If there are no old cached values, initialize them
		 * conservatively to (1.5 * new value); otherwise average
		 * the old and new values.
		 */
		if (dce->dce_uinfo.iulp_rtt != 0) {
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}

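/*
 * Update the cached uinfo for an IPv4 destination, creating the
 * per-destination DCE if it does not already exist.
 */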
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

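/*
 * Same as dce_update_uinfo_v4, but for an IPv6 destination; ifindex
 * should only be set for link-locals.
 */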
int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}

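/*
 * Mark a DCE as condemned by setting its generation to the reserved
 * DCE_GENERATION_CONDEMNED value; the entry is freed once the last
 * reference is dropped.
 */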
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Keep count of condemned DCEs for the kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation, avoiding the special condemned value.
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}

/*
 * Increment the generation number on all dces of the given IP version,
 * as well as on the default DCE. Used when ill_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int		i;
	dcb_t		*dcb;
	dce_t		*dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Unlink a DCE from its hash bucket and mark it condemned.
 * The caller needs to do a dce_refrele, since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}

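/* Called when the last reference goes away; frees the DCE */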
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t	*ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Keep count of condemned DCEs for the kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}

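/* Reference counting; the last dce_refrele frees the DCE via dce_inactive */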
void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet, hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* Report both the IPv4 and IPv6 DCEs. */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr		*optp;
	mblk_t			*mp2ctl;
	dest_cache_entry_t	dest_cache;
	mblk_t			*mp_tail = NULL;
	dce_t			*dce;
	dcb_t			*dcb;
	int			i;
	uint64_t		current_time;

	current_time = TICK_TO_SEC(lbolt64);

	/*
	 * Make a copy of the original message; the copy is used to
	 * report the IPv6 entries separately below.
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do the IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for the IPv6 entries */
	mpctl = mp2ctl;
	mp_tail = NULL;
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale entries that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
	uint_t	i;
	dcb_t	*dcb;
	dce_t	*dce, *nextdce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);

		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
			nextdce = dce->dce_next;
			if (dce->dce_ifindex == ifindex) {
				dce_delete_locked(dcb, dce);
				dce_refrele(dce);
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
}
874