xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_dce.c (revision 89b2a9fbeabf42fa54594df0e5927bcc50a07cc9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/zone.h>
31 #include <sys/ddi.h>
32 #include <sys/sunddi.h>
33 #include <sys/cmn_err.h>
34 #include <sys/debug.h>
35 #include <sys/atomic.h>
36 #define	_SUN_TPI_VERSION 2
37 #include <sys/tihdr.h>
38 
39 #include <inet/common.h>
40 #include <inet/mi.h>
41 #include <inet/mib2.h>
42 #include <inet/snmpcom.h>
43 
44 #include <netinet/ip6.h>
45 #include <netinet/icmp6.h>
46 
47 #include <inet/ip.h>
48 #include <inet/ip_impl.h>
49 #include <inet/ip6.h>
50 #include <inet/ip6_asp.h>
51 #include <inet/ip_multi.h>
52 #include <inet/ip_if.h>
53 #include <inet/ip_ire.h>
54 #include <inet/ip_ftable.h>
55 #include <inet/ip_rts.h>
56 #include <inet/ip_ndp.h>
57 #include <inet/ipclassifier.h>
58 #include <inet/ip_listutils.h>
59 
60 #include <sys/sunddi.h>
61 
62 /*
63  * Routines for handling destination cache entries.
64  * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
65  * That entry holds both the IP ident value and the dce generation number.
66  *
67  * Any time a DCE is changed significantly (different path MTU, but NOT
68  * different ULP info!), the dce_generation number is increased.
69  * Also, when a new DCE is created, the dce_generation number in the default
70  * DCE is bumped. That allows the dce_t information to be cached efficiently
71  * as long as the entity caching the dce_t also caches the dce_generation,
72  * and compares the cached generation to detect any changes.
73  * Furthermore, when a DCE is deleted, if there are any outstanding references
74  * to the DCE it will be marked as condemned. The condemned mark is
75  * a designated generation number which is never otherwise used, hence
76  * the single comparison with the generation number captures that as well.
77  *
78  * An example of code which caches is as follows:
79  *
80  *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
81  *		The DCE has changed
82  *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
83  *		    &mystruct->my_dce_generation);
84  *		Not needed in practice, since we have the default DCE:
85  *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
86  *			return failure;
87  *	}
88  *
89  * Note that for IPv6 link-local addresses we record the ifindex since the
90  * link-locals are not globally unique.
91  */
92 
/*
 * Hash bucket structure for DCEs.
 * Each ip_stack_t carries two arrays of these buckets, ips_dce_hash_v4 and
 * ips_dce_hash_v6, each ips_dce_hashsize entries long.
 */
typedef struct dcb_s {
	krwlock_t	dcb_lock;	/* Held while walking/changing dcb_dce */
	uint32_t	dcb_cnt;	/* Entry count, kept with atomic_add_32 */
	dce_t		*dcb_dce;	/* Head of chain linked via dce_next */
} dcb_t;
101 
/* Forward declarations of helpers that are defined further down. */
static void	dce_delete_locked(dcb_t *, dce_t *);
static void	dce_make_condemned(dce_t *);

/* kmem cache all dce_t structures come from; created in dce_g_init(). */
static kmem_cache_t *dce_cache;
106 
107 
/*
 * Operates on a uint64_t: XORs the value with copies of itself shifted
 * right by 16, 32 and 48 bits. The argument is evaluated more than once.
 */
#define	RANDOM_HASH(p) \
	(((p) >> 48) ^ ((p) >> 32) ^ ((p) >> 16) ^ (p))
110 
/*
 * Reclaim a fraction of dce's in the dcb.
 * For now we have a higher probability to delete DCEs without DCE_PMTU.
 *
 * dcb is the bucket to thin out, ipst supplies the path MTU ageing interval
 * and the statistics counters. A DCE without DCEF_PMTU is deleted when a
 * hash of its address is a multiple of fraction; one with DCEF_PMTU only
 * when the hash is a multiple of 4 * fraction.
 *
 * NOTE(review): a fraction of zero would make the modulo operations below
 * divide by zero; presumably the tunable is bounded elsewhere -- confirm.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	uint_t	fraction_pmtu = fraction*4;
	uint_t	hash;
	dce_t	*dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		/* Fetch the successor now; dce may be unlinked below */
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			/*
			 * Drop dce_lock first: dce_increment_generation()
			 * acquires it itself.
			 */
			mutex_exit(&dce->dce_lock);
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		/* The DCE's own address is the input to the selection hash */
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		/* Unlink and condemn, then drop the hash list's reference */
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}
151 
152 /*
153  * kmem_cache callback to free up memory.
154  *
155  */
156 static void
157 ip_dce_reclaim_stack(ip_stack_t *ipst)
158 {
159 	int	i;
160 
161 	IP_STAT(ipst, ip_dce_reclaim_calls);
162 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
163 		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
164 		    ipst->ips_ip_dce_reclaim_fraction);
165 
166 		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
167 		    ipst->ips_ip_dce_reclaim_fraction);
168 	}
169 
170 	/*
171 	 * Walk all CONNs that can have a reference on an ire, nce or dce.
172 	 * Get them to update any stale references to drop any refholds they
173 	 * have.
174 	 */
175 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
176 }
177 
178 /*
179  * Called by the memory allocator subsystem directly, when the system
180  * is running low on memory.
181  */
182 /* ARGSUSED */
183 void
184 ip_dce_reclaim(void *args)
185 {
186 	netstack_handle_t nh;
187 	netstack_t *ns;
188 
189 	netstack_next_init(&nh);
190 	while ((ns = netstack_next(&nh)) != NULL) {
191 		ip_dce_reclaim_stack(ns->netstack_ip);
192 		netstack_rele(ns);
193 	}
194 	netstack_next_fini(&nh);
195 }
196 
197 void
198 dce_g_init(void)
199 {
200 	dce_cache = kmem_cache_create("dce_cache",
201 	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
202 }
203 
204 void
205 dce_g_destroy(void)
206 {
207 	kmem_cache_destroy(dce_cache);
208 }
209 
210 
211 /*
212  * Allocate a default DCE and a hash table for per-IP address DCEs
213  */
214 void
215 dce_stack_init(ip_stack_t *ipst)
216 {
217 	int	i;
218 
219 	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
220 	bzero(ipst->ips_dce_default, sizeof (dce_t));
221 	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
222 	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
223 	ipst->ips_dce_default->dce_last_change_time =
224 	    TICK_TO_SEC(ddi_get_lbolt64());
225 	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
226 	ipst->ips_dce_default->dce_ipst = ipst;
227 
228 	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
229 	ipst->ips_dce_hashsize = 256;
230 	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
231 	    sizeof (dcb_t), KM_SLEEP);
232 	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
233 	    sizeof (dcb_t), KM_SLEEP);
234 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
235 		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
236 		    NULL);
237 		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
238 		    NULL);
239 	}
240 }
241 
242 void
243 dce_stack_destroy(ip_stack_t *ipst)
244 {
245 	int i;
246 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
247 		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
248 		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
249 	}
250 	kmem_free(ipst->ips_dce_hash_v4,
251 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
252 	ipst->ips_dce_hash_v4 = NULL;
253 	kmem_free(ipst->ips_dce_hash_v6,
254 	    ipst->ips_dce_hashsize * sizeof (dcb_t));
255 	ipst->ips_dce_hash_v6 = NULL;
256 	ipst->ips_dce_hashsize = 0;
257 
258 	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
259 	kmem_cache_free(dce_cache, ipst->ips_dce_default);
260 	ipst->ips_dce_default = NULL;
261 }
262 
263 /* When any DCE is good enough */
264 dce_t *
265 dce_get_default(ip_stack_t *ipst)
266 {
267 	dce_t		*dce;
268 
269 	dce = ipst->ips_dce_default;
270 	dce_refhold(dce);
271 	return (dce);
272 }
273 
274 /*
275  * Generic for IPv4 and IPv6.
276  *
277  * Used by callers that need to cache e.g., the datapath
278  * Returns the generation number in the last argument.
279  */
280 dce_t *
281 dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
282 {
283 	if (ixa->ixa_flags & IXAF_IS_IPV4) {
284 		/*
285 		 * If we have a source route we need to look for the final
286 		 * destination in the source route option.
287 		 */
288 		ipaddr_t final_dst;
289 		ipha_t *ipha = (ipha_t *)mp->b_rptr;
290 
291 		final_dst = ip_get_dst(ipha);
292 		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
293 	} else {
294 		uint_t ifindex;
295 		/*
296 		 * If we have a routing header we need to look for the final
297 		 * destination in the routing extension header.
298 		 */
299 		in6_addr_t final_dst;
300 		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
301 
302 		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
303 		ifindex = 0;
304 		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
305 			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
306 			    ill_phyint->phyint_ifindex;
307 		}
308 		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
309 		    generationp));
310 	}
311 }
312 
313 /*
314  * Used by callers that need to cache e.g., the datapath
315  * Returns the generation number in the last argument.
316  */
317 dce_t *
318 dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
319 {
320 	uint_t		hash;
321 	dcb_t		*dcb;
322 	dce_t		*dce;
323 
324 	/* Set *generationp before dropping the lock(s) that allow additions */
325 	if (generationp != NULL)
326 		*generationp = ipst->ips_dce_default->dce_generation;
327 
328 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
329 	dcb = &ipst->ips_dce_hash_v4[hash];
330 	rw_enter(&dcb->dcb_lock, RW_READER);
331 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
332 		if (dce->dce_v4addr == dst) {
333 			mutex_enter(&dce->dce_lock);
334 			if (!DCE_IS_CONDEMNED(dce)) {
335 				dce_refhold(dce);
336 				if (generationp != NULL)
337 					*generationp = dce->dce_generation;
338 				mutex_exit(&dce->dce_lock);
339 				rw_exit(&dcb->dcb_lock);
340 				return (dce);
341 			}
342 			mutex_exit(&dce->dce_lock);
343 		}
344 	}
345 	rw_exit(&dcb->dcb_lock);
346 	/* Not found */
347 	dce = ipst->ips_dce_default;
348 	dce_refhold(dce);
349 	return (dce);
350 }
351 
352 /*
353  * Used by callers that need to cache e.g., the datapath
354  * Returns the generation number in the last argument.
355  * ifindex should only be set for link-locals
356  */
357 dce_t *
358 dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
359     uint_t *generationp)
360 {
361 	uint_t		hash;
362 	dcb_t		*dcb;
363 	dce_t		*dce;
364 
365 	/* Set *generationp before dropping the lock(s) that allow additions */
366 	if (generationp != NULL)
367 		*generationp = ipst->ips_dce_default->dce_generation;
368 
369 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
370 	dcb = &ipst->ips_dce_hash_v6[hash];
371 	rw_enter(&dcb->dcb_lock, RW_READER);
372 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
373 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
374 		    dce->dce_ifindex == ifindex) {
375 			mutex_enter(&dce->dce_lock);
376 			if (!DCE_IS_CONDEMNED(dce)) {
377 				dce_refhold(dce);
378 				if (generationp != NULL)
379 					*generationp = dce->dce_generation;
380 				mutex_exit(&dce->dce_lock);
381 				rw_exit(&dcb->dcb_lock);
382 				return (dce);
383 			}
384 			mutex_exit(&dce->dce_lock);
385 		}
386 	}
387 	rw_exit(&dcb->dcb_lock);
388 	/* Not found */
389 	dce = ipst->ips_dce_default;
390 	dce_refhold(dce);
391 	return (dce);
392 }
393 
394 /*
395  * Atomically looks for a non-default DCE, and if not found tries to create one.
396  * If there is no memory it returns NULL.
397  * When an entry is created we increase the generation number on
398  * the default DCE so that conn_ip_output will detect there is a new DCE.
399  */
400 dce_t *
401 dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
402 {
403 	uint_t		hash;
404 	dcb_t		*dcb;
405 	dce_t		*dce;
406 
407 	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
408 	dcb = &ipst->ips_dce_hash_v4[hash];
409 	rw_enter(&dcb->dcb_lock, RW_WRITER);
410 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
411 		if (dce->dce_v4addr == dst) {
412 			mutex_enter(&dce->dce_lock);
413 			if (!DCE_IS_CONDEMNED(dce)) {
414 				dce_refhold(dce);
415 				mutex_exit(&dce->dce_lock);
416 				rw_exit(&dcb->dcb_lock);
417 				return (dce);
418 			}
419 			mutex_exit(&dce->dce_lock);
420 		}
421 	}
422 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
423 	if (dce == NULL) {
424 		rw_exit(&dcb->dcb_lock);
425 		return (NULL);
426 	}
427 	bzero(dce, sizeof (dce_t));
428 	dce->dce_ipst = ipst;	/* No netstack_hold */
429 	dce->dce_v4addr = dst;
430 	dce->dce_generation = DCE_GENERATION_INITIAL;
431 	dce->dce_ipversion = IPV4_VERSION;
432 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
433 	dce_refhold(dce);	/* For the hash list */
434 
435 	/* Link into list */
436 	if (dcb->dcb_dce != NULL)
437 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
438 	dce->dce_next = dcb->dcb_dce;
439 	dce->dce_ptpn = &dcb->dcb_dce;
440 	dcb->dcb_dce = dce;
441 	dce->dce_bucket = dcb;
442 	dce_refhold(dce);	/* For the caller */
443 	rw_exit(&dcb->dcb_lock);
444 
445 	/* Initialize dce_ident to be different than for the last packet */
446 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
447 
448 	dce_increment_generation(ipst->ips_dce_default);
449 	return (dce);
450 }
451 
452 /*
453  * Atomically looks for a non-default DCE, and if not found tries to create one.
454  * If there is no memory it returns NULL.
455  * When an entry is created we increase the generation number on
456  * the default DCE so that conn_ip_output will detect there is a new DCE.
457  * ifindex should only be used with link-local addresses.
458  */
459 dce_t *
460 dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
461 {
462 	uint_t		hash;
463 	dcb_t		*dcb;
464 	dce_t		*dce;
465 
466 	/* We should not create entries for link-locals w/o an ifindex */
467 	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
468 
469 	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
470 	dcb = &ipst->ips_dce_hash_v6[hash];
471 	rw_enter(&dcb->dcb_lock, RW_WRITER);
472 	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
473 		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
474 		    dce->dce_ifindex == ifindex) {
475 			mutex_enter(&dce->dce_lock);
476 			if (!DCE_IS_CONDEMNED(dce)) {
477 				dce_refhold(dce);
478 				mutex_exit(&dce->dce_lock);
479 				rw_exit(&dcb->dcb_lock);
480 				return (dce);
481 			}
482 			mutex_exit(&dce->dce_lock);
483 		}
484 	}
485 
486 	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
487 	if (dce == NULL) {
488 		rw_exit(&dcb->dcb_lock);
489 		return (NULL);
490 	}
491 	bzero(dce, sizeof (dce_t));
492 	dce->dce_ipst = ipst;	/* No netstack_hold */
493 	dce->dce_v6addr = *dst;
494 	dce->dce_ifindex = ifindex;
495 	dce->dce_generation = DCE_GENERATION_INITIAL;
496 	dce->dce_ipversion = IPV6_VERSION;
497 	dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
498 	dce_refhold(dce);	/* For the hash list */
499 
500 	/* Link into list */
501 	if (dcb->dcb_dce != NULL)
502 		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
503 	dce->dce_next = dcb->dcb_dce;
504 	dce->dce_ptpn = &dcb->dcb_dce;
505 	dcb->dcb_dce = dce;
506 	dce->dce_bucket = dcb;
507 	atomic_add_32(&dcb->dcb_cnt, 1);
508 	dce_refhold(dce);	/* For the caller */
509 	rw_exit(&dcb->dcb_lock);
510 
511 	/* Initialize dce_ident to be different than for the last packet */
512 	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
513 	dce_increment_generation(ipst->ips_dce_default);
514 	return (dce);
515 }
516 
517 /*
518  * Set/update uinfo. Creates a per-destination dce if none exists.
519  *
520  * Note that we do not bump the generation number here.
521  * New connections will find the new uinfo.
522  *
523  * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
524  */
525 static void
526 dce_setuinfo(dce_t *dce, iulp_t *uinfo)
527 {
528 	/*
529 	 * Update the round trip time estimate and/or the max frag size
530 	 * and/or the slow start threshold.
531 	 *
532 	 * We serialize multiple advises using dce_lock.
533 	 */
534 	mutex_enter(&dce->dce_lock);
535 	/* Gard against setting to zero */
536 	if (uinfo->iulp_rtt != 0) {
537 		/*
538 		 * If there is no old cached values, initialize them
539 		 * conservatively.  Set them to be (1.5 * new value).
540 		 */
541 		if (dce->dce_uinfo.iulp_rtt != 0) {
542 			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
543 			    uinfo->iulp_rtt) >> 1;
544 		} else {
545 			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
546 			    (uinfo->iulp_rtt >> 1);
547 		}
548 		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
549 			dce->dce_uinfo.iulp_rtt_sd =
550 			    (dce->dce_uinfo.iulp_rtt_sd +
551 			    uinfo->iulp_rtt_sd) >> 1;
552 		} else {
553 			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
554 			    (uinfo->iulp_rtt_sd >> 1);
555 		}
556 	}
557 	if (uinfo->iulp_mtu != 0) {
558 		if (dce->dce_flags & DCEF_PMTU) {
559 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
560 		} else {
561 			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
562 			dce->dce_flags |= DCEF_PMTU;
563 		}
564 		dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
565 	}
566 	if (uinfo->iulp_ssthresh != 0) {
567 		if (dce->dce_uinfo.iulp_ssthresh != 0)
568 			dce->dce_uinfo.iulp_ssthresh =
569 			    (uinfo->iulp_ssthresh +
570 			    dce->dce_uinfo.iulp_ssthresh) >> 1;
571 		else
572 			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
573 	}
574 	/* We have uinfo for sure */
575 	dce->dce_flags |= DCEF_UINFO;
576 	mutex_exit(&dce->dce_lock);
577 }
578 
579 
580 int
581 dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
582 {
583 	dce_t *dce;
584 
585 	dce = dce_lookup_and_add_v4(dst, ipst);
586 	if (dce == NULL)
587 		return (ENOMEM);
588 
589 	dce_setuinfo(dce, uinfo);
590 	dce_refrele(dce);
591 	return (0);
592 }
593 
594 int
595 dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
596     ip_stack_t *ipst)
597 {
598 	dce_t *dce;
599 
600 	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
601 	if (dce == NULL)
602 		return (ENOMEM);
603 
604 	dce_setuinfo(dce, uinfo);
605 	dce_refrele(dce);
606 	return (0);
607 }
608 
609 /* Common routine for IPv4 and IPv6 */
610 int
611 dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
612     ip_stack_t *ipst)
613 {
614 	ipaddr_t dst4;
615 
616 	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
617 		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
618 		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
619 	} else {
620 		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
621 	}
622 }
623 
624 static void
625 dce_make_condemned(dce_t *dce)
626 {
627 	ip_stack_t	*ipst = dce->dce_ipst;
628 
629 	mutex_enter(&dce->dce_lock);
630 	ASSERT(!DCE_IS_CONDEMNED(dce));
631 	dce->dce_generation = DCE_GENERATION_CONDEMNED;
632 	mutex_exit(&dce->dce_lock);
633 	/* Count how many condemned dces for kmem_cache callback */
634 	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
635 }
636 
637 /*
638  * Increment the generation avoiding the special condemned value
639  */
640 void
641 dce_increment_generation(dce_t *dce)
642 {
643 	uint_t generation;
644 
645 	mutex_enter(&dce->dce_lock);
646 	if (!DCE_IS_CONDEMNED(dce)) {
647 		generation = dce->dce_generation + 1;
648 		if (generation == DCE_GENERATION_CONDEMNED)
649 			generation = DCE_GENERATION_INITIAL;
650 		ASSERT(generation != DCE_GENERATION_VERIFY);
651 		dce->dce_generation = generation;
652 	}
653 	mutex_exit(&dce->dce_lock);
654 }
655 
656 /*
657  * Increment the generation number on all dces that have a path MTU and
658  * the default DCE. Used when ill_mtu changes.
659  */
660 void
661 dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
662 {
663 	int		i;
664 	dcb_t		*dcb;
665 	dce_t		*dce;
666 
667 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
668 		if (isv6)
669 			dcb = &ipst->ips_dce_hash_v6[i];
670 		else
671 			dcb = &ipst->ips_dce_hash_v4[i];
672 		rw_enter(&dcb->dcb_lock, RW_WRITER);
673 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
674 			if (DCE_IS_CONDEMNED(dce))
675 				continue;
676 			dce_increment_generation(dce);
677 		}
678 		rw_exit(&dcb->dcb_lock);
679 	}
680 	dce_increment_generation(ipst->ips_dce_default);
681 }
682 
/*
 * Unlink dce from the chain of bucket dcb, decrement the bucket's entry
 * count and mark the DCE condemned. The DCE is not freed here.
 *
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 * NOTE(review): the name suggests dcb_lock is held by the caller; the
 * callers in this file hold it as writer -- and they also issue their
 * dce_refrele before dropping it, so confirm the statement above.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	dce->dce_bucket = NULL;
	/* Make whatever pointed at dce (head or predecessor) skip it */
	*dce->dce_ptpn = dce->dce_next;
	/* Let the successor, if any, inherit our back pointer */
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	/* dce_inactive() asserts that these were cleared */
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}
699 
700 static void
701 dce_inactive(dce_t *dce)
702 {
703 	ip_stack_t	*ipst = dce->dce_ipst;
704 
705 	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
706 	ASSERT(dce->dce_ptpn == NULL);
707 	ASSERT(dce->dce_bucket == NULL);
708 
709 	/* Count how many condemned dces for kmem_cache callback */
710 	if (DCE_IS_CONDEMNED(dce))
711 		atomic_add_32(&ipst->ips_num_dce_condemned, -1);
712 
713 	kmem_cache_free(dce_cache, dce);
714 }
715 
716 void
717 dce_refrele(dce_t *dce)
718 {
719 	ASSERT(dce->dce_refcnt != 0);
720 	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
721 		dce_inactive(dce);
722 }
723 
724 void
725 dce_refhold(dce_t *dce)
726 {
727 	atomic_add_32(&dce->dce_refcnt, 1);
728 	ASSERT(dce->dce_refcnt != 0);
729 }
730 
731 /* No tracing support yet hence the same as the above functions */
732 void
733 dce_refrele_notr(dce_t *dce)
734 {
735 	ASSERT(dce->dce_refcnt != 0);
736 	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
737 		dce_inactive(dce);
738 }
739 
740 void
741 dce_refhold_notr(dce_t *dce)
742 {
743 	atomic_add_32(&dce->dce_refcnt, 1);
744 	ASSERT(dce->dce_refcnt != 0);
745 }
746 
747 /* Report both the IPv4 and IPv6 DCEs. */
748 mblk_t *
749 ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
750 {
751 	struct opthdr		*optp;
752 	mblk_t			*mp2ctl;
753 	dest_cache_entry_t	dest_cache;
754 	mblk_t			*mp_tail = NULL;
755 	dce_t			*dce;
756 	dcb_t			*dcb;
757 	int			i;
758 	uint64_t		current_time;
759 
760 	current_time = TICK_TO_SEC(ddi_get_lbolt64());
761 
762 	/*
763 	 * make a copy of the original message
764 	 */
765 	mp2ctl = copymsg(mpctl);
766 
767 	/* First we do IPv4 entries */
768 	optp = (struct opthdr *)&mpctl->b_rptr[
769 	    sizeof (struct T_optmgmt_ack)];
770 	optp->level = MIB2_IP;
771 	optp->name = EXPER_IP_DCE;
772 
773 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
774 		dcb = &ipst->ips_dce_hash_v4[i];
775 		rw_enter(&dcb->dcb_lock, RW_READER);
776 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
777 			dest_cache.DestIpv4Address = dce->dce_v4addr;
778 			dest_cache.DestFlags = dce->dce_flags;
779 			if (dce->dce_flags & DCEF_PMTU)
780 				dest_cache.DestPmtu = dce->dce_pmtu;
781 			else
782 				dest_cache.DestPmtu = 0;
783 			dest_cache.DestIdent = dce->dce_ident;
784 			dest_cache.DestIfindex = 0;
785 			dest_cache.DestAge = current_time -
786 			    dce->dce_last_change_time;
787 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
788 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
789 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
790 				    "failed to allocate %u bytes\n",
791 				    (uint_t)sizeof (dest_cache)));
792 			}
793 		}
794 		rw_exit(&dcb->dcb_lock);
795 	}
796 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
797 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
798 	    (int)optp->level, (int)optp->name, (int)optp->len));
799 	qreply(q, mpctl);
800 
801 	if (mp2ctl == NULL) {
802 		/* Copymsg failed above */
803 		return (NULL);
804 	}
805 
806 	/* Now for IPv6 */
807 	mpctl = mp2ctl;
808 	mp_tail = NULL;
809 	mp2ctl = copymsg(mpctl);
810 	optp = (struct opthdr *)&mpctl->b_rptr[
811 	    sizeof (struct T_optmgmt_ack)];
812 	optp->level = MIB2_IP6;
813 	optp->name = EXPER_IP_DCE;
814 
815 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
816 		dcb = &ipst->ips_dce_hash_v6[i];
817 		rw_enter(&dcb->dcb_lock, RW_READER);
818 		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
819 			dest_cache.DestIpv6Address = dce->dce_v6addr;
820 			dest_cache.DestFlags = dce->dce_flags;
821 			if (dce->dce_flags & DCEF_PMTU)
822 				dest_cache.DestPmtu = dce->dce_pmtu;
823 			else
824 				dest_cache.DestPmtu = 0;
825 			dest_cache.DestIdent = dce->dce_ident;
826 			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
827 				dest_cache.DestIfindex = dce->dce_ifindex;
828 			else
829 				dest_cache.DestIfindex = 0;
830 			dest_cache.DestAge = current_time -
831 			    dce->dce_last_change_time;
832 			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
833 			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
834 				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
835 				    "failed to allocate %u bytes\n",
836 				    (uint_t)sizeof (dest_cache)));
837 			}
838 		}
839 		rw_exit(&dcb->dcb_lock);
840 	}
841 	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
842 	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
843 	    (int)optp->level, (int)optp->name, (int)optp->len));
844 	qreply(q, mpctl);
845 
846 	return (mp2ctl);
847 }
848 
849 /*
850  * Remove IPv6 DCEs which refer to an ifindex that is going away.
851  * This is not required for correctness, but it avoids netstat -d
852  * showing stale stuff that will never be used.
853  */
854 void
855 dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
856 {
857 	uint_t	i;
858 	dcb_t	*dcb;
859 	dce_t	*dce, *nextdce;
860 
861 	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
862 		dcb = &ipst->ips_dce_hash_v6[i];
863 		rw_enter(&dcb->dcb_lock, RW_WRITER);
864 
865 		for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
866 			nextdce = dce->dce_next;
867 			if (dce->dce_ifindex == ifindex) {
868 				dce_delete_locked(dcb, dce);
869 				dce_refrele(dce);
870 			}
871 		}
872 		rw_exit(&dcb->dcb_lock);
873 	}
874 }
875