xref: /titanic_51/usr/src/uts/common/inet/ip/ip_ndp.c (revision d326b23bcecd3c0d693a54003343ec3de73e58d0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/sysmacros.h>
32 #include <sys/errno.h>
33 #include <sys/dlpi.h>
34 #include <sys/socket.h>
35 #include <sys/ddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 
42 #include <net/if.h>
43 #include <net/if_dl.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <netinet/ip6.h>
47 #include <netinet/icmp6.h>
48 
49 #include <inet/common.h>
50 #include <inet/mi.h>
51 #include <inet/mib2.h>
52 #include <inet/nd.h>
53 #include <inet/ip.h>
54 #include <inet/ip_if.h>
55 #include <inet/ip_ire.h>
56 #include <inet/ip_rts.h>
57 #include <inet/ip6.h>
58 #include <inet/ip_ndp.h>
59 #include <inet/ipsec_impl.h>
60 #include <inet/ipsec_info.h>
61 
62 /*
63  * Function names with nce_ prefix are static while function
64  * names with ndp_ prefix are used by rest of the IP.
65  */
66 
static	boolean_t nce_cmp_ll_addr(nce_t *nce, char *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_fastpath(nce_t *nce);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *ill, const in6_addr_t *addr);
static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
    ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static	void	lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf);
extern void	th_trace_rrecord(th_trace_t *);

#ifdef NCE_DEBUG
void	nce_trace_inactive(nce_t *);
#endif

/*
 * NDP Cache Entry Hash Table.
 *
 * Entries without NCE_F_MAPPING are chained off nce_hash_tbl, indexed by
 * a hash of their IPv6 address (see NCE_HASH_PTR).  Entries created with
 * NCE_F_MAPPING are kept on the separate nce_mask_entries list instead.
 */
#define	NCE_TABLE_SIZE	256
static	nce_t	*nce_hash_tbl[NCE_TABLE_SIZE];
static	nce_t	*nce_mask_entries;	/* mapping entries; mask not all ones */
static	int	ndp_g_walker = 0;	/* # of active threads */
					/* walking the nce lists */
/*
 * ndp_g_walker_cleanup is set to B_TRUE when an unlink had to be deferred
 * because a walker was active; the last walker then does the cleanup.
 */
static	boolean_t	ndp_g_walker_cleanup = B_FALSE;

/* Address of the hash bucket head for the given IPv6 address. */
#define	NCE_HASH_PTR(addr) \
	(&(nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
105 
106 /*
107  * NDP Cache Entry creation routine.
108  * Mapped entries will never do NUD .
109  * This routine must always be called with ndp_g_lock held.
110  * Prior to return, nce_refcnt is incremented.
111  */
112 int
113 ndp_add(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
114     const in6_addr_t *mask, const in6_addr_t *extract_mask,
115     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
116     nce_t **newnce)
117 {
118 static	nce_t		nce_nil;
119 	nce_t		*nce;
120 	mblk_t		*mp;
121 	mblk_t		*template;
122 	nce_t		**ncep;
123 	boolean_t	dropped = B_FALSE;
124 
125 	ASSERT(MUTEX_HELD(&ndp_g_lock));
126 	ASSERT(ill != NULL);
127 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
128 		ip0dbg(("ndp_add: no addr\n"));
129 		return (EINVAL);
130 	}
131 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
132 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
133 		return (EINVAL);
134 	}
135 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
136 	    (flags & NCE_F_MAPPING)) {
137 		ip0dbg(("ndp_add: extract mask zero for mapping"));
138 		return (EINVAL);
139 	}
140 	/*
141 	 * Allocate the mblk to hold the nce.
142 	 *
143 	 * XXX This can come out of a separate cache - nce_cache.
144 	 * We don't need the mp anymore as there are no more
145 	 * "qwriter"s
146 	 */
147 	mp = allocb(sizeof (nce_t), BPRI_MED);
148 	if (mp == NULL)
149 		return (ENOMEM);
150 
151 	nce = (nce_t *)mp->b_rptr;
152 	mp->b_wptr = (uchar_t *)&nce[1];
153 	*nce = nce_nil;
154 
155 	/*
156 	 * This one holds link layer address
157 	 */
158 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
159 		template = nce_udreq_alloc(ill);
160 	} else {
161 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
162 		ASSERT((ill->ill_resolver_mp != NULL));
163 		template = copyb(ill->ill_resolver_mp);
164 	}
165 	if (template == NULL) {
166 		freeb(mp);
167 		return (ENOMEM);
168 	}
169 	nce->nce_ill = ill;
170 	nce->nce_flags = flags;
171 	nce->nce_state = state;
172 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
173 	nce->nce_rcnt = ill->ill_xmit_count;
174 	nce->nce_addr = *addr;
175 	nce->nce_mask = *mask;
176 	nce->nce_extract_mask = *extract_mask;
177 	nce->nce_ll_extract_start = hw_extract_start;
178 	nce->nce_fp_mp = NULL;
179 	nce->nce_res_mp = template;
180 	if (state == ND_REACHABLE)
181 		nce->nce_last = TICK_TO_MSEC(lbolt64);
182 	else
183 		nce->nce_last = 0;
184 	nce->nce_qd_mp = NULL;
185 	nce->nce_mp = mp;
186 	if (hw_addr != NULL)
187 		nce_set_ll(nce, hw_addr);
188 	/* This one is for nce getting created */
189 	nce->nce_refcnt = 1;
190 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
191 	if (nce->nce_flags & NCE_F_MAPPING) {
192 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
193 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
194 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
195 		ncep = &nce_mask_entries;
196 	} else {
197 		ncep = ((nce_t **)NCE_HASH_PTR(*addr));
198 	}
199 
200 #ifdef NCE_DEBUG
201 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
202 #endif
203 	/*
204 	 * Atomically ensure that the ill is not CONDEMNED, before
205 	 * adding the NCE.
206 	 */
207 	mutex_enter(&ill->ill_lock);
208 	if (ill->ill_state_flags & ILL_CONDEMNED) {
209 		mutex_exit(&ill->ill_lock);
210 		freeb(mp);
211 		return (EINVAL);
212 	}
213 	if ((nce->nce_next = *ncep) != NULL)
214 		nce->nce_next->nce_ptpn = &nce->nce_next;
215 	*ncep = nce;
216 	nce->nce_ptpn = ncep;
217 	*newnce = nce;
218 	/* This one is for nce being used by an active thread */
219 	NCE_REFHOLD(*newnce);
220 
221 	/* Bump up the number of nce's referencing this ill */
222 	ill->ill_nce_cnt++;
223 	mutex_exit(&ill->ill_lock);
224 
225 	/*
226 	 * Before we insert the nce, honor the UNSOL_ADV flag.
227 	 * We cannot hold the ndp_g_lock and call nce_xmit
228 	 * which does a putnext.
229 	 */
230 	if (flags & NCE_F_UNSOL_ADV) {
231 		flags |= NDP_ORIDE;
232 		/*
233 		 * We account for the transmit below by assigning one
234 		 * less than the ndd variable. Subsequent decrements
235 		 * are done in ndp_timer.
236 		 */
237 		mutex_enter(&nce->nce_lock);
238 		mutex_exit(&ndp_g_lock);
239 		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
240 		mutex_exit(&nce->nce_lock);
241 		dropped = nce_xmit(ill,
242 		    ND_NEIGHBOR_ADVERT,
243 		    ill,	/* ill to be used for extracting ill_nd_lla */
244 		    B_TRUE,	/* use ill_nd_lla */
245 		    addr,	/* Source and target of the advertisement pkt */
246 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
247 		    flags);
248 		mutex_enter(&nce->nce_lock);
249 		if (dropped)
250 			nce->nce_unsolicit_count++;
251 		if (nce->nce_unsolicit_count != 0) {
252 			nce->nce_timeout_id = timeout(ndp_timer, nce,
253 			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
254 		}
255 		mutex_exit(&nce->nce_lock);
256 		mutex_enter(&ndp_g_lock);
257 	}
258 	/*
259 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
260 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
261 	 * We call nce_fastpath from nce_update if the link layer address of
262 	 * the peer changes from nce_update
263 	 */
264 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
265 		nce_fastpath(nce);
266 	return (0);
267 }
268 
269 int
270 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
271     const in6_addr_t *mask, const in6_addr_t *extract_mask,
272     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
273     nce_t **newnce)
274 {
275 	int	err = 0;
276 	nce_t	*nce;
277 
278 	mutex_enter(&ndp_g_lock);
279 	nce = nce_lookup_addr(ill, addr);
280 	if (nce == NULL) {
281 		err = ndp_add(ill,
282 		    hw_addr,
283 		    addr,
284 		    mask,
285 		    extract_mask,
286 		    hw_extract_start,
287 		    flags,
288 		    state,
289 		    newnce);
290 	} else {
291 		*newnce = nce;
292 		err = EEXIST;
293 	}
294 	mutex_exit(&ndp_g_lock);
295 	return (err);
296 }
297 
298 /*
299  * Remove all the CONDEMNED nces from the appropriate hash table.
300  * We create a private list of NCEs, these may have ires pointing
301  * to them, so the list will be passed through to clean up dependent
302  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
303  */
304 static void
305 nce_remove(nce_t *nce, nce_t **free_nce_list)
306 {
307 	nce_t *nce1;
308 	nce_t **ptpn;
309 
310 	ASSERT(MUTEX_HELD(&ndp_g_lock));
311 	ASSERT(ndp_g_walker == 0);
312 	for (; nce; nce = nce1) {
313 		nce1 = nce->nce_next;
314 		mutex_enter(&nce->nce_lock);
315 		if (nce->nce_flags & NCE_F_CONDEMNED) {
316 			ptpn = nce->nce_ptpn;
317 			nce1 = nce->nce_next;
318 			if (nce1 != NULL)
319 				nce1->nce_ptpn = ptpn;
320 			*ptpn = nce1;
321 			nce->nce_ptpn = NULL;
322 			nce->nce_next = NULL;
323 			nce->nce_next = *free_nce_list;
324 			*free_nce_list = nce;
325 		}
326 		mutex_exit(&nce->nce_lock);
327 	}
328 }
329 
330 /*
331  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
332  *    will return this NCE. Also no new IREs will be created that
333  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
334  *    be started (See NDP_RESTART_TIMER).
335  * 2. Cancel any currently running timeouts.
336  * 3. If there is an ndp walker, return. The walker will do the cleanup.
337  *    This ensures that walkers see a consistent list of NCEs while walking.
338  * 4. Otherwise remove the NCE from the list of NCEs
339  * 5. Delete all IREs pointing to this NCE.
340  */
341 void
342 ndp_delete(nce_t *nce)
343 {
344 	nce_t	**ptpn;
345 	nce_t	*nce1;
346 
347 	/* Serialize deletes */
348 	mutex_enter(&nce->nce_lock);
349 	if (nce->nce_flags & NCE_F_CONDEMNED) {
350 		/* Some other thread is doing the delete */
351 		mutex_exit(&nce->nce_lock);
352 		return;
353 	}
354 	/*
355 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
356 	 * refcnt has to be >= 2
357 	 */
358 	ASSERT(nce->nce_refcnt >= 2);
359 	nce->nce_flags |= NCE_F_CONDEMNED;
360 	mutex_exit(&nce->nce_lock);
361 
362 	nce_fastpath_list_delete(nce);
363 
364 	/*
365 	 * Cancel any running timer. Timeout can't be restarted
366 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
367 	 * Passing invalid timeout id is fine.
368 	 */
369 	if (nce->nce_timeout_id != 0) {
370 		(void) untimeout(nce->nce_timeout_id);
371 		nce->nce_timeout_id = 0;
372 	}
373 
374 	mutex_enter(&ndp_g_lock);
375 	if (nce->nce_ptpn == NULL) {
376 		/*
377 		 * The last ndp walker has already removed this nce from
378 		 * the list after we marked the nce CONDEMNED and before
379 		 * we grabbed the ndp_g_lock.
380 		 */
381 		mutex_exit(&ndp_g_lock);
382 		return;
383 	}
384 	if (ndp_g_walker > 0) {
385 		/*
386 		 * Can't unlink. The walker will clean up
387 		 */
388 		ndp_g_walker_cleanup = B_TRUE;
389 		mutex_exit(&ndp_g_lock);
390 		return;
391 	}
392 
393 	/*
394 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
395 	 * the timer since it is marked CONDEMNED.
396 	 */
397 	ptpn = nce->nce_ptpn;
398 	nce1 = nce->nce_next;
399 	if (nce1 != NULL)
400 		nce1->nce_ptpn = ptpn;
401 	*ptpn = nce1;
402 	nce->nce_ptpn = NULL;
403 	nce->nce_next = NULL;
404 	mutex_exit(&ndp_g_lock);
405 
406 	nce_ire_delete(nce);
407 }
408 
409 void
410 ndp_inactive(nce_t *nce)
411 {
412 	mblk_t		**mpp;
413 	ill_t		*ill;
414 
415 	ASSERT(nce->nce_refcnt == 0);
416 	ASSERT(MUTEX_HELD(&nce->nce_lock));
417 	ASSERT(nce->nce_fastpath == NULL);
418 
419 	/* Free all nce allocated messages */
420 	mpp = &nce->nce_first_mp_to_free;
421 	do {
422 		while (*mpp != NULL) {
423 			mblk_t  *mp;
424 
425 			mp = *mpp;
426 			*mpp = mp->b_next;
427 			mp->b_next = NULL;
428 			mp->b_prev = NULL;
429 			freemsg(mp);
430 		}
431 	} while (mpp++ != &nce->nce_last_mp_to_free);
432 
433 #ifdef NCE_DEBUG
434 	nce_trace_inactive(nce);
435 #endif
436 
437 	ill = nce->nce_ill;
438 	mutex_enter(&ill->ill_lock);
439 	ill->ill_nce_cnt--;
440 	/*
441 	 * If the number of nce's associated with this ill have dropped
442 	 * to zero, check whether we need to restart any operation that
443 	 * is waiting for this to happen.
444 	 */
445 	if (ill->ill_nce_cnt == 0) {
446 		/* ipif_ill_refrele_tail drops the ill_lock */
447 		ipif_ill_refrele_tail(ill);
448 	} else {
449 		mutex_exit(&ill->ill_lock);
450 	}
451 	mutex_destroy(&nce->nce_lock);
452 	freeb(nce->nce_mp);
453 }
454 
455 /*
456  * ndp_walk routine.  Delete the nce if it is associated with the ill
457  * that is going away.  Always called as a writer.
458  */
459 void
460 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
461 {
462 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
463 		ndp_delete(nce);
464 	}
465 }
466 
467 /*
468  * Walk a list of to be inactive NCEs and blow away all the ires.
469  */
470 static void
471 nce_ire_delete_list(nce_t *nce)
472 {
473 	nce_t *nce_next;
474 
475 	ASSERT(nce != NULL);
476 	while (nce != NULL) {
477 		nce_next = nce->nce_next;
478 		nce->nce_next = NULL;
479 
480 		/*
481 		 * It is possible for the last ndp walker (this thread)
482 		 * to come here after ndp_delete has marked the nce CONDEMNED
483 		 * and before it has removed the nce from the fastpath list
484 		 * or called untimeout. So we need to do it here. It is safe
485 		 * for both ndp_delete and this thread to do it twice or
486 		 * even simultaneously since each of the threads has a
487 		 * reference on the nce.
488 		 */
489 		nce_fastpath_list_delete(nce);
490 		/*
491 		 * Cancel any running timer. Timeout can't be restarted
492 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
493 		 * Passing invalid timeout id is fine.
494 		 */
495 		if (nce->nce_timeout_id != 0) {
496 			(void) untimeout(nce->nce_timeout_id);
497 			nce->nce_timeout_id = 0;
498 		}
499 
500 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
501 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
502 		NCE_REFRELE_NOTR(nce);
503 		nce = nce_next;
504 	}
505 }
506 
507 /*
508  * Delete an ire when the nce goes away.
509  */
510 /* ARGSUSED */
511 static void
512 nce_ire_delete(nce_t *nce)
513 {
514 	ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
515 	    nce_ire_delete1, (char *)nce, nce->nce_ill);
516 	NCE_REFRELE_NOTR(nce);
517 }
518 
519 /*
520  * ire_walk routine used to delete every IRE that shares this nce
521  */
522 static void
523 nce_ire_delete1(ire_t *ire, char *nce_arg)
524 {
525 	nce_t	*nce = (nce_t *)nce_arg;
526 
527 	ASSERT(ire->ire_type == IRE_CACHE);
528 
529 	if (ire->ire_nce == nce)
530 		ire_delete(ire);
531 }
532 
533 /*
534  * Cache entry lookup.  Try to find an nce matching the parameters passed.
535  * If one is found, the refcnt on the nce will be incremented.
536  */
537 nce_t *
538 ndp_lookup(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
539 {
540 	nce_t	*nce;
541 
542 	if (!caller_holds_lock)
543 		mutex_enter(&ndp_g_lock);
544 	nce = nce_lookup_addr(ill, addr);
545 	if (nce == NULL)
546 		nce = nce_lookup_mapping(ill, addr);
547 	if (!caller_holds_lock)
548 		mutex_exit(&ndp_g_lock);
549 	return (nce);
550 }
551 
552 /*
553  * Cache entry lookup.  Try to find an nce matching the parameters passed.
554  * Look only for exact entries (no mappings).  If an nce is found, increment
555  * the hold count on that nce.
556  */
557 static nce_t *
558 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
559 {
560 	nce_t	*nce;
561 
562 	ASSERT(ill != NULL);
563 	ASSERT(MUTEX_HELD(&ndp_g_lock));
564 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
565 		return (NULL);
566 	nce = *((nce_t **)NCE_HASH_PTR(*addr));
567 	for (; nce != NULL; nce = nce->nce_next) {
568 		if (nce->nce_ill == ill) {
569 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
570 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
571 			    &ipv6_all_ones)) {
572 				mutex_enter(&nce->nce_lock);
573 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
574 					NCE_REFHOLD_LOCKED(nce);
575 					mutex_exit(&nce->nce_lock);
576 					break;
577 				}
578 				mutex_exit(&nce->nce_lock);
579 			}
580 		}
581 	}
582 	return (nce);
583 }
584 
585 /*
586  * Cache entry lookup.  Try to find an nce matching the parameters passed.
587  * Look only for mappings.
588  */
589 static nce_t *
590 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
591 {
592 	nce_t	*nce;
593 
594 	ASSERT(ill != NULL);
595 	ASSERT(MUTEX_HELD(&ndp_g_lock));
596 	if (!IN6_IS_ADDR_MULTICAST(addr))
597 		return (NULL);
598 	nce = nce_mask_entries;
599 	for (; nce != NULL; nce = nce->nce_next)
600 		if (nce->nce_ill == ill &&
601 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
602 			mutex_enter(&nce->nce_lock);
603 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
604 				NCE_REFHOLD_LOCKED(nce);
605 				mutex_exit(&nce->nce_lock);
606 				break;
607 			}
608 			mutex_exit(&nce->nce_lock);
609 		}
610 	return (nce);
611 }
612 
613 /*
614  * Process passed in parameters either from an incoming packet or via
615  * user ioctl.
616  */
617 void
618 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
619 {
620 	ill_t	*ill = nce->nce_ill;
621 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
622 	mblk_t	*mp;
623 	boolean_t ll_updated = B_FALSE;
624 	boolean_t ll_changed;
625 
626 	/*
627 	 * No updates of link layer address or the neighbor state is
628 	 * allowed, when the cache is in NONUD state.  This still
629 	 * allows for responding to reachability solicitation.
630 	 */
631 	mutex_enter(&nce->nce_lock);
632 	if (nce->nce_state == ND_INCOMPLETE) {
633 		if (hw_addr == NULL) {
634 			mutex_exit(&nce->nce_lock);
635 			return;
636 		}
637 		nce_set_ll(nce, hw_addr);
638 		/*
639 		 * Update nce state and send the queued packets
640 		 * back to ip this time ire will be added.
641 		 */
642 		if (flag & ND_NA_FLAG_SOLICITED) {
643 			nce_update(nce, ND_REACHABLE, NULL);
644 		} else {
645 			nce_update(nce, ND_STALE, NULL);
646 		}
647 		mutex_exit(&nce->nce_lock);
648 		nce_fastpath(nce);
649 		mutex_enter(&nce->nce_lock);
650 		mp = nce->nce_qd_mp;
651 		nce->nce_qd_mp = NULL;
652 		mutex_exit(&nce->nce_lock);
653 		while (mp != NULL) {
654 			mblk_t *nxt_mp;
655 
656 			nxt_mp = mp->b_next;
657 			mp->b_next = NULL;
658 			if (mp->b_prev != NULL) {
659 				ill_t   *inbound_ill;
660 				queue_t *fwdq = NULL;
661 				uint_t ifindex;
662 
663 				ifindex = (uint_t)(uintptr_t)mp->b_prev;
664 				inbound_ill = ill_lookup_on_ifindex(ifindex,
665 				    B_TRUE, NULL, NULL, NULL, NULL);
666 				if (inbound_ill == NULL) {
667 					mp->b_prev = NULL;
668 					freemsg(mp);
669 					return;
670 				} else {
671 					fwdq = inbound_ill->ill_rq;
672 				}
673 				mp->b_prev = NULL;
674 				/*
675 				 * Send a forwarded packet back into ip_rput_v6
676 				 * just as in ire_send_v6().
677 				 * Extract the queue from b_prev (set in
678 				 * ip_rput_data_v6).
679 				 */
680 				if (fwdq != NULL) {
681 					/*
682 					 * Forwarded packets hop count will
683 					 * get decremented in ip_rput_data_v6
684 					 */
685 					put(fwdq, mp);
686 				} else {
687 					/*
688 					 * Send locally originated packets back
689 					 * into * ip_wput_v6.
690 					 */
691 					put(ill->ill_wq, mp);
692 				}
693 				ill_refrele(inbound_ill);
694 			} else {
695 				put(ill->ill_wq, mp);
696 			}
697 			mp = nxt_mp;
698 		}
699 		return;
700 	}
701 	ll_changed = nce_cmp_ll_addr(nce, (char *)hw_addr, hw_addr_len);
702 	if (!is_adv) {
703 		/* If this is a SOLICITATION request only */
704 		if (ll_changed)
705 			nce_update(nce, ND_STALE, hw_addr);
706 		mutex_exit(&nce->nce_lock);
707 		return;
708 	}
709 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
710 		/* If in any other state than REACHABLE, ignore */
711 		if (nce->nce_state == ND_REACHABLE) {
712 			nce_update(nce, ND_STALE, NULL);
713 		}
714 		mutex_exit(&nce->nce_lock);
715 		return;
716 	} else {
717 		if (ll_changed) {
718 			nce_update(nce, ND_UNCHANGED, hw_addr);
719 			ll_updated = B_TRUE;
720 		}
721 		if (flag & ND_NA_FLAG_SOLICITED) {
722 			nce_update(nce, ND_REACHABLE, NULL);
723 		} else {
724 			if (ll_updated) {
725 				nce_update(nce, ND_STALE, NULL);
726 			}
727 		}
728 		mutex_exit(&nce->nce_lock);
729 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
730 		    NCE_F_ISROUTER)) {
731 			ire_t *ire;
732 
733 			/*
734 			 * Router turned to host.  We need to remove the
735 			 * entry as well as any default route that may be
736 			 * using this as a next hop.  This is required by
737 			 * section 7.2.5 of RFC 2461.
738 			 */
739 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
740 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
741 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
742 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
743 			    MATCH_IRE_DEFAULT);
744 			if (ire != NULL) {
745 				ip_rts_rtmsg(RTM_DELETE, ire, 0);
746 				ire_delete(ire);
747 				ire_refrele(ire);
748 			}
749 			ndp_delete(nce);
750 		}
751 	}
752 }
753 
754 /*
755  * Pass arg1 to the pfi supplied, along with each nce in existence.
756  * ndp_walk() places a REFHOLD on the nce and drops the lock when
757  * walking the hash list.
758  */
759 void
760 ndp_walk_impl(ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace)
761 {
762 
763 	nce_t	*nce;
764 	nce_t	*nce1;
765 	nce_t	**ncep;
766 	nce_t	*free_nce_list = NULL;
767 
768 	mutex_enter(&ndp_g_lock);
769 	ndp_g_walker++;	/* Prevent ndp_delete from unlink and free of NCE */
770 	mutex_exit(&ndp_g_lock);
771 	for (ncep = nce_hash_tbl; ncep < A_END(nce_hash_tbl); ncep++) {
772 		for (nce = *ncep; nce; nce = nce1) {
773 			nce1 = nce->nce_next;
774 			if (ill == NULL || nce->nce_ill == ill) {
775 				if (trace) {
776 					NCE_REFHOLD(nce);
777 					(*pfi)(nce, arg1);
778 					NCE_REFRELE(nce);
779 				} else {
780 					NCE_REFHOLD_NOTR(nce);
781 					(*pfi)(nce, arg1);
782 					NCE_REFRELE_NOTR(nce);
783 				}
784 			}
785 		}
786 	}
787 	for (nce = nce_mask_entries; nce; nce = nce1) {
788 		nce1 = nce->nce_next;
789 		if (ill == NULL || nce->nce_ill == ill) {
790 			if (trace) {
791 				NCE_REFHOLD(nce);
792 				(*pfi)(nce, arg1);
793 				NCE_REFRELE(nce);
794 			} else {
795 				NCE_REFHOLD_NOTR(nce);
796 				(*pfi)(nce, arg1);
797 				NCE_REFRELE_NOTR(nce);
798 			}
799 		}
800 	}
801 	mutex_enter(&ndp_g_lock);
802 	ndp_g_walker--;
803 	/*
804 	 * While NCE's are removed from global list they are placed
805 	 * in a private list, to be passed to nce_ire_delete_list().
806 	 * The reason is, there may be ires pointing to this nce
807 	 * which needs to cleaned up.
808 	 */
809 	if (ndp_g_walker_cleanup && ndp_g_walker == 0) {
810 		/* Time to delete condemned entries */
811 		for (ncep = nce_hash_tbl; ncep < A_END(nce_hash_tbl); ncep++) {
812 			nce = *ncep;
813 			if (nce != NULL) {
814 				nce_remove(nce, &free_nce_list);
815 			}
816 		}
817 		nce = nce_mask_entries;
818 		if (nce != NULL) {
819 			nce_remove(nce, &free_nce_list);
820 		}
821 		ndp_g_walker_cleanup = B_FALSE;
822 	}
823 	mutex_exit(&ndp_g_lock);
824 
825 	if (free_nce_list != NULL) {
826 		nce_ire_delete_list(free_nce_list);
827 	}
828 }
829 
/*
 * Call pfi with arg1 on each nce (only those on ill when ill is not
 * NULL).  This is ndp_walk_impl() with trace set to B_TRUE, i.e. the
 * walk uses NCE_REFHOLD/NCE_REFRELE rather than the _NOTR variants.
 */
void
ndp_walk(ill_t *ill, pfi_t pfi, void *arg1)
{
	ndp_walk_impl(ill, pfi, arg1, B_TRUE);
}
835 
836 /*
837  * Prepend the zoneid using an ipsec_out_t for later use by functions like
838  * ip_rput_v6() after neighbor discovery has taken place.  If the message
839  * block already has a M_CTL at the front of it, then simply set the zoneid
840  * appropriately.
841  */
842 static mblk_t *
843 ndp_prepend_zone(mblk_t *mp, zoneid_t zoneid)
844 {
845 	mblk_t		*first_mp;
846 	ipsec_out_t	*io;
847 
848 	ASSERT(zoneid != ALL_ZONES);
849 	if (mp->b_datap->db_type == M_CTL) {
850 		io = (ipsec_out_t *)mp->b_rptr;
851 		ASSERT(io->ipsec_out_type == IPSEC_OUT);
852 		io->ipsec_out_zoneid = zoneid;
853 		return (mp);
854 	}
855 
856 	first_mp = ipsec_alloc_ipsec_out();
857 	if (first_mp == NULL)
858 		return (NULL);
859 	io = (ipsec_out_t *)first_mp->b_rptr;
860 	/* This is not a secure packet */
861 	io->ipsec_out_secure = B_FALSE;
862 	io->ipsec_out_zoneid = zoneid;
863 	first_mp->b_cont = mp;
864 	return (first_mp);
865 }
866 
/*
 * Process resolve requests.  Handles both mapped entries
 * as well as cases that need to be sent out on the wire.
 * Lookup a NCE for a given IRE.  Regardless of whether one exists
 * or one is created, we defer making ire point to nce until the
 * ire is actually added at which point the nce_refcnt on the nce is
 * incremented.  This is done primarily to have symmetry between ire_add()
 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
 *
 * Return values:
 *	0		an nce exists and is past ND_INCOMPLETE (or the ill
 *			is XRESOLV and a new entry was just created)
 *	EINPROGRESS	resolution is under way; mp has been handed to the
 *			nce code (nce_solicit() or nce_queue_mp())
 *	ENOMEM		the zoneid could not be prepended to mp
 *	EBUSY		nce_solicit() returned 0 for a new entry
 *	other		whatever nce_set_multicast() (multicast dst) or
 *			ndp_lookup_then_add() returned
 */
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
	nce_t		*nce;
	int		err = 0;
	uint32_t	ms;
	mblk_t		*mp_nce = NULL;

	ASSERT(ill != NULL);
	if (IN6_IS_ADDR_MULTICAST(dst)) {
		/* Multicast is resolved from the mapping entry */
		err = nce_set_multicast(ill, dst);
		return (err);
	}
	err = ndp_lookup_then_add(ill,
	    NULL,	/* No hardware address */
	    dst,
	    &ipv6_all_ones,
	    &ipv6_all_zeros,
	    0,
	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
	    ND_INCOMPLETE,
	    &nce);

	switch (err) {
	case 0:
		/*
		 * New cache entry was created.  Check whether its state
		 * is still ND_INCOMPLETE.  It can be in some other state
		 * even before we send out the solicitation as we could
		 * get un-solicited advertisements.
		 *
		 * If this is an XRESOLV interface, simply return 0,
		 * since we don't want to solicit just yet.
		 */
		if (ill->ill_flags & ILLF_XRESOLV) {
			NCE_REFRELE(nce);
			return (0);
		}
		/* ill_g_lock is taken before nce_lock and held over solicit */
		rw_enter(&ill_g_lock, RW_READER);
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state != ND_INCOMPLETE) {
			/* Already resolved; nothing to solicit */
			mutex_exit(&nce->nce_lock);
			rw_exit(&ill_g_lock);
			NCE_REFRELE(nce);
			return (0);
		}
		mp_nce = ndp_prepend_zone(mp, zoneid);
		if (mp_nce == NULL) {
			/* The caller will free mp */
			mutex_exit(&nce->nce_lock);
			rw_exit(&ill_g_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (ENOMEM);
		}
		/* A non-zero return is used as the timer interval below */
		ms = nce_solicit(nce, mp_nce);
		rw_exit(&ill_g_lock);
		if (ms == 0) {
			/* The caller will free mp */
			/* Free only the M_CTL this function prepended */
			if (mp_nce != mp)
				freeb(mp_nce);
			mutex_exit(&nce->nce_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (EBUSY);
		}
		mutex_exit(&nce->nce_lock);
		NDP_RESTART_TIMER(nce, (clock_t)ms);
		NCE_REFRELE(nce);
		return (EINPROGRESS);
	case EEXIST:
		/* Resolution in progress just queue the packet */
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state == ND_INCOMPLETE) {
			mp_nce = ndp_prepend_zone(mp, zoneid);
			if (mp_nce == NULL) {
				err = ENOMEM;
			} else {
				nce_queue_mp(nce, mp_nce);
				err = EINPROGRESS;
			}
		} else {
			/*
			 * Any other state implies we have
			 * a nce but IRE needs to be added ...
			 * ire_add_v6() will take care of
			 * the case when the nce becomes CONDEMNED
			 * before the ire is added to the table.
			 */
			err = 0;
		}
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	default:
		/* No nce was returned, so there is nothing to release */
		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
		break;
	}
	return (err);
}
976 
977 /*
978  * When there is no resolver, the link layer template is passed in
979  * the IRE.
980  * Lookup a NCE for a given IRE.  Regardless of whether one exists
981  * or one is created, we defer making ire point to nce until the
982  * ire is actually added at which point the nce_refcnt on the nce is
983  * incremented.  This is done primarily to have symmetry between ire_add()
984  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
985  */
986 int
987 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
988 {
989 	nce_t		*nce;
990 	int		err = 0;
991 
992 	ASSERT(ill != NULL);
993 	if (IN6_IS_ADDR_MULTICAST(dst)) {
994 		err = nce_set_multicast(ill, dst);
995 		return (err);
996 	}
997 
998 	err = ndp_lookup_then_add(ill,
999 	    NULL,	/* hardware address */
1000 	    dst,
1001 	    &ipv6_all_ones,
1002 	    &ipv6_all_zeros,
1003 	    0,
1004 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1005 	    ND_REACHABLE,
1006 	    &nce);
1007 
1008 	switch (err) {
1009 	case 0:
1010 		/*
1011 		 * Cache entry with a proper resolver cookie was
1012 		 * created.
1013 		 */
1014 		NCE_REFRELE(nce);
1015 		break;
1016 	case EEXIST:
1017 		err = 0;
1018 		NCE_REFRELE(nce);
1019 		break;
1020 	default:
1021 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1022 		break;
1023 	}
1024 	return (err);
1025 }
1026 
1027 /*
1028  * For each interface an entry is added for the unspecified multicast group.
1029  * Here that mapping is used to form the multicast cache entry for a particular
1030  * multicast destination.
1031  */
1032 static int
1033 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1034 {
1035 	nce_t		*mnce;	/* Multicast mapping entry */
1036 	nce_t		*nce;
1037 	uchar_t		*hw_addr = NULL;
1038 	int		err = 0;
1039 
1040 	ASSERT(ill != NULL);
1041 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1042 
1043 	mutex_enter(&ndp_g_lock);
1044 	nce = nce_lookup_addr(ill, dst);
1045 	if (nce != NULL) {
1046 		mutex_exit(&ndp_g_lock);
1047 		NCE_REFRELE(nce);
1048 		return (0);
1049 	}
1050 	/* No entry, now lookup for a mapping this should never fail */
1051 	mnce = nce_lookup_mapping(ill, dst);
1052 	if (mnce == NULL) {
1053 		/* Something broken for the interface. */
1054 		mutex_exit(&ndp_g_lock);
1055 		return (ESRCH);
1056 	}
1057 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1058 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1059 		/*
1060 		 * For IRE_IF_RESOLVER a hardware mapping can be
1061 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1062 		 * in the ill is copied in ndp_add().
1063 		 */
1064 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1065 		if (hw_addr == NULL) {
1066 			mutex_exit(&ndp_g_lock);
1067 			NCE_REFRELE(mnce);
1068 			return (ENOMEM);
1069 		}
1070 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1071 	}
1072 	NCE_REFRELE(mnce);
1073 	/*
1074 	 * IRE_IF_NORESOLVER type simply copies the resolution
1075 	 * cookie passed in.  So no hw_addr is needed.
1076 	 */
1077 	err = ndp_add(ill,
1078 	    hw_addr,
1079 	    dst,
1080 	    &ipv6_all_ones,
1081 	    &ipv6_all_zeros,
1082 	    0,
1083 	    NCE_F_NONUD,
1084 	    ND_REACHABLE,
1085 	    &nce);
1086 	mutex_exit(&ndp_g_lock);
1087 	if (hw_addr != NULL)
1088 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1089 	if (err != 0) {
1090 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1091 		return (err);
1092 	}
1093 	NCE_REFRELE(nce);
1094 	return (0);
1095 }
1096 
1097 /*
1098  * Return the link layer address, and any flags of a nce.
1099  */
1100 int
1101 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1102 {
1103 	nce_t		*nce;
1104 	in6_addr_t	*addr;
1105 	sin6_t		*sin6;
1106 	dl_unitdata_req_t	*dl;
1107 
1108 	ASSERT(ill != NULL);
1109 	sin6 = (sin6_t *)&lnr->lnr_addr;
1110 	addr =  &sin6->sin6_addr;
1111 
1112 	nce = ndp_lookup(ill, addr, B_FALSE);
1113 	if (nce == NULL)
1114 		return (ESRCH);
1115 	/* If in INCOMPLETE state, no link layer address is available yet */
1116 	if (nce->nce_state == ND_INCOMPLETE)
1117 		goto done;
1118 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1119 	if (ill->ill_flags & ILLF_XRESOLV)
1120 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1121 	else
1122 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1123 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1124 	    sizeof (lnr->lnr_hdw_addr));
1125 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1126 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1127 	if (nce->nce_flags & NCE_F_ISROUTER)
1128 		lnr->lnr_flags = NDF_ISROUTER_ON;
1129 	if (nce->nce_flags & NCE_F_PROXY)
1130 		lnr->lnr_flags |= NDF_PROXY_ON;
1131 	if (nce->nce_flags & NCE_F_ANYCAST)
1132 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1133 done:
1134 	NCE_REFRELE(nce);
1135 	return (0);
1136 }
1137 
1138 /*
1139  * Send Enable/Disable multicast reqs to driver.
1140  */
1141 int
1142 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1143     uint32_t hw_addr_offset, mblk_t *mp)
1144 {
1145 	nce_t		*nce;
1146 	uchar_t		*hw_addr;
1147 
1148 	ASSERT(ill != NULL);
1149 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1150 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1151 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1152 		freemsg(mp);
1153 		return (EINVAL);
1154 	}
1155 	mutex_enter(&ndp_g_lock);
1156 	nce = nce_lookup_mapping(ill, addr);
1157 	if (nce == NULL) {
1158 		mutex_exit(&ndp_g_lock);
1159 		freemsg(mp);
1160 		return (ESRCH);
1161 	}
1162 	mutex_exit(&ndp_g_lock);
1163 	/*
1164 	 * Update dl_addr_length and dl_addr_offset for primitives that
1165 	 * have physical addresses as opposed to full saps
1166 	 */
1167 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1168 	case DL_ENABMULTI_REQ:
1169 		/* Track the state if this is the first enabmulti */
1170 		if (ill->ill_dlpi_multicast_state == IDMS_UNKNOWN)
1171 			ill->ill_dlpi_multicast_state = IDMS_INPROGRESS;
1172 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1173 		break;
1174 	case DL_DISABMULTI_REQ:
1175 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1176 		break;
1177 	default:
1178 		NCE_REFRELE(nce);
1179 		ip1dbg(("ndp_mcastreq: default\n"));
1180 		return (EINVAL);
1181 	}
1182 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1183 	NCE_REFRELE(nce);
1184 	putnext(ill->ill_wq, mp);
1185 	return (0);
1186 }
1187 
1188 /*
1189  * Send a neighbor solicitation.
1190  * Returns number of milliseconds after which we should either rexmit or abort.
1191  * Return of zero means we should abort.
1192  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1193  *
1194  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1195  * the packet.
1196  * NOTE: This routine does not consume mp.
1197  */
1198 uint32_t
1199 nce_solicit(nce_t *nce, mblk_t *mp)
1200 {
1201 	ill_t		*ill;
1202 	ill_t		*src_ill;
1203 	ip6_t		*ip6h;
1204 	in6_addr_t	src;
1205 	in6_addr_t	dst;
1206 	ipif_t		*ipif;
1207 	ip6i_t		*ip6i;
1208 	boolean_t	dropped = B_FALSE;
1209 
1210 	ASSERT(RW_READ_HELD(&ill_g_lock));
1211 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1212 	ill = nce->nce_ill;
1213 	ASSERT(ill != NULL);
1214 
1215 	if (nce->nce_rcnt == 0) {
1216 		return (0);
1217 	}
1218 
1219 	if (mp == NULL) {
1220 		ASSERT(nce->nce_qd_mp != NULL);
1221 		mp = nce->nce_qd_mp;
1222 	} else {
1223 		nce_queue_mp(nce, mp);
1224 	}
1225 
1226 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1227 	if (mp->b_datap->db_type == M_CTL)
1228 		mp = mp->b_cont;
1229 
1230 	ip6h = (ip6_t *)mp->b_rptr;
1231 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1232 		/*
1233 		 * This message should have been pulled up already in
1234 		 * ip_wput_v6. We can't do pullups here because the message
1235 		 * could be from the nce_qd_mp which could have b_next/b_prev
1236 		 * non-NULL.
1237 		 */
1238 		ip6i = (ip6i_t *)ip6h;
1239 		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
1240 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
1241 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1242 	}
1243 	src = ip6h->ip6_src;
1244 	/*
1245 	 * If the src of outgoing packet is one of the assigned interface
1246 	 * addresses use it, otherwise we will pick the source address below.
1247 	 */
1248 	src_ill = ill;
1249 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1250 		if (ill->ill_group != NULL)
1251 			src_ill = ill->ill_group->illgrp_ill;
1252 		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
1253 			for (ipif = src_ill->ill_ipif; ipif != NULL;
1254 			    ipif = ipif->ipif_next) {
1255 				if (IN6_ARE_ADDR_EQUAL(&src,
1256 				    &ipif->ipif_v6lcl_addr)) {
1257 					break;
1258 				}
1259 			}
1260 			if (ipif != NULL)
1261 				break;
1262 		}
1263 		if (src_ill == NULL) {
1264 			/* May be a forwarding packet */
1265 			src_ill = ill;
1266 			src = ipv6_all_zeros;
1267 		}
1268 	}
1269 	dst = nce->nce_addr;
1270 	/*
1271 	 * If source address is unspecified, nce_xmit will choose
1272 	 * one for us and initialize the hardware address also
1273 	 * appropriately.
1274 	 */
1275 	if (IN6_IS_ADDR_UNSPECIFIED(&src))
1276 		src_ill  = NULL;
1277 	nce->nce_rcnt--;
1278 	mutex_exit(&nce->nce_lock);
1279 	rw_exit(&ill_g_lock);
1280 	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
1281 	    &dst, 0);
1282 	rw_enter(&ill_g_lock, RW_READER);
1283 	mutex_enter(&nce->nce_lock);
1284 	if (dropped)
1285 		nce->nce_rcnt++;
1286 	return (ill->ill_reachable_retrans_time);
1287 }
1288 
1289 void
1290 ndp_input_solicit(ill_t *ill, mblk_t *mp)
1291 {
1292 	nd_neighbor_solicit_t *ns;
1293 	uint32_t	hlen = ill->ill_nd_lla_len;
1294 	uchar_t		*haddr = NULL;
1295 	icmp6_t		*icmp_nd;
1296 	ip6_t		*ip6h;
1297 	nce_t		*our_nce = NULL;
1298 	in6_addr_t	target;
1299 	in6_addr_t	src;
1300 	int		len;
1301 	int		flag = 0;
1302 	nd_opt_hdr_t	*opt = NULL;
1303 	boolean_t	bad_solicit = B_FALSE;
1304 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1305 
1306 	ip6h = (ip6_t *)mp->b_rptr;
1307 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1308 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1309 	src = ip6h->ip6_src;
1310 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1311 	target = ns->nd_ns_target;
1312 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1313 		if (ip_debug > 2) {
1314 			/* ip1dbg */
1315 			pr_addr_dbg("ndp_input_solicit: Target is"
1316 			    " multicast! %s\n", AF_INET6, &target);
1317 		}
1318 		bad_solicit = B_TRUE;
1319 		goto done;
1320 	}
1321 	if (len > sizeof (nd_neighbor_solicit_t)) {
1322 		/* Options present */
1323 		opt = (nd_opt_hdr_t *)&ns[1];
1324 		len -= sizeof (nd_neighbor_solicit_t);
1325 		if (!ndp_verify_optlen(opt, len)) {
1326 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1327 			bad_solicit = B_TRUE;
1328 			goto done;
1329 		}
1330 	}
1331 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1332 		/* Check to see if this is a valid DAD solicitation */
1333 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1334 			if (ip_debug > 2) {
1335 				/* ip1dbg */
1336 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1337 				    "Destination is not solicited node "
1338 				    "multicast %s\n", AF_INET6,
1339 				    &ip6h->ip6_dst);
1340 			}
1341 			bad_solicit = B_TRUE;
1342 			goto done;
1343 		}
1344 	}
1345 
1346 	our_nce = ndp_lookup(ill, &target, B_FALSE);
1347 	/*
1348 	 * If this is a valid Solicitation, a permanent
1349 	 * entry should exist in the cache
1350 	 */
1351 	if (our_nce == NULL ||
1352 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1353 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1354 		    "ifname=%s ", ill->ill_name));
1355 		if (ip_debug > 2) {
1356 			/* ip1dbg */
1357 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1358 		}
1359 		bad_solicit = B_TRUE;
1360 		goto done;
1361 	}
1362 
1363 	/* At this point we should have a verified NS per spec */
1364 	if (opt != NULL) {
1365 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1366 		if (opt != NULL) {
1367 			/*
1368 			 * No source link layer address option should
1369 			 * be present in a valid DAD request.
1370 			 */
1371 			if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1372 				ip1dbg(("ndp_input_solicit: source link-layer "
1373 				    "address option present with an "
1374 				    "unspecified source. \n"));
1375 				bad_solicit = B_TRUE;
1376 				goto done;
1377 			}
1378 			haddr = (uchar_t *)&opt[1];
1379 			if (hlen > opt->nd_opt_len * 8 ||
1380 			    hlen == 0) {
1381 				bad_solicit = B_TRUE;
1382 				goto done;
1383 			}
1384 		}
1385 	}
1386 	/*
1387 	 * haddr can be NULL if no options are present,
1388 	 * or no Source link layer address is present in,
1389 	 * recvd NDP options of solicitation message.
1390 	 */
1391 	if (haddr == NULL) {
1392 		nce_t   *nnce;
1393 		mutex_enter(&ndp_g_lock);
1394 		nnce = nce_lookup_addr(ill, &src);
1395 		mutex_exit(&ndp_g_lock);
1396 
1397 		if (nnce == NULL) {
1398 			in6_addr_t dst = ipv6_solicited_node_mcast;
1399 
1400 			/* Form solicited node multicast address */
1401 			dst.s6_addr32[3] |= src.s6_addr32[3];
1402 			(void) nce_xmit(ill,
1403 				ND_NEIGHBOR_SOLICIT,
1404 				ill,
1405 				B_TRUE,
1406 				&target,
1407 				&dst,
1408 				flag);
1409 			bad_solicit = B_TRUE;
1410 			goto done;
1411 		}
1412 	}
1413 	/* Set override flag, it will be reset later if need be. */
1414 	flag |= NDP_ORIDE;
1415 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1416 		flag |= NDP_UNICAST;
1417 	}
1418 
1419 	/*
1420 	 * Create/update the entry for the soliciting node.
1421 	 * or respond to outstanding queries, don't if
1422 	 * the source is unspecified address.
1423 	 */
1424 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1425 		int	err = 0;
1426 		nce_t	*nnce;
1427 
1428 		err = ndp_lookup_then_add(ill,
1429 		    haddr,
1430 		    &src,	/* Soliciting nodes address */
1431 		    &ipv6_all_ones,
1432 		    &ipv6_all_zeros,
1433 		    0,
1434 		    0,
1435 		    ND_STALE,
1436 		    &nnce);
1437 		switch (err) {
1438 		case 0:
1439 			/* done with this entry */
1440 			NCE_REFRELE(nnce);
1441 			break;
1442 		case EEXIST:
1443 			/*
1444 			 * B_FALSE indicates this is not an
1445 			 * an advertisement.
1446 			 */
1447 			ndp_process(nnce, haddr, 0, B_FALSE);
1448 			NCE_REFRELE(nnce);
1449 			break;
1450 		default:
1451 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1452 			    err));
1453 			goto done;
1454 		}
1455 		flag |= NDP_SOLICITED;
1456 	} else {
1457 		/*
1458 		 * This is a DAD req, multicast the advertisement
1459 		 * to the all-nodes address.
1460 		 */
1461 		src = ipv6_all_hosts_mcast;
1462 	}
1463 	if (our_nce->nce_flags & NCE_F_ISROUTER)
1464 		flag |= NDP_ISROUTER;
1465 	if (our_nce->nce_flags & NCE_F_PROXY)
1466 		flag &= ~NDP_ORIDE;
1467 	/* Response to a solicitation */
1468 	(void) nce_xmit(ill,
1469 	    ND_NEIGHBOR_ADVERT,
1470 	    ill,	/* ill to be used for extracting ill_nd_lla */
1471 	    B_TRUE,	/* use ill_nd_lla */
1472 	    &target,	/* Source and target of the advertisement pkt */
1473 	    &src,	/* IP Destination (source of original pkt) */
1474 	    flag);
1475 done:
1476 	if (bad_solicit)
1477 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1478 	if (our_nce != NULL)
1479 		NCE_REFRELE(our_nce);
1480 }
1481 
1482 void
1483 ndp_input_advert(ill_t *ill, mblk_t *mp)
1484 {
1485 	nd_neighbor_advert_t *na;
1486 	uint32_t	hlen = ill->ill_nd_lla_len;
1487 	uchar_t		*haddr = NULL;
1488 	icmp6_t		*icmp_nd;
1489 	ip6_t		*ip6h;
1490 	nce_t		*dst_nce = NULL;
1491 	in6_addr_t	target;
1492 	nd_opt_hdr_t	*opt = NULL;
1493 	int		len;
1494 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1495 
1496 	ip6h = (ip6_t *)mp->b_rptr;
1497 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1498 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1499 	na = (nd_neighbor_advert_t *)icmp_nd;
1500 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1501 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1502 		ip1dbg(("ndp_input_advert: Target is multicast but the "
1503 		    "solicited flag is not zero\n"));
1504 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1505 		return;
1506 	}
1507 	target = na->nd_na_target;
1508 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1509 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
1510 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1511 		return;
1512 	}
1513 	if (len > sizeof (nd_neighbor_advert_t)) {
1514 		opt = (nd_opt_hdr_t *)&na[1];
1515 		if (!ndp_verify_optlen(opt,
1516 		    len - sizeof (nd_neighbor_advert_t))) {
1517 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1518 			return;
1519 		}
1520 		/* At this point we have a verified NA per spec */
1521 		len -= sizeof (nd_neighbor_advert_t);
1522 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1523 		if (opt != NULL) {
1524 			haddr = (uchar_t *)&opt[1];
1525 			if (hlen > opt->nd_opt_len * 8 ||
1526 			    hlen == 0) {
1527 				BUMP_MIB(mib,
1528 				    ipv6IfIcmpInBadNeighborAdvertisements);
1529 				return;
1530 			}
1531 		}
1532 	}
1533 
1534 	/*
1535 	 * If this interface is part of the group look at all the
1536 	 * ills in the group.
1537 	 */
1538 	rw_enter(&ill_g_lock, RW_READER);
1539 	if (ill->ill_group != NULL)
1540 		ill = ill->ill_group->illgrp_ill;
1541 
1542 	for (; ill != NULL; ill = ill->ill_group_next) {
1543 		mutex_enter(&ill->ill_lock);
1544 		if (!ILL_CAN_LOOKUP(ill)) {
1545 			mutex_exit(&ill->ill_lock);
1546 			continue;
1547 		}
1548 		ill_refhold_locked(ill);
1549 		mutex_exit(&ill->ill_lock);
1550 		dst_nce = ndp_lookup(ill, &target, B_FALSE);
1551 		/* We have to drop the lock since ndp_process calls put* */
1552 		rw_exit(&ill_g_lock);
1553 		if (dst_nce != NULL) {
1554 			if (na->nd_na_flags_reserved &
1555 			    ND_NA_FLAG_ROUTER) {
1556 				dst_nce->nce_flags |= NCE_F_ISROUTER;
1557 			}
1558 			/* B_TRUE indicates this an advertisement */
1559 			ndp_process(dst_nce, haddr,
1560 				na->nd_na_flags_reserved, B_TRUE);
1561 			NCE_REFRELE(dst_nce);
1562 		}
1563 		rw_enter(&ill_g_lock, RW_READER);
1564 		ill_refrele(ill);
1565 	}
1566 	rw_exit(&ill_g_lock);
1567 }
1568 
1569 /*
1570  * Process NDP neighbor solicitation/advertisement messages.
1571  * The checksum has already checked o.k before reaching here.
1572  */
1573 void
1574 ndp_input(ill_t *ill, mblk_t *mp)
1575 {
1576 	icmp6_t		*icmp_nd;
1577 	ip6_t		*ip6h;
1578 	int		len;
1579 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1580 
1581 
1582 	if (!pullupmsg(mp, -1)) {
1583 		ip1dbg(("ndp_input: pullupmsg failed\n"));
1584 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
1585 		goto done;
1586 	}
1587 	ip6h = (ip6_t *)mp->b_rptr;
1588 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
1589 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
1590 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
1591 		goto done;
1592 	}
1593 	/*
1594 	 * NDP does not accept any extension headers between the
1595 	 * IP header and the ICMP header since e.g. a routing
1596 	 * header could be dangerous.
1597 	 * This assumes that any AH or ESP headers are removed
1598 	 * by ip prior to passing the packet to ndp_input.
1599 	 */
1600 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
1601 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
1602 		    ip6h->ip6_nxt));
1603 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1604 		goto done;
1605 	}
1606 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1607 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
1608 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
1609 	if (icmp_nd->icmp6_code != 0) {
1610 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
1611 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1612 		goto done;
1613 	}
1614 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1615 	/*
1616 	 * Make sure packet length is large enough for either
1617 	 * a NS or a NA icmp packet.
1618 	 */
1619 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
1620 		ip1dbg(("ndp_input: packet too short\n"));
1621 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1622 		goto done;
1623 	}
1624 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
1625 		ndp_input_solicit(ill, mp);
1626 	} else {
1627 		ndp_input_advert(ill, mp);
1628 	}
1629 done:
1630 	freemsg(mp);
1631 }
1632 
1633 /*
1634  * nce_xmit is called to form and transmit a ND solicitation or
1635  * advertisement ICMP packet.
1636  * If source address is unspecified, appropriate source address
1637  * and link layer address will be chosen here. This function
1638  * *always* sends the link layer option.
1639  * It returns B_FALSE only if it does a successful put() to the
1640  * corresponding ill's ill_wq otherwise returns B_TRUE.
1641  */
1642 static boolean_t
1643 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
1644     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
1645     int flag)
1646 {
1647 	uint32_t	len;
1648 	icmp6_t 	*icmp6;
1649 	mblk_t		*mp;
1650 	ip6_t		*ip6h;
1651 	nd_opt_hdr_t	*opt;
1652 	uint_t		plen;
1653 	ip6i_t		*ip6i;
1654 	ipif_t		*src_ipif = NULL;
1655 
1656 	/*
1657 	 * If we have a unspecified source(sender) address, select a
1658 	 * proper source address for the solicitation here itself so
1659 	 * that we can initialize the h/w address correctly. This is
1660 	 * needed for interface groups as source address can come from
1661 	 * the whole group and the h/w address initialized from ill will
1662 	 * be wrong if the source address comes from a different ill.
1663 	 *
1664 	 * Note that the NA never comes here with the unspecified source
1665 	 * address. The following asserts that whenever the source
1666 	 * address is specified, the haddr also should be specified.
1667 	 */
1668 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
1669 
1670 	if (IN6_IS_ADDR_UNSPECIFIED(sender)) {
1671 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
1672 		/*
1673 		 * Pick a source address for this solicitation, but
1674 		 * restrict the selection to addresses assigned to the
1675 		 * output interface (or interface group).  We do this
1676 		 * because the destination will create a neighbor cache
1677 		 * entry for the source address of this packet, so the
1678 		 * source address had better be a valid neighbor.
1679 		 */
1680 		src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
1681 		    IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
1682 		if (src_ipif == NULL) {
1683 			char buf[INET6_ADDRSTRLEN];
1684 
1685 			ip0dbg(("nce_xmit: No source ipif for dst %s\n",
1686 			    inet_ntop(AF_INET6, (char *)target, buf,
1687 			    sizeof (buf))));
1688 			return (B_TRUE);
1689 		}
1690 		sender = &src_ipif->ipif_v6src_addr;
1691 		hwaddr_ill = src_ipif->ipif_ill;
1692 	}
1693 
1694 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7)/8;
1695 	/*
1696 	 * Always make sure that the NS/NA packets don't get load
1697 	 * spread. This is needed so that the probe packets sent
1698 	 * by the in.mpathd daemon can really go out on the desired
1699 	 * interface. Probe packets are made to go out on a desired
1700 	 * interface by including a ip6i with ATTACH_IF flag. As these
1701 	 * packets indirectly end up sending/receiving NS/NA packets
1702 	 * (neighbor doing NUD), we have to make sure that NA
1703 	 * also go out on the same interface.
1704 	 */
1705 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
1706 	    plen * 8;
1707 	mp = allocb(len,  BPRI_LO);
1708 	if (mp == NULL) {
1709 		if (src_ipif != NULL)
1710 			ipif_refrele(src_ipif);
1711 		return (B_TRUE);
1712 	}
1713 	bzero((char *)mp->b_rptr, len);
1714 	mp->b_wptr = mp->b_rptr + len;
1715 
1716 	ip6i = (ip6i_t *)mp->b_rptr;
1717 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
1718 	ip6i->ip6i_nxt = IPPROTO_RAW;
1719 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
1720 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
1721 
1722 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1723 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
1724 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
1725 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
1726 	ip6h->ip6_hops = IPV6_MAX_HOPS;
1727 	ip6h->ip6_dst = *target;
1728 	icmp6 = (icmp6_t *)&ip6h[1];
1729 
1730 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
1731 	    sizeof (nd_neighbor_advert_t));
1732 
1733 	if (operation == ND_NEIGHBOR_SOLICIT) {
1734 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1735 
1736 		opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
1737 		ip6h->ip6_src = *sender;
1738 		ns->nd_ns_target = *target;
1739 		if (!(flag & NDP_UNICAST)) {
1740 			/* Form multicast address of the target */
1741 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
1742 			ip6h->ip6_dst.s6_addr32[3] |=
1743 			    ns->nd_ns_target.s6_addr32[3];
1744 		}
1745 	} else {
1746 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
1747 
1748 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
1749 		ip6h->ip6_src = *sender;
1750 		na->nd_na_target = *sender;
1751 		if (flag & NDP_ISROUTER)
1752 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
1753 		if (flag & NDP_SOLICITED)
1754 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
1755 		if (flag & NDP_ORIDE)
1756 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
1757 
1758 	}
1759 	/* Fill in link layer address and option len */
1760 	opt->nd_opt_len = (uint8_t)plen;
1761 	mutex_enter(&hwaddr_ill->ill_lock);
1762 	bcopy(use_nd_lla ? hwaddr_ill->ill_nd_lla : hwaddr_ill->ill_phys_addr,
1763 	    &opt[1], hwaddr_ill->ill_nd_lla_len);
1764 	mutex_exit(&hwaddr_ill->ill_lock);
1765 	icmp6->icmp6_type = (uint8_t)operation;
1766 	icmp6->icmp6_code = 0;
1767 	/*
1768 	 * Prepare for checksum by putting icmp length in the icmp
1769 	 * checksum field. The checksum is calculated in ip_wput_v6.
1770 	 */
1771 	icmp6->icmp6_cksum = ip6h->ip6_plen;
1772 
1773 	if (src_ipif != NULL)
1774 		ipif_refrele(src_ipif);
1775 	if (canput(ill->ill_wq)) {
1776 		put(ill->ill_wq, mp);
1777 		return (B_FALSE);
1778 	}
1779 	freemsg(mp);
1780 	return (B_TRUE);
1781 }
1782 
1783 /*
1784  * Make a link layer address (does not include the SAP) from an nce.
1785  * To form the link layer address, use the last four bytes of ipv6
1786  * address passed in and the fixed offset stored in nce.
1787  */
1788 static void
1789 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
1790 {
1791 	uchar_t *mask, *to;
1792 	ill_t	*ill = nce->nce_ill;
1793 	int 	len;
1794 
1795 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
1796 		return;
1797 	ASSERT(nce->nce_res_mp != NULL);
1798 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1799 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
1800 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
1801 	ASSERT(addr != NULL);
1802 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1803 	    addrpos, ill->ill_nd_lla_len);
1804 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
1805 	    IPV6_ADDR_LEN);
1806 	mask = (uchar_t *)&nce->nce_extract_mask;
1807 	mask += (IPV6_ADDR_LEN - len);
1808 	addr += (IPV6_ADDR_LEN - len);
1809 	to = addrpos + nce->nce_ll_extract_start;
1810 	while (len-- > 0)
1811 		*to++ |= *mask++ & *addr++;
1812 }
1813 
1814 /*
1815  * Pass a cache report back out via NDD.
1816  */
1817 /* ARGSUSED */
1818 int
1819 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
1820 {
1821 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
1822 			"     proto addr/mask");
1823 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp);
1824 	return (0);
1825 }
1826 
1827 /*
1828  * convert a link level address of arbitrary length
1829  * to an ascii string.
1830  * The caller *must* have already verified that the string buffer
1831  * is large enough to hold the entire string, including trailing NULL.
1832  */
1833 static void
1834 lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf)
1835 {
1836 	uchar_t	addrbyte[8];	/* needs to hold ascii for a byte plus a NULL */
1837 	int	i;
1838 	size_t	len;
1839 
1840 	buf[0] = '\0';
1841 	for (i = 0; i < addrlen; i++) {
1842 		addrbyte[0] = '\0';
1843 		(void) sprintf((char *)addrbyte, "%02x:", (lla[i] & 0xff));
1844 		len = strlen((const char *)addrbyte);
1845 		bcopy(addrbyte, buf, len);
1846 		buf = buf + len;
1847 	}
1848 	*--buf = '\0';
1849 }
1850 
1851 /*
1852  * Add a single line to the NDP Cache Entry Report.
1853  */
1854 static void
1855 nce_report1(nce_t *nce, uchar_t *mp_arg)
1856 {
1857 	ill_t		*ill = nce->nce_ill;
1858 	char		local_buf[INET6_ADDRSTRLEN];
1859 	uchar_t		flags_buf[10];
1860 	uint32_t	flags = nce->nce_flags;
1861 	mblk_t		*mp = (mblk_t *)mp_arg;
1862 	uchar_t		*h;
1863 	uchar_t		*m = flags_buf;
1864 	in6_addr_t	v6addr;
1865 
1866 	/*
1867 	 * Lock the nce to protect nce_res_mp from being changed
1868 	 * if an external resolver address resolution completes
1869 	 * while nce_res_mp is being accessed here.
1870 	 *
1871 	 * Deal with all address formats, not just Ethernet-specific
1872 	 * In addition, make sure that the mblk has enough space
1873 	 * before writing to it. If is doesn't, allocate a new one.
1874 	 */
1875 	ASSERT(ill != NULL);
1876 	v6addr = nce->nce_mask;
1877 	if (flags & NCE_F_PERMANENT)
1878 		*m++ = 'P';
1879 	if (flags & NCE_F_ISROUTER)
1880 		*m++ = 'R';
1881 	if (flags & NCE_F_MAPPING)
1882 		*m++ = 'M';
1883 	*m = '\0';
1884 
1885 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1886 		size_t		addrlen;
1887 		uchar_t		*addr_buf;
1888 		dl_unitdata_req_t	*dl;
1889 
1890 		mutex_enter(&nce->nce_lock);
1891 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
1892 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1893 		if (ill->ill_flags & ILLF_XRESOLV)
1894 			addrlen = (3 * (dl->dl_dest_addr_length));
1895 		else
1896 			addrlen = (3 * (ill->ill_nd_lla_len));
1897 		if (addrlen <= 0) {
1898 			mutex_exit(&nce->nce_lock);
1899 			(void) mi_mpprintf(mp,
1900 			    "%8s %9s %5s %s/%d",
1901 			    ill->ill_name,
1902 			    "None",
1903 			    (uchar_t *)&flags_buf,
1904 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1905 				(char *)local_buf, sizeof (local_buf)),
1906 				ip_mask_to_plen_v6(&v6addr));
1907 		} else {
1908 			/*
1909 			 * Convert the hardware/lla address to ascii
1910 			 */
1911 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
1912 			if (addr_buf == NULL) {
1913 				mutex_exit(&nce->nce_lock);
1914 				return;
1915 			}
1916 			if (ill->ill_flags & ILLF_XRESOLV)
1917 				lla2ascii((uint8_t *)h, dl->dl_dest_addr_length,
1918 				    addr_buf);
1919 			else
1920 				lla2ascii((uint8_t *)h, ill->ill_nd_lla_len,
1921 				    addr_buf);
1922 			mutex_exit(&nce->nce_lock);
1923 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
1924 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
1925 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1926 				(char *)local_buf, sizeof (local_buf)),
1927 				ip_mask_to_plen_v6(&v6addr));
1928 			kmem_free(addr_buf, addrlen);
1929 		}
1930 	} else {
1931 		(void) mi_mpprintf(mp,
1932 		    "%8s %9s %5s %s/%d",
1933 		    ill->ill_name,
1934 		    "None",
1935 		    (uchar_t *)&flags_buf,
1936 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1937 			(char *)local_buf, sizeof (local_buf)),
1938 			ip_mask_to_plen_v6(&v6addr));
1939 	}
1940 }
1941 
1942 mblk_t *
1943 nce_udreq_alloc(ill_t *ill)
1944 {
1945 	mblk_t	*template_mp = NULL;
1946 	dl_unitdata_req_t *dlur;
1947 	int	sap_length;
1948 
1949 	sap_length = ill->ill_sap_length;
1950 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
1951 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
1952 	if (template_mp == NULL)
1953 		return (NULL);
1954 
1955 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
1956 	dlur->dl_priority.dl_min = 0;
1957 	dlur->dl_priority.dl_max = 0;
1958 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
1959 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
1960 
1961 	/* Copy in the SAP value. */
1962 	NCE_LL_SAP_COPY(ill, template_mp);
1963 
1964 	return (template_mp);
1965 }
1966 
1967 /*
1968  * NDP retransmit timer.
1969  * This timer goes off when:
1970  * a. It is time to retransmit NS for resolver.
1971  * b. It is time to send reachability probes.
1972  */
1973 void
1974 ndp_timer(void *arg)
1975 {
1976 	nce_t		*nce = arg;
1977 	ill_t		*ill = nce->nce_ill;
1978 	uint32_t	ms;
1979 	char		addrbuf[INET6_ADDRSTRLEN];
1980 	mblk_t		*mp;
1981 	boolean_t	dropped = B_FALSE;
1982 
1983 	/*
1984 	 * The timer has to be cancelled by ndp_delete before doing the final
1985 	 * refrele. So the NCE is guaranteed to exist when the timer runs
1986 	 * until it clears the timeout_id. Before clearing the timeout_id
1987 	 * bump up the refcnt so that we can continue to use the nce
1988 	 */
1989 	ASSERT(nce != NULL);
1990 
1991 	/*
1992 	 * Grab the ill_g_lock now itself to avoid lock order problems.
1993 	 * nce_solicit needs ill_g_lock to be able to traverse ills
1994 	 */
1995 	rw_enter(&ill_g_lock, RW_READER);
1996 	mutex_enter(&nce->nce_lock);
1997 	NCE_REFHOLD_LOCKED(nce);
1998 	nce->nce_timeout_id = 0;
1999 
2000 	/*
2001 	 * Check the reachability state first.
2002 	 */
2003 	switch (nce->nce_state) {
2004 	case ND_DELAY:
2005 		rw_exit(&ill_g_lock);
2006 		nce->nce_state = ND_PROBE;
2007 		mutex_exit(&nce->nce_lock);
2008 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2009 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2010 		if (ip_debug > 3) {
2011 			/* ip2dbg */
2012 			pr_addr_dbg("ndp_timer: state for %s changed "
2013 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2014 		}
2015 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2016 		NCE_REFRELE(nce);
2017 		return;
2018 	case ND_PROBE:
2019 		/* must be retransmit timer */
2020 		rw_exit(&ill_g_lock);
2021 		nce->nce_pcnt--;
2022 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2023 		    nce->nce_pcnt >= -1);
2024 		if (nce->nce_pcnt == 0) {
2025 			/* Wait RetransTimer, before deleting the entry */
2026 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2027 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2028 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2029 			mutex_exit(&nce->nce_lock);
2030 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2031 		} else {
2032 			/*
2033 			 * As per RFC2461, the nce gets deleted after
2034 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2035 			 * Note that the first unicast solicitation is sent
2036 			 * during the DELAY state.
2037 			 */
2038 			if (nce->nce_pcnt > 0) {
2039 				ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2040 				    nce->nce_pcnt, inet_ntop(AF_INET6,
2041 				    &nce->nce_addr,
2042 				    addrbuf, sizeof (addrbuf))));
2043 				mutex_exit(&nce->nce_lock);
2044 				dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT,
2045 				    NULL, B_FALSE, &ipv6_all_zeros,
2046 				    &nce->nce_addr, NDP_UNICAST);
2047 				if (dropped) {
2048 					mutex_enter(&nce->nce_lock);
2049 					nce->nce_pcnt++;
2050 					mutex_exit(&nce->nce_lock);
2051 				}
2052 				NDP_RESTART_TIMER(nce,
2053 				    ill->ill_reachable_retrans_time);
2054 			} else {
2055 				/* No hope, delete the nce */
2056 				nce->nce_state = ND_UNREACHABLE;
2057 				mutex_exit(&nce->nce_lock);
2058 				if (ip_debug > 2) {
2059 					/* ip1dbg */
2060 					pr_addr_dbg("ndp_timer: Delete IRE for"
2061 					    " dst %s\n", AF_INET6,
2062 					    &nce->nce_addr);
2063 				}
2064 				ndp_delete(nce);
2065 			}
2066 		}
2067 		NCE_REFRELE(nce);
2068 		return;
2069 	case ND_INCOMPLETE:
2070 		/*
2071 		 * Must be resolvers retransmit timer.
2072 		 */
2073 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2074 			ip6i_t	*ip6i;
2075 			ip6_t	*ip6h;
2076 			mblk_t *data_mp;
2077 
2078 			/*
2079 			 * Walk the list of packets queued, and see if there
2080 			 * are any multipathing probe packets. Such packets
2081 			 * are always queued at the head. Since this is a
2082 			 * retransmit timer firing, mark such packets as
2083 			 * delayed in ND resolution. This info will be used
2084 			 * in ip_wput_v6(). Multipathing probe packets will
2085 			 * always have an ip6i_t. Once we hit a packet without
2086 			 * it, we can break out of this loop.
2087 			 */
2088 			if (mp->b_datap->db_type == M_CTL)
2089 				data_mp = mp->b_cont;
2090 			else
2091 				data_mp = mp;
2092 
2093 			ip6h = (ip6_t *)data_mp->b_rptr;
2094 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2095 				break;
2096 
2097 			/*
2098 			 * This message should have been pulled up already in
2099 			 * ip_wput_v6. We can't do pullups here because the
2100 			 * b_next/b_prev is non-NULL.
2101 			 */
2102 			ip6i = (ip6i_t *)ip6h;
2103 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2104 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2105 
2106 			/* Mark this packet as delayed due to ND resolution */
2107 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2108 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2109 		}
2110 		if (nce->nce_qd_mp != NULL) {
2111 			ms = nce_solicit(nce, NULL);
2112 			rw_exit(&ill_g_lock);
2113 			if (ms == 0) {
2114 				if (nce->nce_state != ND_REACHABLE) {
2115 					mutex_exit(&nce->nce_lock);
2116 					nce_resolv_failed(nce);
2117 					ndp_delete(nce);
2118 				} else {
2119 					mutex_exit(&nce->nce_lock);
2120 				}
2121 			} else {
2122 				mutex_exit(&nce->nce_lock);
2123 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2124 			}
2125 			NCE_REFRELE(nce);
2126 			return;
2127 		}
2128 		mutex_exit(&nce->nce_lock);
2129 		rw_exit(&ill_g_lock);
2130 		NCE_REFRELE(nce);
2131 		break;
2132 	case ND_REACHABLE :
2133 		rw_exit(&ill_g_lock);
2134 		if (nce->nce_flags & NCE_F_UNSOL_ADV &&
2135 		    nce->nce_unsolicit_count != 0) {
2136 			nce->nce_unsolicit_count--;
2137 			mutex_exit(&nce->nce_lock);
2138 			dropped = nce_xmit(ill,
2139 			    ND_NEIGHBOR_ADVERT,
2140 			    ill,	/* ill to be used for hw addr */
2141 			    B_FALSE,	/* use ill_phys_addr */
2142 			    &nce->nce_addr,
2143 			    &ipv6_all_hosts_mcast,
2144 			    nce->nce_flags | NDP_ORIDE);
2145 			if (dropped) {
2146 				mutex_enter(&nce->nce_lock);
2147 				nce->nce_unsolicit_count++;
2148 				mutex_exit(&nce->nce_lock);
2149 			}
2150 			if (nce->nce_unsolicit_count != 0) {
2151 				NDP_RESTART_TIMER(nce,
2152 				    ip_ndp_unsolicit_interval);
2153 			}
2154 		} else {
2155 			mutex_exit(&nce->nce_lock);
2156 		}
2157 		NCE_REFRELE(nce);
2158 		break;
2159 	default:
2160 		rw_exit(&ill_g_lock);
2161 		mutex_exit(&nce->nce_lock);
2162 		NCE_REFRELE(nce);
2163 		break;
2164 	}
2165 }
2166 
2167 /*
2168  * Set a link layer address from the ll_addr passed in.
2169  * Copy SAP from ill.
2170  */
2171 static void
2172 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2173 {
2174 	ill_t	*ill = nce->nce_ill;
2175 	uchar_t	*woffset;
2176 
2177 	ASSERT(ll_addr != NULL);
2178 	/* Always called before fast_path_probe */
2179 	ASSERT(nce->nce_fp_mp == NULL);
2180 	if (ill->ill_sap_length != 0) {
2181 		/*
2182 		 * Copy the SAP type specified in the
2183 		 * request into the xmit template.
2184 		 */
2185 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2186 	}
2187 	if (ill->ill_phys_addr_length > 0) {
2188 		/*
2189 		 * The bcopy() below used to be called for the physical address
2190 		 * length rather than the link layer address length. For
2191 		 * ethernet and many other media, the phys_addr and lla are
2192 		 * identical.
2193 		 * However, with xresolv interfaces being introduced, the
2194 		 * phys_addr and lla are no longer the same, and the physical
2195 		 * address may not have any useful meaning, so we use the lla
2196 		 * for IPv6 address resolution and destination addressing.
2197 		 *
2198 		 * For PPP or other interfaces with a zero length
2199 		 * physical address, don't do anything here.
2200 		 * The bcopy() with a zero phys_addr length was previously
2201 		 * a no-op for interfaces with a zero-length physical address.
2202 		 * Using the lla for them would change the way they operate.
2203 		 * Doing nothing in such cases preserves expected behavior.
2204 		 */
2205 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2206 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2207 	}
2208 }
2209 
2210 static boolean_t
2211 nce_cmp_ll_addr(nce_t *nce, char *ll_addr, uint32_t ll_addr_len)
2212 {
2213 	ill_t	*ill = nce->nce_ill;
2214 	uchar_t	*ll_offset;
2215 
2216 	ASSERT(nce->nce_res_mp != NULL);
2217 	if (ll_addr == NULL)
2218 		return (B_FALSE);
2219 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2220 	if (bcmp(ll_addr, (char *)ll_offset, ll_addr_len) != 0)
2221 		return (B_TRUE);
2222 	return (B_FALSE);
2223 }
2224 
2225 /*
2226  * Updates the link layer address or the reachability state of
2227  * a cache entry.  Reset probe counter if needed.
2228  */
2229 static void
2230 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2231 {
2232 	ill_t	*ill = nce->nce_ill;
2233 	boolean_t need_stop_timer = B_FALSE;
2234 	boolean_t need_fastpath_update = B_FALSE;
2235 
2236 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2237 	/*
2238 	 * If this interface does not do NUD, there is no point
2239 	 * in allowing an update to the cache entry.  Although
2240 	 * we will respond to NS.
2241 	 * The only time we accept an update for a resolver when
2242 	 * NUD is turned off is when it has just been created.
2243 	 * Non-Resolvers will always be created as REACHABLE.
2244 	 */
2245 	if (new_state != ND_UNCHANGED) {
2246 		if ((nce->nce_flags & NCE_F_NONUD) &&
2247 		    (nce->nce_state != ND_INCOMPLETE))
2248 			return;
2249 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2250 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2251 		need_stop_timer = B_TRUE;
2252 		if (new_state == ND_REACHABLE)
2253 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2254 		else {
2255 			/* We force NUD in this case */
2256 			nce->nce_last = 0;
2257 		}
2258 		nce->nce_state = new_state;
2259 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2260 	}
2261 	/*
2262 	 * In case of fast path we need to free the the fastpath
2263 	 * M_DATA and do another probe.  Otherwise we can just
2264 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2265 	 * whatever packets that happens to be transmitting at the time.
2266 	 */
2267 	if (new_ll_addr != NULL) {
2268 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2269 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2270 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2271 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2272 		if (nce->nce_fp_mp != NULL) {
2273 			freemsg(nce->nce_fp_mp);
2274 			nce->nce_fp_mp = NULL;
2275 		}
2276 		need_fastpath_update = B_TRUE;
2277 	}
2278 	mutex_exit(&nce->nce_lock);
2279 	if (need_stop_timer) {
2280 		(void) untimeout(nce->nce_timeout_id);
2281 		nce->nce_timeout_id = 0;
2282 	}
2283 	if (need_fastpath_update)
2284 		nce_fastpath(nce);
2285 	mutex_enter(&nce->nce_lock);
2286 }
2287 
2288 static void
2289 nce_queue_mp(nce_t *nce, mblk_t *mp)
2290 {
2291 	uint_t	count = 0;
2292 	mblk_t  **mpp;
2293 	boolean_t head_insert = B_FALSE;
2294 	ip6_t	*ip6h;
2295 	ip6i_t	*ip6i;
2296 	mblk_t *data_mp;
2297 
2298 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2299 
2300 	if (mp->b_datap->db_type == M_CTL)
2301 		data_mp = mp->b_cont;
2302 	else
2303 		data_mp = mp;
2304 	ip6h = (ip6_t *)data_mp->b_rptr;
2305 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2306 		/*
2307 		 * This message should have been pulled up already in
2308 		 * ip_wput_v6. We can't do pullups here because the message
2309 		 * could be from the nce_qd_mp which could have b_next/b_prev
2310 		 * non-NULL.
2311 		 */
2312 		ip6i = (ip6i_t *)ip6h;
2313 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2314 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2315 		/*
2316 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2317 		 * This has 2 aspects mentioned below.
2318 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2319 		 * This ensures that next retransmit of ND solicitation
2320 		 * will use the interface specified by the probe packet,
2321 		 * for both NS and NA. This corresponds to the src address
2322 		 * in the IPv6 packet. If we insert at tail, we will be
2323 		 * depending on the packet at the head for successful
2324 		 * ND resolution. This is not reliable, because the interface
2325 		 * on which the NA arrives could be different from the interface
2326 		 * on which the NS was sent, and if the receiving interface is
2327 		 * failed, it will appear that the sending interface is also
2328 		 * failed, causing in.mpathd to misdiagnose this as link
2329 		 * failure.
2330 		 * 2. Drop the original packet, if the ND resolution did not
2331 		 * succeed in the first attempt. However we will create the
2332 		 * nce and the ire, as soon as the ND resolution succeeds.
2333 		 * We don't gain anything by queueing multiple probe packets
2334 		 * and sending them back-to-back once resolution succeeds.
2335 		 * It is sufficient to send just 1 packet after ND resolution
2336 		 * succeeds. Since mpathd is sending down probe packets at a
2337 		 * constant rate, we don't need to send the queued packet. We
2338 		 * need to queue it only for NDP resolution. The benefit of
2339 		 * dropping the probe packets that were delayed in ND
2340 		 * resolution, is that in.mpathd will not see inflated
2341 		 * RTT. If the ND resolution does not succeed within
2342 		 * in.mpathd's failure detection time, mpathd may detect
2343 		 * a failure, and it does not matter whether the packet
2344 		 * was queued or dropped.
2345 		 */
2346 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2347 			head_insert = B_TRUE;
2348 	}
2349 
2350 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2351 	    mpp = &(*mpp)->b_next) {
2352 		if (++count >
2353 		    nce->nce_ill->ill_max_buf) {
2354 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2355 
2356 			nce->nce_qd_mp->b_next = NULL;
2357 			nce->nce_qd_mp->b_prev = NULL;
2358 			freemsg(nce->nce_qd_mp);
2359 			ip1dbg(("nce_queue_mp: pkt dropped\n"));
2360 			nce->nce_qd_mp = tmp;
2361 		}
2362 	}
2363 	/* put this on the list */
2364 	if (head_insert) {
2365 		mp->b_next = nce->nce_qd_mp;
2366 		nce->nce_qd_mp = mp;
2367 	} else {
2368 		*mpp = mp;
2369 	}
2370 }
2371 
2372 /*
2373  * Called when address resolution failed due to a timeout.
2374  * Send an ICMP unreachable in response to all queued packets.
2375  */
2376 void
2377 nce_resolv_failed(nce_t *nce)
2378 {
2379 	mblk_t	*mp, *nxt_mp, *first_mp;
2380 	char	buf[INET6_ADDRSTRLEN];
2381 	ip6_t *ip6h;
2382 	zoneid_t zoneid = GLOBAL_ZONEID;
2383 
2384 	ip1dbg(("nce_resolv_failed: dst %s\n",
2385 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
2386 	mutex_enter(&nce->nce_lock);
2387 	mp = nce->nce_qd_mp;
2388 	nce->nce_qd_mp = NULL;
2389 	mutex_exit(&nce->nce_lock);
2390 	while (mp != NULL) {
2391 		nxt_mp = mp->b_next;
2392 		mp->b_next = NULL;
2393 		mp->b_prev = NULL;
2394 
2395 		first_mp = mp;
2396 		if (mp->b_datap->db_type == M_CTL) {
2397 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
2398 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
2399 			zoneid = io->ipsec_out_zoneid;
2400 			ASSERT(zoneid != ALL_ZONES);
2401 			mp = mp->b_cont;
2402 		}
2403 
2404 		ip6h = (ip6_t *)mp->b_rptr;
2405 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
2406 			ip6i_t *ip6i;
2407 			/*
2408 			 * This message should have been pulled up already
2409 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
2410 			 * the header is pulled up.
2411 			 */
2412 			ip6i = (ip6i_t *)ip6h;
2413 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
2414 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2415 			mp->b_rptr += sizeof (ip6i_t);
2416 		}
2417 		/*
2418 		 * Ignore failure since icmp_unreachable_v6 will silently
2419 		 * drop packets with an unspecified source address.
2420 		 */
2421 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
2422 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
2423 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE);
2424 		mp = nxt_mp;
2425 	}
2426 }
2427 
2428 /*
2429  * Called by SIOCSNDP* ioctl to add/change an nce entry
2430  * and the corresponding attributes.
2431  * Disallow states other than ND_REACHABLE or ND_STALE.
2432  */
2433 int
2434 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2435 {
2436 	sin6_t		*sin6;
2437 	in6_addr_t	*addr;
2438 	nce_t		*nce;
2439 	int		err;
2440 	uint16_t	new_flags = 0;
2441 	uint16_t	old_flags = 0;
2442 	int		inflags = lnr->lnr_flags;
2443 
2444 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2445 	    (lnr->lnr_state_create != ND_STALE))
2446 		return (EINVAL);
2447 
2448 	sin6 = (sin6_t *)&lnr->lnr_addr;
2449 	addr = &sin6->sin6_addr;
2450 
2451 	mutex_enter(&ndp_g_lock);
2452 	/* We know it can not be mapping so just look in the hash table */
2453 	nce = nce_lookup_addr(ill, addr);
2454 	if (nce != NULL)
2455 		new_flags = nce->nce_flags;
2456 
2457 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2458 	case NDF_ISROUTER_ON:
2459 		new_flags |= NCE_F_ISROUTER;
2460 		break;
2461 	case NDF_ISROUTER_OFF:
2462 		new_flags &= ~NCE_F_ISROUTER;
2463 		break;
2464 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2465 		mutex_exit(&ndp_g_lock);
2466 		if (nce != NULL)
2467 			NCE_REFRELE(nce);
2468 		return (EINVAL);
2469 	}
2470 
2471 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2472 	case NDF_ANYCAST_ON:
2473 		new_flags |= NCE_F_ANYCAST;
2474 		break;
2475 	case NDF_ANYCAST_OFF:
2476 		new_flags &= ~NCE_F_ANYCAST;
2477 		break;
2478 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2479 		mutex_exit(&ndp_g_lock);
2480 		if (nce != NULL)
2481 			NCE_REFRELE(nce);
2482 		return (EINVAL);
2483 	}
2484 
2485 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
2486 	case NDF_PROXY_ON:
2487 		new_flags |= NCE_F_PROXY;
2488 		break;
2489 	case NDF_PROXY_OFF:
2490 		new_flags &= ~NCE_F_PROXY;
2491 		break;
2492 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
2493 		mutex_exit(&ndp_g_lock);
2494 		if (nce != NULL)
2495 			NCE_REFRELE(nce);
2496 		return (EINVAL);
2497 	}
2498 
2499 	if (nce == NULL) {
2500 		err = ndp_add(ill,
2501 		    (uchar_t *)lnr->lnr_hdw_addr,
2502 		    addr,
2503 		    &ipv6_all_ones,
2504 		    &ipv6_all_zeros,
2505 		    0,
2506 		    new_flags,
2507 		    lnr->lnr_state_create,
2508 		    &nce);
2509 		if (err != 0) {
2510 			mutex_exit(&ndp_g_lock);
2511 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
2512 			return (err);
2513 		}
2514 	}
2515 	old_flags = nce->nce_flags;
2516 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
2517 		/*
2518 		 * Router turned to host, delete all ires.
2519 		 * XXX Just delete the entry, but we need to add too.
2520 		 */
2521 		nce->nce_flags &= ~NCE_F_ISROUTER;
2522 		mutex_exit(&ndp_g_lock);
2523 		ndp_delete(nce);
2524 		NCE_REFRELE(nce);
2525 		return (0);
2526 	}
2527 	mutex_exit(&ndp_g_lock);
2528 
2529 	mutex_enter(&nce->nce_lock);
2530 	nce->nce_flags = new_flags;
2531 	mutex_exit(&nce->nce_lock);
2532 	/*
2533 	 * Note that we ignore the state at this point, which
2534 	 * should be either STALE or REACHABLE.  Instead we let
2535 	 * the link layer address passed in to determine the state
2536 	 * much like incoming packets.
2537 	 */
2538 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
2539 	NCE_REFRELE(nce);
2540 	return (0);
2541 }
2542 
2543 /*
2544  * If the device driver supports it, we make nce_fp_mp to have
2545  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
2546  * The caller insures there is hold on nce for this function.
2547  * Note that since ill_fastpath_probe() copies the mblk there is
2548  * no need for the hold beyond this function.
2549  */
2550 static void
2551 nce_fastpath(nce_t *nce)
2552 {
2553 	ill_t	*ill = nce->nce_ill;
2554 	int res;
2555 
2556 	ASSERT(ill != NULL);
2557 	if (nce->nce_fp_mp != NULL) {
2558 		/* Already contains fastpath info */
2559 		return;
2560 	}
2561 	if (nce->nce_res_mp != NULL) {
2562 		nce_fastpath_list_add(nce);
2563 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
2564 		/*
2565 		 * EAGAIN is an indication of a transient error
2566 		 * i.e. allocation failure etc. leave the nce in the list it
2567 		 * will be updated when another probe happens for another ire
2568 		 * if not it will be taken out of the list when the ire is
2569 		 * deleted.
2570 		 */
2571 
2572 		if (res != 0 && res != EAGAIN)
2573 			nce_fastpath_list_delete(nce);
2574 	}
2575 }
2576 
2577 /*
2578  * Drain the list of nce's waiting for fastpath response.
2579  */
2580 void
2581 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
2582     void *arg)
2583 {
2584 
2585 	nce_t *next_nce;
2586 	nce_t *current_nce;
2587 	nce_t *first_nce;
2588 	nce_t *prev_nce = NULL;
2589 
2590 	ASSERT(ill != NULL);
2591 
2592 	mutex_enter(&ill->ill_lock);
2593 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
2594 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
2595 		next_nce = current_nce->nce_fastpath;
2596 		/*
2597 		 * Take it off the list if we're flushing, or if the callback
2598 		 * routine tells us to do so.  Otherwise, leave the nce in the
2599 		 * fastpath list to handle any pending response from the lower
2600 		 * layer.  We can't drain the list when the callback routine
2601 		 * comparison failed, because the response is asynchronous in
2602 		 * nature, and may not arrive in the same order as the list
2603 		 * insertion.
2604 		 */
2605 		if (func == NULL || func(current_nce, arg)) {
2606 			current_nce->nce_fastpath = NULL;
2607 			if (current_nce == first_nce)
2608 				ill->ill_fastpath_list = first_nce = next_nce;
2609 			else
2610 				prev_nce->nce_fastpath = next_nce;
2611 		} else {
2612 			/* previous element that is still in the list */
2613 			prev_nce = current_nce;
2614 		}
2615 		current_nce = next_nce;
2616 	}
2617 	mutex_exit(&ill->ill_lock);
2618 }
2619 
2620 /*
2621  * Add nce to the nce fastpath list.
2622  */
2623 void
2624 nce_fastpath_list_add(nce_t *nce)
2625 {
2626 	ill_t *ill;
2627 
2628 	ill = nce->nce_ill;
2629 	ASSERT(ill != NULL);
2630 
2631 	mutex_enter(&ill->ill_lock);
2632 	mutex_enter(&nce->nce_lock);
2633 
2634 	/*
2635 	 * if nce has not been deleted and
2636 	 * is not already in the list add it.
2637 	 */
2638 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
2639 	    (nce->nce_fastpath == NULL)) {
2640 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
2641 		ill->ill_fastpath_list = nce;
2642 	}
2643 
2644 	mutex_exit(&nce->nce_lock);
2645 	mutex_exit(&ill->ill_lock);
2646 }
2647 
2648 /*
2649  * remove nce from the nce fastpath list.
2650  */
2651 void
2652 nce_fastpath_list_delete(nce_t *nce)
2653 {
2654 	nce_t *nce_ptr;
2655 
2656 	ill_t *ill;
2657 
2658 	ill = nce->nce_ill;
2659 	ASSERT(ill != NULL);
2660 
2661 	mutex_enter(&ill->ill_lock);
2662 	if (nce->nce_fastpath == NULL)
2663 		goto done;
2664 
2665 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
2666 
2667 	if (ill->ill_fastpath_list == nce) {
2668 		ill->ill_fastpath_list = nce->nce_fastpath;
2669 	} else {
2670 		nce_ptr = ill->ill_fastpath_list;
2671 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
2672 			if (nce_ptr->nce_fastpath == nce) {
2673 				nce_ptr->nce_fastpath = nce->nce_fastpath;
2674 				break;
2675 			}
2676 			nce_ptr = nce_ptr->nce_fastpath;
2677 		}
2678 	}
2679 
2680 	nce->nce_fastpath = NULL;
2681 done:
2682 	mutex_exit(&ill->ill_lock);
2683 }
2684 
2685 /*
2686  * Update all NCE's that are not in fastpath mode and
2687  * have an nce_fp_mp that matches mp. mp->b_cont contains
2688  * the fastpath header.
2689  *
2690  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
2691  */
2692 boolean_t
2693 ndp_fastpath_update(nce_t *nce, void *arg)
2694 {
2695 	mblk_t 	*mp, *fp_mp;
2696 	uchar_t	*mp_rptr, *ud_mp_rptr;
2697 	mblk_t	*ud_mp = nce->nce_res_mp;
2698 	ptrdiff_t	cmplen;
2699 
2700 	if (nce->nce_flags & NCE_F_MAPPING)
2701 		return (B_TRUE);
2702 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
2703 		return (B_TRUE);
2704 
2705 	ip2dbg(("ndp_fastpath_update: trying\n"));
2706 	mp = (mblk_t *)arg;
2707 	mp_rptr = mp->b_rptr;
2708 	cmplen = mp->b_wptr - mp_rptr;
2709 	ASSERT(cmplen >= 0);
2710 	ud_mp_rptr = ud_mp->b_rptr;
2711 	/*
2712 	 * The nce is locked here to prevent any other threads
2713 	 * from accessing and changing nce_res_mp when the IPv6 address
2714 	 * becomes resolved to an lla while we're in the middle
2715 	 * of looking at and comparing the hardware address (lla).
2716 	 * It is also locked to prevent multiple threads in nce_fastpath_update
2717 	 * from examining nce_res_mp atthe same time.
2718 	 */
2719 	mutex_enter(&nce->nce_lock);
2720 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
2721 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
2722 		mutex_exit(&nce->nce_lock);
2723 		/*
2724 		 * Don't take the ire off the fastpath list yet,
2725 		 * since the response may come later.
2726 		 */
2727 		return (B_FALSE);
2728 	}
2729 	/* Matched - install mp as the fastpath mp */
2730 	ip1dbg(("ndp_fastpath_update: match\n"));
2731 	fp_mp = dupb(mp->b_cont);
2732 	if (fp_mp != NULL) {
2733 		nce->nce_fp_mp = fp_mp;
2734 	}
2735 	mutex_exit(&nce->nce_lock);
2736 	return (B_TRUE);
2737 }
2738 
2739 /*
2740  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
2741  * driver.  Note that it assumes IP is exclusive...
2742  */
2743 /* ARGSUSED */
2744 void
2745 ndp_fastpath_flush(nce_t *nce, char *arg)
2746 {
2747 	if (nce->nce_flags & NCE_F_MAPPING)
2748 		return;
2749 	/* No fastpath info? */
2750 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
2751 		return;
2752 
2753 	/* Just delete the NCE... */
2754 	ndp_delete(nce);
2755 }
2756 
2757 /*
2758  * Return a pointer to a given option in the packet.
2759  * Assumes that option part of the packet have already been validated.
2760  */
2761 nd_opt_hdr_t *
2762 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
2763 {
2764 	while (optlen > 0) {
2765 		if (opt->nd_opt_type == opt_type)
2766 			return (opt);
2767 		optlen -= 8 * opt->nd_opt_len;
2768 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
2769 	}
2770 	return (NULL);
2771 }
2772 
2773 /*
2774  * Verify all option lengths present are > 0, also check to see
2775  * if the option lengths and packet length are consistent.
2776  */
2777 boolean_t
2778 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
2779 {
2780 	ASSERT(opt != NULL);
2781 	while (optlen > 0) {
2782 		if (opt->nd_opt_len == 0)
2783 			return (B_FALSE);
2784 		optlen -= 8 * opt->nd_opt_len;
2785 		if (optlen < 0)
2786 			return (B_FALSE);
2787 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
2788 	}
2789 	return (B_TRUE);
2790 }
2791 
2792 /*
2793  * ndp_walk function.
2794  * Free a fraction of the NCE cache entries.
2795  * A fraction of zero means to not free any in that category.
2796  */
2797 void
2798 ndp_cache_reclaim(nce_t *nce, char *arg)
2799 {
2800 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
2801 	uint_t	rand;
2802 
2803 	if (nce->nce_flags & NCE_F_PERMANENT)
2804 		return;
2805 
2806 	rand = (uint_t)lbolt +
2807 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
2808 	if (ncr->ncr_host != 0 &&
2809 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
2810 		ndp_delete(nce);
2811 		return;
2812 	}
2813 }
2814 
2815 /*
2816  * ndp_walk function.
2817  * Count the number of NCEs that can be deleted.
2818  * These would be hosts but not routers.
2819  */
2820 void
2821 ndp_cache_count(nce_t *nce, char *arg)
2822 {
2823 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
2824 
2825 	if (nce->nce_flags & NCE_F_PERMANENT)
2826 		return;
2827 
2828 	ncc->ncc_total++;
2829 	if (!(nce->nce_flags & NCE_F_ISROUTER))
2830 		ncc->ncc_host++;
2831 }
2832 
2833 #ifdef NCE_DEBUG
2834 th_trace_t *
2835 th_trace_nce_lookup(nce_t *nce)
2836 {
2837 	int bucket_id;
2838 	th_trace_t *th_trace;
2839 
2840 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2841 
2842 	bucket_id = IP_TR_HASH(curthread);
2843 	ASSERT(bucket_id < IP_TR_HASH_MAX);
2844 
2845 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
2846 	    th_trace = th_trace->th_next) {
2847 		if (th_trace->th_id == curthread)
2848 			return (th_trace);
2849 	}
2850 	return (NULL);
2851 }
2852 
2853 void
2854 nce_trace_ref(nce_t *nce)
2855 {
2856 	int bucket_id;
2857 	th_trace_t *th_trace;
2858 
2859 	/*
2860 	 * Attempt to locate the trace buffer for the curthread.
2861 	 * If it does not exist, then allocate a new trace buffer
2862 	 * and link it in list of trace bufs for this ipif, at the head
2863 	 */
2864 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2865 
2866 	if (nce->nce_trace_disable == B_TRUE)
2867 		return;
2868 
2869 	th_trace = th_trace_nce_lookup(nce);
2870 	if (th_trace == NULL) {
2871 		bucket_id = IP_TR_HASH(curthread);
2872 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
2873 		    KM_NOSLEEP);
2874 		if (th_trace == NULL) {
2875 			nce->nce_trace_disable = B_TRUE;
2876 			nce_trace_inactive(nce);
2877 			return;
2878 		}
2879 		th_trace->th_id = curthread;
2880 		th_trace->th_next = nce->nce_trace[bucket_id];
2881 		th_trace->th_prev = &nce->nce_trace[bucket_id];
2882 		if (th_trace->th_next != NULL)
2883 			th_trace->th_next->th_prev = &th_trace->th_next;
2884 		nce->nce_trace[bucket_id] = th_trace;
2885 	}
2886 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
2887 	th_trace->th_refcnt++;
2888 	th_trace_rrecord(th_trace);
2889 }
2890 
2891 void
2892 nce_untrace_ref(nce_t *nce)
2893 {
2894 	th_trace_t *th_trace;
2895 
2896 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2897 
2898 	if (nce->nce_trace_disable == B_TRUE)
2899 		return;
2900 
2901 	th_trace = th_trace_nce_lookup(nce);
2902 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
2903 
2904 	th_trace_rrecord(th_trace);
2905 	th_trace->th_refcnt--;
2906 }
2907 
2908 void
2909 nce_trace_inactive(nce_t *nce)
2910 {
2911 	th_trace_t *th_trace;
2912 	int i;
2913 
2914 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2915 
2916 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
2917 		while (nce->nce_trace[i] != NULL) {
2918 			th_trace = nce->nce_trace[i];
2919 
2920 			/* unlink th_trace and free it */
2921 			nce->nce_trace[i] = th_trace->th_next;
2922 			if (th_trace->th_next != NULL)
2923 				th_trace->th_next->th_prev =
2924 				    &nce->nce_trace[i];
2925 
2926 			th_trace->th_next = NULL;
2927 			th_trace->th_prev = NULL;
2928 			kmem_free(th_trace, sizeof (th_trace_t));
2929 		}
2930 	}
2931 
2932 }
2933 
2934 /* ARGSUSED */
2935 int
2936 nce_thread_exit(nce_t *nce, caddr_t arg)
2937 {
2938 	th_trace_t	*th_trace;
2939 
2940 	mutex_enter(&nce->nce_lock);
2941 	th_trace = th_trace_nce_lookup(nce);
2942 
2943 	if (th_trace == NULL) {
2944 		mutex_exit(&nce->nce_lock);
2945 		return (0);
2946 	}
2947 
2948 	ASSERT(th_trace->th_refcnt == 0);
2949 
2950 	/* unlink th_trace and free it */
2951 	*th_trace->th_prev = th_trace->th_next;
2952 	if (th_trace->th_next != NULL)
2953 		th_trace->th_next->th_prev = th_trace->th_prev;
2954 	th_trace->th_next = NULL;
2955 	th_trace->th_prev = NULL;
2956 	kmem_free(th_trace, sizeof (th_trace_t));
2957 	mutex_exit(&nce->nce_lock);
2958 	return (0);
2959 }
2960 #endif
2961