xref: /titanic_50/usr/src/uts/common/inet/ip/ip_ndp.c (revision 8461248208fabd3a8230615f8615e5bf1b4dcdcb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/strlog.h>
35 #include <sys/dlpi.h>
36 #include <sys/sockio.h>
37 #include <sys/tiuser.h>
38 #include <sys/tihdr.h>
39 #include <sys/socket.h>
40 #include <sys/ddi.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/vtrace.h>
44 #include <sys/kmem.h>
45 #include <sys/zone.h>
46 
47 #include <net/if.h>
48 #include <net/if_types.h>
49 #include <net/if_dl.h>
50 #include <net/route.h>
51 #include <sys/sockio.h>
52 #include <netinet/in.h>
53 #include <netinet/in_systm.h>
54 #include <netinet/ip6.h>
55 #include <netinet/icmp6.h>
56 
57 #include <inet/common.h>
58 #include <inet/mi.h>
59 #include <inet/mib2.h>
60 #include <inet/nd.h>
61 #include <inet/arp.h>
62 #include <inet/ip.h>
63 #include <inet/ip_multi.h>
64 #include <inet/ip_if.h>
65 #include <inet/ip_ire.h>
66 #include <inet/ip_rts.h>
67 #include <inet/ip6.h>
68 #include <inet/ip_ndp.h>
69 #include <inet/ipsec_impl.h>
70 #include <inet/ipsec_info.h>
71 
/*
 * Functions whose names begin with nce_ are static (private to this file),
 * while functions whose names begin with ndp_ are used by the rest of IP.
 */
76 
/*
 * Forward declarations for file-local (static) helpers.
 */
static	boolean_t nce_cmp_ll_addr(nce_t *nce, char *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_fastpath(nce_t *nce);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *ill, const in6_addr_t *addr);
static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
    ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static	void	lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf);
/* Defined outside this file; declared here explicitly. */
extern void	th_trace_rrecord(th_trace_t *);

#ifdef NCE_DEBUG
/* Only available in NCE_DEBUG builds; called from ndp_inactive(). */
void	nce_trace_inactive(nce_t *);
#endif
103 
/*
 * NDP Cache Entry Hash Table.
 * Entries whose NCE_F_MAPPING flag is set live on the separate
 * nce_mask_entries list; all others are hashed by address into
 * nce_hash_tbl (see ndp_add()).
 */
#define	NCE_TABLE_SIZE	256
static	nce_t	*nce_hash_tbl[NCE_TABLE_SIZE];
static	nce_t	*nce_mask_entries;	/* mask not all ones */
static	int	ndp_g_walker = 0;	/* # of active threads */
					/* walking the nce lists */
/*
 * ndp_g_walker_cleanup is set to B_TRUE when a deletion had to be deferred
 * because a walker was active; the last walker then unlinks the condemned
 * entries (see ndp_delete() and ndp_walk_impl()).
 */
static	boolean_t	ndp_g_walker_cleanup = B_FALSE;
112 
/*
 * True when addr is a solicited-node multicast address, i.e. has the form
 * ff02:0:0:0:0:1:ffXX:XXXX.  The first 32-bit word must equal ff02:0000
 * exactly (a mask-and-compare would also accept other flag/scope values
 * such as ff03:: or non-zero bits in bytes 2-3); only the low 24 bits of
 * the last word are variable.  The constants are written in the byte order
 * the 32-bit words have in memory on each kind of host.
 */
#ifdef _BIG_ENDIAN
#define	IN6_IS_ADDR_MC_SOLICITEDNODE(addr) \
	(((addr)->s6_addr32[0] == 0xff020000) && \
	((addr)->s6_addr32[1] == 0x0) && \
	((addr)->s6_addr32[2] == 0x00000001) && \
	(((addr)->s6_addr32[3] & 0xff000000) == 0xff000000))
#else	/* _BIG_ENDIAN */
#define	IN6_IS_ADDR_MC_SOLICITEDNODE(addr) \
	(((addr)->s6_addr32[0] == 0x000002ff) && \
	((addr)->s6_addr32[1] == 0x0) && \
	((addr)->s6_addr32[2] == 0x01000000) && \
	(((addr)->s6_addr32[3] & 0x000000ff) == 0x000000ff))
#endif
126 
/*
 * Address of the nce_hash_tbl bucket head for the given IPv6 address.
 * Note that addr is passed by value (callers write NCE_HASH_PTR(*addr)).
 */
#define	NCE_HASH_PTR(addr) \
	(&(nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
129 
130 /*
131  * NDP Cache Entry creation routine.
132  * Mapped entries will never do NUD .
133  * This routine must always be called with ndp_g_lock held.
134  * Prior to return, nce_refcnt is incremented.
135  */
136 int
137 ndp_add(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
138     const in6_addr_t *mask, const in6_addr_t *extract_mask,
139     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
140     nce_t **newnce)
141 {
142 static	nce_t		nce_nil;
143 	nce_t		*nce;
144 	mblk_t		*mp;
145 	mblk_t		*template;
146 	nce_t		**ncep;
147 	int		err = 0;
148 	boolean_t	dropped = B_FALSE;
149 
150 	ASSERT(MUTEX_HELD(&ndp_g_lock));
151 	ASSERT(ill != NULL);
152 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
153 		ip0dbg(("ndp_add: no addr\n"));
154 		return (EINVAL);
155 	}
156 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
157 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
158 		return (EINVAL);
159 	}
160 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
161 	    (flags & NCE_F_MAPPING)) {
162 		ip0dbg(("ndp_add: extract mask zero for mapping"));
163 		return (EINVAL);
164 	}
165 	/*
166 	 * Allocate the mblk to hold the nce.
167 	 *
168 	 * XXX This can come out of a separate cache - nce_cache.
169 	 * We don't need the mp anymore as there are no more
170 	 * "qwriter"s
171 	 */
172 	mp = allocb(sizeof (nce_t), BPRI_MED);
173 	if (mp == NULL)
174 		return (ENOMEM);
175 
176 	nce = (nce_t *)mp->b_rptr;
177 	mp->b_wptr = (uchar_t *)&nce[1];
178 	*nce = nce_nil;
179 
180 	/*
181 	 * This one holds link layer address
182 	 */
183 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
184 		template = nce_udreq_alloc(ill);
185 	} else {
186 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
187 		ASSERT((ill->ill_resolver_mp != NULL));
188 		template = copyb(ill->ill_resolver_mp);
189 	}
190 	if (template == NULL) {
191 		freeb(mp);
192 		return (ENOMEM);
193 	}
194 	nce->nce_ill = ill;
195 	nce->nce_flags = flags;
196 	nce->nce_state = state;
197 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
198 	nce->nce_rcnt = ill->ill_xmit_count;
199 	nce->nce_addr = *addr;
200 	nce->nce_mask = *mask;
201 	nce->nce_extract_mask = *extract_mask;
202 	nce->nce_ll_extract_start = hw_extract_start;
203 	nce->nce_fp_mp = NULL;
204 	nce->nce_res_mp = template;
205 	if (state == ND_REACHABLE)
206 		nce->nce_last = TICK_TO_MSEC(lbolt64);
207 	else
208 		nce->nce_last = 0;
209 	nce->nce_qd_mp = NULL;
210 	nce->nce_mp = mp;
211 	if (hw_addr != NULL)
212 		nce_set_ll(nce, hw_addr);
213 	/* This one is for nce getting created */
214 	nce->nce_refcnt = 1;
215 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
216 	if (nce->nce_flags & NCE_F_MAPPING) {
217 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
218 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
219 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
220 		ncep = &nce_mask_entries;
221 	} else {
222 		ncep = ((nce_t **)NCE_HASH_PTR(*addr));
223 	}
224 
225 #ifdef NCE_DEBUG
226 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
227 #endif
228 	/*
229 	 * Atomically ensure that the ill is not CONDEMNED, before
230 	 * adding the NCE.
231 	 */
232 	mutex_enter(&ill->ill_lock);
233 	if (ill->ill_state_flags & ILL_CONDEMNED) {
234 		mutex_exit(&ill->ill_lock);
235 		freeb(mp);
236 		return (EINVAL);
237 	}
238 	if ((nce->nce_next = *ncep) != NULL)
239 		nce->nce_next->nce_ptpn = &nce->nce_next;
240 	*ncep = nce;
241 	nce->nce_ptpn = ncep;
242 	*newnce = nce;
243 	/* This one is for nce being used by an active thread */
244 	NCE_REFHOLD(*newnce);
245 
246 	/* Bump up the number of nce's referencing this ill */
247 	ill->ill_nce_cnt++;
248 	mutex_exit(&ill->ill_lock);
249 
250 	/*
251 	 * Before we insert the nce, honor the UNSOL_ADV flag.
252 	 * We cannot hold the ndp_g_lock and call nce_xmit
253 	 * which does a putnext.
254 	 */
255 	if (flags & NCE_F_UNSOL_ADV) {
256 		flags |= NDP_ORIDE;
257 		/*
258 		 * We account for the transmit below by assigning one
259 		 * less than the ndd variable. Subsequent decrements
260 		 * are done in ndp_timer.
261 		 */
262 		mutex_enter(&nce->nce_lock);
263 		mutex_exit(&ndp_g_lock);
264 		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
265 		mutex_exit(&nce->nce_lock);
266 		dropped = nce_xmit(ill,
267 		    ND_NEIGHBOR_ADVERT,
268 		    ill,	/* ill to be used for extracting ill_nd_lla */
269 		    B_TRUE,	/* use ill_nd_lla */
270 		    addr,	/* Source and target of the advertisement pkt */
271 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
272 		    flags);
273 		mutex_enter(&nce->nce_lock);
274 		if (dropped)
275 			nce->nce_unsolicit_count++;
276 		if (nce->nce_unsolicit_count != 0) {
277 			nce->nce_timeout_id = timeout(ndp_timer, nce,
278 			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
279 		}
280 		mutex_exit(&nce->nce_lock);
281 		mutex_enter(&ndp_g_lock);
282 	}
283 done:
284 	return (err);
285 }
286 
287 int
288 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
289     const in6_addr_t *mask, const in6_addr_t *extract_mask,
290     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
291     nce_t **newnce)
292 {
293 	int	err = 0;
294 	nce_t	*nce;
295 
296 	mutex_enter(&ndp_g_lock);
297 	nce = nce_lookup_addr(ill, addr);
298 	if (nce == NULL) {
299 		err = ndp_add(ill,
300 		    hw_addr,
301 		    addr,
302 		    mask,
303 		    extract_mask,
304 		    hw_extract_start,
305 		    flags,
306 		    state,
307 		    newnce);
308 	} else {
309 		*newnce = nce;
310 		err = EEXIST;
311 	}
312 	mutex_exit(&ndp_g_lock);
313 	return (err);
314 }
315 
316 /*
317  * Remove all the CONDEMNED nces from the appropriate hash table.
318  * We create a private list of NCEs, these may have ires pointing
319  * to them, so the list will be passed through to clean up dependent
320  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
321  */
322 static void
323 nce_remove(nce_t *nce, nce_t **free_nce_list)
324 {
325 	nce_t *nce1;
326 	nce_t **ptpn;
327 
328 	ASSERT(MUTEX_HELD(&ndp_g_lock));
329 	ASSERT(ndp_g_walker == 0);
330 	for (; nce; nce = nce1) {
331 		nce1 = nce->nce_next;
332 		mutex_enter(&nce->nce_lock);
333 		if (nce->nce_flags & NCE_F_CONDEMNED) {
334 			ptpn = nce->nce_ptpn;
335 			nce1 = nce->nce_next;
336 			if (nce1 != NULL)
337 				nce1->nce_ptpn = ptpn;
338 			*ptpn = nce1;
339 			nce->nce_ptpn = NULL;
340 			nce->nce_next = NULL;
341 			nce->nce_next = *free_nce_list;
342 			*free_nce_list = nce;
343 		}
344 		mutex_exit(&nce->nce_lock);
345 	}
346 }
347 
/*
 * Delete an nce.  The caller must hold a reference on the nce.
 *
 * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
 *    will return this NCE. Also no new IREs will be created that
 *    point to this NCE (See ire_add_v6).  Also no new timeouts will
 *    be started (See NDP_RESTART_TIMER).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 * 5. Delete all IREs pointing to this NCE.
 *
 * If the nce is already CONDEMNED the call is a no-op; another thread is
 * performing (or has performed) the delete.
 */
void
ndp_delete(nce_t *nce)
{
	nce_t	**ptpn;
	nce_t	*nce1;

	/* Serialize deletes */
	mutex_enter(&nce->nce_lock);
	if (nce->nce_flags & NCE_F_CONDEMNED) {
		/* Some other thread is doing the delete */
		mutex_exit(&nce->nce_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(nce->nce_refcnt >= 2);
	nce->nce_flags |= NCE_F_CONDEMNED;
	mutex_exit(&nce->nce_lock);

	nce_fastpath_list_delete(nce);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (nce->nce_timeout_id != 0) {
		(void) untimeout(nce->nce_timeout_id);
		nce->nce_timeout_id = 0;
	}

	mutex_enter(&ndp_g_lock);
	if (nce->nce_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this nce from
		 * the list after we marked the nce CONDEMNED and before
		 * we grabbed the ndp_g_lock.
		 */
		mutex_exit(&ndp_g_lock);
		return;
	}
	if (ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp_g_lock);
		return;
	}

	/*
	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = nce->nce_ptpn;
	nce1 = nce->nce_next;
	if (nce1 != NULL)
		nce1->nce_ptpn = ptpn;
	*ptpn = nce1;
	nce->nce_ptpn = NULL;
	nce->nce_next = NULL;
	mutex_exit(&ndp_g_lock);

	/*
	 * Deletes the IRE_CACHE entries using this nce and drops one
	 * reference on the nce (see nce_ire_delete()).
	 */
	nce_ire_delete(nce);
}
426 
427 void
428 ndp_inactive(nce_t *nce)
429 {
430 	mblk_t		**mpp;
431 	ill_t		*ill;
432 
433 	ASSERT(nce->nce_refcnt == 0);
434 	ASSERT(MUTEX_HELD(&nce->nce_lock));
435 	ASSERT(nce->nce_fastpath == NULL);
436 
437 	/* Free all nce allocated messages */
438 	mpp = &nce->nce_first_mp_to_free;
439 	do {
440 		while (*mpp != NULL) {
441 			mblk_t  *mp;
442 
443 			mp = *mpp;
444 			*mpp = mp->b_next;
445 			mp->b_next = NULL;
446 			mp->b_prev = NULL;
447 			freemsg(mp);
448 		}
449 	} while (mpp++ != &nce->nce_last_mp_to_free);
450 
451 #ifdef NCE_DEBUG
452 	nce_trace_inactive(nce);
453 #endif
454 
455 	ill = nce->nce_ill;
456 	mutex_enter(&ill->ill_lock);
457 	ill->ill_nce_cnt--;
458 	/*
459 	 * If the number of nce's associated with this ill have dropped
460 	 * to zero, check whether we need to restart any operation that
461 	 * is waiting for this to happen.
462 	 */
463 	if (ill->ill_nce_cnt == 0) {
464 		/* ipif_ill_refrele_tail drops the ill_lock */
465 		ipif_ill_refrele_tail(ill);
466 	} else {
467 		mutex_exit(&ill->ill_lock);
468 	}
469 	mutex_destroy(&nce->nce_lock);
470 	freeb(nce->nce_mp);
471 }
472 
473 /*
474  * ndp_walk routine.  Delete the nce if it is associated with the ill
475  * that is going away.  Always called as a writer.
476  */
477 void
478 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
479 {
480 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
481 		ndp_delete(nce);
482 	}
483 }
484 
485 /*
486  * Walk a list of to be inactive NCEs and blow away all the ires.
487  */
488 static void
489 nce_ire_delete_list(nce_t *nce)
490 {
491 	nce_t *nce_next;
492 
493 	ASSERT(nce != NULL);
494 	while (nce != NULL) {
495 		nce_next = nce->nce_next;
496 		nce->nce_next = NULL;
497 
498 		/*
499 		 * It is possible for the last ndp walker (this thread)
500 		 * to come here after ndp_delete has marked the nce CONDEMNED
501 		 * and before it has removed the nce from the fastpath list
502 		 * or called untimeout. So we need to do it here. It is safe
503 		 * for both ndp_delete and this thread to do it twice or
504 		 * even simultaneously since each of the threads has a
505 		 * reference on the nce.
506 		 */
507 		nce_fastpath_list_delete(nce);
508 		/*
509 		 * Cancel any running timer. Timeout can't be restarted
510 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
511 		 * Passing invalid timeout id is fine.
512 		 */
513 		if (nce->nce_timeout_id != 0) {
514 			(void) untimeout(nce->nce_timeout_id);
515 			nce->nce_timeout_id = 0;
516 		}
517 
518 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
519 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
520 		NCE_REFRELE_NOTR(nce);
521 		nce = nce_next;
522 	}
523 }
524 
/*
 * Delete the ires that use this nce when the nce goes away: walk the
 * IRE_CACHE entries on nce->nce_ill, deleting each one whose ire_nce is
 * this nce (see nce_ire_delete1()), then drop one reference on the nce.
 */
/* ARGSUSED */
static void
nce_ire_delete(nce_t *nce)
{
	ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
	    nce_ire_delete1, (char *)nce, nce->nce_ill);
	NCE_REFRELE_NOTR(nce);
}
536 
537 /*
538  * ire_walk routine used to delete every IRE that shares this nce
539  */
540 static void
541 nce_ire_delete1(ire_t *ire, char *nce_arg)
542 {
543 	nce_t	*nce = (nce_t *)nce_arg;
544 
545 	ASSERT(ire->ire_type == IRE_CACHE);
546 
547 	if (ire->ire_nce == nce)
548 		ire_delete(ire);
549 }
550 
551 /*
552  * Cache entry lookup.  Try to find an nce matching the parameters passed.
553  * If one is found, the refcnt on the nce will be incremented.
554  */
555 nce_t *
556 ndp_lookup(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
557 {
558 	nce_t	*nce;
559 
560 	if (!caller_holds_lock)
561 		mutex_enter(&ndp_g_lock);
562 	nce = nce_lookup_addr(ill, addr);
563 	if (nce == NULL)
564 		nce = nce_lookup_mapping(ill, addr);
565 	if (!caller_holds_lock)
566 		mutex_exit(&ndp_g_lock);
567 	return (nce);
568 }
569 
570 /*
571  * Cache entry lookup.  Try to find an nce matching the parameters passed.
572  * Look only for exact entries (no mappings).  If an nce is found, increment
573  * the hold count on that nce.
574  */
575 static nce_t *
576 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
577 {
578 	nce_t	*nce;
579 
580 	ASSERT(ill != NULL);
581 	ASSERT(MUTEX_HELD(&ndp_g_lock));
582 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
583 		return (NULL);
584 	nce = *((nce_t **)NCE_HASH_PTR(*addr));
585 	for (; nce != NULL; nce = nce->nce_next) {
586 		if (nce->nce_ill == ill) {
587 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
588 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
589 			    &ipv6_all_ones)) {
590 				mutex_enter(&nce->nce_lock);
591 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
592 					NCE_REFHOLD_LOCKED(nce);
593 					mutex_exit(&nce->nce_lock);
594 					break;
595 				}
596 				mutex_exit(&nce->nce_lock);
597 			}
598 		}
599 	}
600 	return (nce);
601 }
602 
603 /*
604  * Cache entry lookup.  Try to find an nce matching the parameters passed.
605  * Look only for mappings.
606  */
607 static nce_t *
608 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
609 {
610 	nce_t	*nce;
611 
612 	ASSERT(ill != NULL);
613 	ASSERT(MUTEX_HELD(&ndp_g_lock));
614 	if (!IN6_IS_ADDR_MULTICAST(addr))
615 		return (NULL);
616 	nce = nce_mask_entries;
617 	for (; nce != NULL; nce = nce->nce_next)
618 		if (nce->nce_ill == ill &&
619 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
620 			mutex_enter(&nce->nce_lock);
621 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
622 				NCE_REFHOLD_LOCKED(nce);
623 				mutex_exit(&nce->nce_lock);
624 				break;
625 			}
626 			mutex_exit(&nce->nce_lock);
627 		}
628 	return (nce);
629 }
630 
631 /*
632  * Process passed in parameters either from an incoming packet or via
633  * user ioctl.
634  */
635 void
636 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
637 {
638 	ill_t	*ill = nce->nce_ill;
639 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
640 	mblk_t	*mp;
641 	boolean_t ll_updated = B_FALSE;
642 	boolean_t ll_changed;
643 
644 	/*
645 	 * No updates of link layer address or the neighbor state is
646 	 * allowed, when the cache is in NONUD state.  This still
647 	 * allows for responding to reachability solicitation.
648 	 */
649 	mutex_enter(&nce->nce_lock);
650 	if (nce->nce_state == ND_INCOMPLETE) {
651 		if (hw_addr == NULL) {
652 			mutex_exit(&nce->nce_lock);
653 			return;
654 		}
655 		nce_set_ll(nce, hw_addr);
656 		/*
657 		 * Update nce state and send the queued packets
658 		 * back to ip this time ire will be added.
659 		 */
660 		if (flag & ND_NA_FLAG_SOLICITED) {
661 			nce_update(nce, ND_REACHABLE, NULL);
662 		} else {
663 			nce_update(nce, ND_STALE, NULL);
664 		}
665 		mutex_exit(&nce->nce_lock);
666 		nce_fastpath(nce);
667 		mutex_enter(&nce->nce_lock);
668 		mp = nce->nce_qd_mp;
669 		nce->nce_qd_mp = NULL;
670 		mutex_exit(&nce->nce_lock);
671 		while (mp != NULL) {
672 			mblk_t *nxt_mp;
673 
674 			nxt_mp = mp->b_next;
675 			mp->b_next = NULL;
676 			if (mp->b_prev != NULL) {
677 				ill_t   *inbound_ill;
678 				queue_t *fwdq = NULL;
679 				uint_t ifindex;
680 
681 				ifindex = (uint_t)(uintptr_t)mp->b_prev;
682 				inbound_ill = ill_lookup_on_ifindex(ifindex,
683 				    B_TRUE, NULL, NULL, NULL, NULL);
684 				if (inbound_ill == NULL) {
685 					mp->b_prev = NULL;
686 					freemsg(mp);
687 					return;
688 				} else {
689 					fwdq = inbound_ill->ill_rq;
690 				}
691 				mp->b_prev = NULL;
692 				/*
693 				 * Send a forwarded packet back into ip_rput_v6
694 				 * just as in ire_send_v6().
695 				 * Extract the queue from b_prev (set in
696 				 * ip_rput_data_v6).
697 				 */
698 				if (fwdq != NULL) {
699 					/*
700 					 * Forwarded packets hop count will
701 					 * get decremented in ip_rput_data_v6
702 					 */
703 					put(fwdq, mp);
704 				} else {
705 					/*
706 					 * Send locally originated packets back
707 					 * into * ip_wput_v6.
708 					 */
709 					put(ill->ill_wq, mp);
710 				}
711 				ill_refrele(inbound_ill);
712 			} else {
713 				put(ill->ill_wq, mp);
714 			}
715 			mp = nxt_mp;
716 		}
717 		return;
718 	}
719 	ll_changed = nce_cmp_ll_addr(nce, (char *)hw_addr, hw_addr_len);
720 	if (!is_adv) {
721 		/* If this is a SOLICITATION request only */
722 		if (ll_changed)
723 			nce_update(nce, ND_STALE, hw_addr);
724 		mutex_exit(&nce->nce_lock);
725 		return;
726 	}
727 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
728 		/* If in any other state than REACHABLE, ignore */
729 		if (nce->nce_state == ND_REACHABLE) {
730 			nce_update(nce, ND_STALE, NULL);
731 		}
732 		mutex_exit(&nce->nce_lock);
733 		return;
734 	} else {
735 		if (ll_changed) {
736 			nce_update(nce, ND_UNCHANGED, hw_addr);
737 			ll_updated = B_TRUE;
738 		}
739 		if (flag & ND_NA_FLAG_SOLICITED) {
740 			nce_update(nce, ND_REACHABLE, NULL);
741 		} else {
742 			if (ll_updated) {
743 				nce_update(nce, ND_STALE, NULL);
744 			}
745 		}
746 		mutex_exit(&nce->nce_lock);
747 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
748 		    NCE_F_ISROUTER)) {
749 			ire_t *ire;
750 
751 			/*
752 			 * Router turned to host.  We need to remove the
753 			 * entry as well as any default route that may be
754 			 * using this as a next hop.  This is required by
755 			 * section 7.2.5 of RFC 2461.
756 			 */
757 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
758 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
759 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0,
760 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
761 			    MATCH_IRE_DEFAULT);
762 			if (ire != NULL) {
763 				ip_rts_rtmsg(RTM_DELETE, ire, 0);
764 				ire_delete(ire);
765 				ire_refrele(ire);
766 			}
767 			ndp_delete(nce);
768 		}
769 	}
770 }
771 
772 /*
773  * Pass arg1 to the pfi supplied, along with each nce in existence.
774  * ndp_walk() places a REFHOLD on the nce and drops the lock when
775  * walking the hash list.
776  */
777 void
778 ndp_walk_impl(ill_t *ill, pfi_t pfi, uchar_t *arg1, boolean_t trace)
779 {
780 
781 	nce_t	*nce;
782 	nce_t	*nce1;
783 	nce_t	**ncep;
784 	nce_t	*free_nce_list = NULL;
785 
786 	mutex_enter(&ndp_g_lock);
787 	ndp_g_walker++;	/* Prevent ndp_delete from unlink and free of NCE */
788 	mutex_exit(&ndp_g_lock);
789 	for (ncep = nce_hash_tbl; ncep < A_END(nce_hash_tbl); ncep++) {
790 		for (nce = *ncep; nce; nce = nce1) {
791 			nce1 = nce->nce_next;
792 			if (ill == NULL || nce->nce_ill == ill) {
793 				if (trace) {
794 					NCE_REFHOLD(nce);
795 					(*pfi)(nce, arg1);
796 					NCE_REFRELE(nce);
797 				} else {
798 					NCE_REFHOLD_NOTR(nce);
799 					(*pfi)(nce, arg1);
800 					NCE_REFRELE_NOTR(nce);
801 				}
802 			}
803 		}
804 	}
805 	for (nce = nce_mask_entries; nce; nce = nce1) {
806 		nce1 = nce->nce_next;
807 		if (ill == NULL || nce->nce_ill == ill) {
808 			if (trace) {
809 				NCE_REFHOLD(nce);
810 				(*pfi)(nce, arg1);
811 				NCE_REFRELE(nce);
812 			} else {
813 				NCE_REFHOLD_NOTR(nce);
814 				(*pfi)(nce, arg1);
815 				NCE_REFRELE_NOTR(nce);
816 			}
817 		}
818 	}
819 	mutex_enter(&ndp_g_lock);
820 	ndp_g_walker--;
821 	/*
822 	 * While NCE's are removed from global list they are placed
823 	 * in a private list, to be passed to nce_ire_delete_list().
824 	 * The reason is, there may be ires pointing to this nce
825 	 * which needs to cleaned up.
826 	 */
827 	if (ndp_g_walker_cleanup && ndp_g_walker == 0) {
828 		/* Time to delete condemned entries */
829 		for (ncep = nce_hash_tbl; ncep < A_END(nce_hash_tbl); ncep++) {
830 			nce = *ncep;
831 			if (nce != NULL) {
832 				nce_remove(nce, &free_nce_list);
833 			}
834 		}
835 		nce = nce_mask_entries;
836 		if (nce != NULL) {
837 			nce_remove(nce, &free_nce_list);
838 		}
839 		ndp_g_walker_cleanup = B_FALSE;
840 	}
841 	mutex_exit(&ndp_g_lock);
842 
843 	if (free_nce_list != NULL) {
844 		nce_ire_delete_list(free_nce_list);
845 	}
846 }
847 
/*
 * Walk the nces (all of them, or only those on ill when ill is not NULL),
 * calling pfi(nce, arg1) on each.  Same as ndp_walk_impl() with trace set
 * to B_TRUE.
 */
void
ndp_walk(ill_t *ill, pfi_t pfi, uchar_t *arg1)
{
	ndp_walk_impl(ill, pfi, arg1, B_TRUE);
}
853 
854 /*
855  * Prepend the zoneid using an ipsec_out_t for later use by functions like
856  * ip_rput_v6() after neighbor discovery has taken place.  If the message
857  * block already has a M_CTL at the front of it, then simply set the zoneid
858  * appropriately.
859  */
860 static mblk_t *
861 ndp_prepend_zone(mblk_t *mp, zoneid_t zoneid)
862 {
863 	mblk_t		*first_mp;
864 	ipsec_out_t	*io;
865 
866 	if (mp->b_datap->db_type == M_CTL) {
867 		io = (ipsec_out_t *)mp->b_rptr;
868 		ASSERT(io->ipsec_out_type == IPSEC_OUT);
869 		io->ipsec_out_zoneid = zoneid;
870 		return (mp);
871 	}
872 
873 	first_mp = ipsec_alloc_ipsec_out();
874 	if (first_mp == NULL)
875 		return (NULL);
876 	io = (ipsec_out_t *)first_mp->b_rptr;
877 	/* This is not a secure packet */
878 	io->ipsec_out_secure = B_FALSE;
879 	io->ipsec_out_zoneid = zoneid;
880 	first_mp->b_cont = mp;
881 	return (first_mp);
882 }
883 
/*
 * Process resolve requests.  Handles both mapped entries
 * as well as cases that needs to be send out on the wire.
 * Lookup a NCE for a given IRE.  Regardless of whether one exists
 * or one is created, we defer making ire point to nce until the
 * ire is actually added at which point the nce_refcnt on the nce is
 * incremented.  This is done primarily to have symmetry between ire_add()
 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
 *
 * Return values:
 *	0		an nce exists that is not ND_INCOMPLETE (or the ill
 *			is ILLF_XRESOLV and a new entry was just created);
 *			mp was not queued.
 *	EINPROGRESS	resolution is under way; mp (with the zoneid
 *			prepended) has been handed to nce_solicit() or
 *			queued on the nce.
 *	ENOMEM		the zoneid M_CTL could not be allocated.
 *	EBUSY		nce_solicit() returned 0 for a newly created nce.
 *	other		error from nce_set_multicast() or
 *			ndp_lookup_then_add().
 * On ENOMEM and EBUSY the caller is expected to free mp.
 */
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
	nce_t		*nce;
	int		err = 0;
	uint32_t	ms;
	mblk_t		*mp_nce = NULL;

	ASSERT(ill != NULL);
	if (IN6_IS_ADDR_MULTICAST(dst)) {
		err = nce_set_multicast(ill, dst);
		return (err);
	}
	err = ndp_lookup_then_add(ill,
	    NULL,	/* No hardware address */
	    dst,
	    &ipv6_all_ones,
	    &ipv6_all_zeros,
	    0,
	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
	    ND_INCOMPLETE,
	    &nce);

	switch (err) {
	case 0:
		/*
		 * New cache entry was created. Check whether the state
		 * is still ND_INCOMPLETE. It can be in some other state
		 * even before we send out the solicitation as we could
		 * get un-solicited advertisements.
		 *
		 * If this is an XRESOLV interface, simply return 0,
		 * since we don't want to solicit just yet.
		 */
		if (ill->ill_flags & ILLF_XRESOLV) {
			NCE_REFRELE(nce);
			return (0);
		}
		rw_enter(&ill_g_lock, RW_READER);
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state != ND_INCOMPLETE) {
			mutex_exit(&nce->nce_lock);
			rw_exit(&ill_g_lock);
			NCE_REFRELE(nce);
			return (0);
		}
		mp_nce = ndp_prepend_zone(mp, zoneid);
		if (mp_nce == NULL) {
			/* The caller will free mp */
			mutex_exit(&nce->nce_lock);
			rw_exit(&ill_g_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (ENOMEM);
		}
		/* nce_solicit() is called with nce_lock held */
		ms = nce_solicit(nce, mp_nce);
		rw_exit(&ill_g_lock);
		if (ms == 0) {
			/*
			 * The caller will free mp.  Only the M_CTL that
			 * ndp_prepend_zone() allocated (if any) is freed
			 * here; freeb() leaves the b_cont chain alone.
			 */
			if (mp_nce != mp)
				freeb(mp_nce);
			mutex_exit(&nce->nce_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (EBUSY);
		}
		mutex_exit(&nce->nce_lock);
		/* ms is passed on as the timer interval */
		NDP_RESTART_TIMER(nce, (clock_t)ms);
		NCE_REFRELE(nce);
		return (EINPROGRESS);
	case EEXIST:
		/* Resolution in progress just queue the packet */
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state == ND_INCOMPLETE) {
			mp_nce = ndp_prepend_zone(mp, zoneid);
			if (mp_nce == NULL) {
				err = ENOMEM;
			} else {
				nce_queue_mp(nce, mp_nce);
				err = EINPROGRESS;
			}
		} else {
			/*
			 * Any other state implies we have
			 * a nce but IRE needs to be added ...
			 * ire_add_v6() will take care of the
			 * the case when the nce becomes CONDEMNED
			 * before the ire is added to the table.
			 */
			err = 0;
		}
		mutex_exit(&nce->nce_lock);
		/* Drop the reference taken by the lookup */
		NCE_REFRELE(nce);
		break;
	default:
		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
		break;
	}
	return (err);
}
993 
994 /*
995  * When there is no resolver, the link layer template is passed in
996  * the IRE.
997  * Lookup a NCE for a given IRE.  Regardless of whether one exists
998  * or one is created, we defer making ire point to nce until the
999  * ire is actually added at which point the nce_refcnt on the nce is
1000  * incremented.  This is done primarily to have symmetry between ire_add()
1001  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1002  */
1003 int
1004 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1005 {
1006 	nce_t		*nce;
1007 	int		err = 0;
1008 
1009 	ASSERT(ill != NULL);
1010 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1011 		err = nce_set_multicast(ill, dst);
1012 		return (err);
1013 	}
1014 
1015 	err = ndp_lookup_then_add(ill,
1016 	    NULL,	/* hardware address */
1017 	    dst,
1018 	    &ipv6_all_ones,
1019 	    &ipv6_all_zeros,
1020 	    0,
1021 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1022 	    ND_REACHABLE,
1023 	    &nce);
1024 
1025 	switch (err) {
1026 	case 0:
1027 		/*
1028 		 * Cache entry with a proper resolver cookie was
1029 		 * created.
1030 		 */
1031 		nce_fastpath(nce);
1032 		NCE_REFRELE(nce);
1033 		break;
1034 	case EEXIST:
1035 		err = 0;
1036 		NCE_REFRELE(nce);
1037 		break;
1038 	default:
1039 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1040 		break;
1041 	}
1042 	return (err);
1043 }
1044 
/*
 * For each interface an entry is added for the unspecified multicast group.
 * Here that mapping is used to form the multicast cache entry for a particular
 * multicast destination.
 *
 * Returns 0 if an entry for dst already exists or was successfully created,
 * ESRCH if no mapping entry is found for dst on the ill, ENOMEM if the
 * temporary hardware address buffer cannot be allocated, or the error
 * returned by ndp_add().  ndp_g_lock is taken here and held across the
 * lookups and the ndp_add() call.
 */
static int
nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
{
	nce_t		*mnce;	/* Multicast mapping entry */
	nce_t		*nce;
	uchar_t		*hw_addr = NULL;
	int		err = 0;

	ASSERT(ill != NULL);
	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));

	mutex_enter(&ndp_g_lock);
	nce = nce_lookup_addr(ill, dst);
	if (nce != NULL) {
		/* An entry for dst is already cached; nothing to create. */
		mutex_exit(&ndp_g_lock);
		NCE_REFRELE(nce);
		return (0);
	}
	/* No entry, now lookup for a mapping this should never fail */
	mnce = nce_lookup_mapping(ill, dst);
	if (mnce == NULL) {
		/* Something broken for the interface. */
		mutex_exit(&ndp_g_lock);
		return (ESRCH);
	}
	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated, for IRE_IF_NORESOLVER, resolution cookie
		 * in the ill is copied in ndp_add().
		 */
		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ndp_g_lock);
			NCE_REFRELE(mnce);
			return (ENOMEM);
		}
		/* Derive the link layer address for dst from the mapping. */
		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
	}
	NCE_REFRELE(mnce);
	/*
	 * IRE_IF_NORESOLVER type simply copies the resolution
	 * cookie passed in.  So no hw_addr is needed.
	 */
	err = ndp_add(ill,
	    hw_addr,
	    dst,
	    &ipv6_all_ones,
	    &ipv6_all_zeros,
	    0,
	    NCE_F_NONUD,
	    ND_REACHABLE,
	    &nce);
	mutex_exit(&ndp_g_lock);
	/* hw_addr was only a scratch buffer for the ndp_add() call. */
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_nd_lla_len);
	if (err != 0) {
		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
		return (err);
	}
	nce_fastpath(nce);
	NCE_REFRELE(nce);
	return (0);
}
1115 
1116 /*
1117  * Return the link layer address, and any flags of a nce.
1118  */
1119 int
1120 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1121 {
1122 	nce_t		*nce;
1123 	in6_addr_t	*addr;
1124 	sin6_t		*sin6;
1125 	dl_unitdata_req_t	*dl;
1126 
1127 	ASSERT(ill != NULL);
1128 	sin6 = (sin6_t *)&lnr->lnr_addr;
1129 	addr =  &sin6->sin6_addr;
1130 
1131 	nce = ndp_lookup(ill, addr, B_FALSE);
1132 	if (nce == NULL)
1133 		return (ESRCH);
1134 	/* If in INCOMPLETE state, no link layer address is available yet */
1135 	if (nce->nce_state == ND_INCOMPLETE)
1136 		goto done;
1137 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1138 	if (ill->ill_flags & ILLF_XRESOLV)
1139 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1140 	else
1141 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1142 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1143 	    sizeof (lnr->lnr_hdw_addr));
1144 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1145 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1146 	if (nce->nce_flags & NCE_F_ISROUTER)
1147 		lnr->lnr_flags = NDF_ISROUTER_ON;
1148 	if (nce->nce_flags & NCE_F_PROXY)
1149 		lnr->lnr_flags |= NDF_PROXY_ON;
1150 	if (nce->nce_flags & NCE_F_ANYCAST)
1151 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1152 done:
1153 	NCE_REFRELE(nce);
1154 	return (0);
1155 }
1156 
1157 /*
1158  * Send Enable/Disable multicast reqs to driver.
1159  */
1160 int
1161 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1162     uint32_t hw_addr_offset, mblk_t *mp)
1163 {
1164 	nce_t		*nce;
1165 	uchar_t		*hw_addr;
1166 
1167 	ASSERT(ill != NULL);
1168 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1169 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1170 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1171 		freemsg(mp);
1172 		return (EINVAL);
1173 	}
1174 	mutex_enter(&ndp_g_lock);
1175 	nce = nce_lookup_mapping(ill, addr);
1176 	if (nce == NULL) {
1177 		mutex_exit(&ndp_g_lock);
1178 		freemsg(mp);
1179 		return (ESRCH);
1180 	}
1181 	mutex_exit(&ndp_g_lock);
1182 	/*
1183 	 * Update dl_addr_length and dl_addr_offset for primitives that
1184 	 * have physical addresses as opposed to full saps
1185 	 */
1186 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1187 	case DL_ENABMULTI_REQ:
1188 		/* Track the state if this is the first enabmulti */
1189 		if (ill->ill_dlpi_multicast_state == IDMS_UNKNOWN)
1190 			ill->ill_dlpi_multicast_state = IDMS_INPROGRESS;
1191 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1192 		break;
1193 	case DL_DISABMULTI_REQ:
1194 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1195 		break;
1196 	default:
1197 		NCE_REFRELE(nce);
1198 		ip1dbg(("ndp_mcastreq: default\n"));
1199 		return (EINVAL);
1200 	}
1201 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1202 	NCE_REFRELE(nce);
1203 	putnext(ill->ill_wq, mp);
1204 	return (0);
1205 }
1206 
/*
 * Send a neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
 *
 * If mp is NULL the packet at the head of nce_qd_mp is used to pick the
 * source address; otherwise mp is handed to nce_queue_mp() first.
 *
 * NOTE: This routine drops nce_lock (and later reacquires it) when sending
 * the packet.
 * NOTE: This routine does not consume mp.
 */
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
	ill_t		*ill;
	ill_t		*src_ill;
	ip6_t		*ip6h;
	in6_addr_t	src;
	in6_addr_t	dst;
	ipif_t		*ipif;
	ip6i_t		*ip6i;
	boolean_t	dropped = B_FALSE;

	ASSERT(RW_READ_HELD(&ill_g_lock));
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ill = nce->nce_ill;
	ASSERT(ill != NULL);

	/* No retransmissions left: tell the caller to abort. */
	if (nce->nce_rcnt == 0) {
		return (0);
	}

	if (mp == NULL) {
		ASSERT(nce->nce_qd_mp != NULL);
		mp = nce->nce_qd_mp;
	} else {
		nce_queue_mp(nce, mp);
	}

	/* Handle ip_newroute_v6 giving us IPSEC packets */
	if (mp->b_datap->db_type == M_CTL)
		mp = mp->b_cont;

	ip6h = (ip6_t *)mp->b_rptr;
	if (ip6h->ip6_nxt == IPPROTO_RAW) {
		/*
		 * This message should have been pulled up already in
		 * ip_wput_v6. We can't do pullups here because the message
		 * could be from the nce_qd_mp which could have b_next/b_prev
		 * non-NULL.
		 */
		ip6i = (ip6i_t *)ip6h;
		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
			    sizeof (ip6i_t) + IPV6_HDR_LEN);
		/* The real IPv6 header follows the ip6i_t. */
		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	}
	src = ip6h->ip6_src;
	/*
	 * If the src of outgoing packet is one of the assigned interface
	 * addresses use it, otherwise we will pick the source address below.
	 */
	src_ill = ill;
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* Search every ill in the group when the ill is grouped. */
		if (ill->ill_group != NULL)
			src_ill = ill->ill_group->illgrp_ill;
		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
			for (ipif = src_ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				if (IN6_ARE_ADDR_EQUAL(&src,
				    &ipif->ipif_v6lcl_addr)) {
					break;
				}
			}
			if (ipif != NULL)
				break;
		}
		if (src_ill == NULL) {
			/* May be a forwarding packet */
			src_ill = ill;
			src = ipv6_all_zeros;
		}
	}
	dst = nce->nce_addr;
	/*
	 * If source address is unspecified, nce_xmit will choose
	 * one for us and initialize the hardware address also
	 * appropriately.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&src))
		src_ill  = NULL;
	/* Charge this attempt now; it is refunded below if the send drops. */
	nce->nce_rcnt--;
	mutex_exit(&nce->nce_lock);
	rw_exit(&ill_g_lock);
	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
	    &dst, 0);
	rw_enter(&ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	if (dropped)
		nce->nce_rcnt++;
	return (ill->ill_reachable_retrans_time);
}
1307 
1308 void
1309 ndp_input_solicit(ill_t *ill, mblk_t *mp)
1310 {
1311 	nd_neighbor_solicit_t *ns;
1312 	uint32_t	hlen = ill->ill_nd_lla_len;
1313 	uchar_t		*haddr = NULL;
1314 	icmp6_t		*icmp_nd;
1315 	ip6_t		*ip6h;
1316 	nce_t		*our_nce = NULL;
1317 	in6_addr_t	target;
1318 	in6_addr_t	src;
1319 	int		len;
1320 	int		flag = 0;
1321 	nd_opt_hdr_t	*opt = NULL;
1322 	boolean_t	bad_solicit = B_FALSE;
1323 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1324 
1325 	ip6h = (ip6_t *)mp->b_rptr;
1326 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1327 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1328 	src = ip6h->ip6_src;
1329 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1330 	target = ns->nd_ns_target;
1331 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1332 		if (ip_debug > 2) {
1333 			/* ip1dbg */
1334 			pr_addr_dbg("ndp_input_solicit: Target is"
1335 			    " multicast! %s\n", AF_INET6, &target);
1336 		}
1337 		bad_solicit = B_TRUE;
1338 		goto done;
1339 	}
1340 	if (len > sizeof (nd_neighbor_solicit_t)) {
1341 		/* Options present */
1342 		opt = (nd_opt_hdr_t *)&ns[1];
1343 		len -= sizeof (nd_neighbor_solicit_t);
1344 		if (!ndp_verify_optlen(opt, len)) {
1345 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1346 			bad_solicit = B_TRUE;
1347 			goto done;
1348 		}
1349 	}
1350 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1351 		/* Check to see if this is a valid DAD solicitation */
1352 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1353 			if (ip_debug > 2) {
1354 				/* ip1dbg */
1355 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1356 				    "Destination is not solicited node "
1357 				    "multicast %s\n", AF_INET6,
1358 				    &ip6h->ip6_dst);
1359 			}
1360 			bad_solicit = B_TRUE;
1361 			goto done;
1362 		}
1363 	}
1364 
1365 	our_nce = ndp_lookup(ill, &target, B_FALSE);
1366 	/*
1367 	 * If this is a valid Solicitation, a permanent
1368 	 * entry should exist in the cache
1369 	 */
1370 	if (our_nce == NULL ||
1371 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1372 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1373 		    "ifname=%s ", ill->ill_name));
1374 		if (ip_debug > 2) {
1375 			/* ip1dbg */
1376 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1377 		}
1378 		bad_solicit = B_TRUE;
1379 		goto done;
1380 	}
1381 
1382 	/* At this point we should have a verified NS per spec */
1383 	if (opt != NULL) {
1384 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1385 		if (opt != NULL) {
1386 			/*
1387 			 * No source link layer address option should
1388 			 * be present in a valid DAD request.
1389 			 */
1390 			if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1391 				ip1dbg(("ndp_input_solicit: source link-layer "
1392 				    "address option present with an "
1393 				    "unspecified source. \n"));
1394 				bad_solicit = B_TRUE;
1395 				goto done;
1396 			}
1397 			haddr = (uchar_t *)&opt[1];
1398 			if (hlen > opt->nd_opt_len * 8 ||
1399 			    hlen == 0) {
1400 				bad_solicit = B_TRUE;
1401 				goto done;
1402 			}
1403 		}
1404 	}
1405 	/* Set override flag, it will be reset later if need be. */
1406 	flag |= NDP_ORIDE;
1407 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1408 		flag |= NDP_UNICAST;
1409 	}
1410 
1411 	/*
1412 	 * Create/update the entry for the soliciting node.
1413 	 * or respond to outstanding queries, don't if
1414 	 * the source is unspecified address.
1415 	 */
1416 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1417 		int	err = 0;
1418 		nce_t	*nnce;
1419 
1420 		err = ndp_lookup_then_add(ill,
1421 		    haddr,
1422 		    &src,	/* Soliciting nodes address */
1423 		    &ipv6_all_ones,
1424 		    &ipv6_all_zeros,
1425 		    0,
1426 		    0,
1427 		    ND_STALE,
1428 		    &nnce);
1429 		switch (err) {
1430 		case 0:
1431 			/* done with this entry */
1432 			NCE_REFRELE(nnce);
1433 			break;
1434 		case EEXIST:
1435 			/*
1436 			 * B_FALSE indicates this is not an
1437 			 * an advertisement.
1438 			 */
1439 			ndp_process(nnce, haddr, 0, B_FALSE);
1440 			NCE_REFRELE(nnce);
1441 			break;
1442 		default:
1443 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1444 			    err));
1445 			goto done;
1446 		}
1447 		flag |= NDP_SOLICITED;
1448 	} else {
1449 		/*
1450 		 * This is a DAD req, multicast the advertisement
1451 		 * to the all-nodes address.
1452 		 */
1453 		src = ipv6_all_hosts_mcast;
1454 	}
1455 	if (our_nce->nce_flags & NCE_F_ISROUTER)
1456 		flag |= NDP_ISROUTER;
1457 	if (our_nce->nce_flags & NCE_F_PROXY)
1458 		flag &= ~NDP_ORIDE;
1459 	/* Response to a solicitation */
1460 	(void) nce_xmit(ill,
1461 	    ND_NEIGHBOR_ADVERT,
1462 	    ill,	/* ill to be used for extracting ill_nd_lla */
1463 	    B_TRUE,	/* use ill_nd_lla */
1464 	    &target,	/* Source and target of the advertisement pkt */
1465 	    &src,	/* IP Destination (source of original pkt) */
1466 	    flag);
1467 done:
1468 	if (bad_solicit)
1469 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1470 	if (our_nce != NULL)
1471 		NCE_REFRELE(our_nce);
1472 }
1473 
1474 void
1475 ndp_input_advert(ill_t *ill, mblk_t *mp)
1476 {
1477 	nd_neighbor_advert_t *na;
1478 	uint32_t	hlen = ill->ill_nd_lla_len;
1479 	uchar_t		*haddr = NULL;
1480 	icmp6_t		*icmp_nd;
1481 	ip6_t		*ip6h;
1482 	nce_t		*dst_nce = NULL;
1483 	in6_addr_t	target;
1484 	nd_opt_hdr_t	*opt = NULL;
1485 	int		len;
1486 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1487 
1488 	ip6h = (ip6_t *)mp->b_rptr;
1489 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1490 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1491 	na = (nd_neighbor_advert_t *)icmp_nd;
1492 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1493 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1494 		ip1dbg(("ndp_input_advert: Target is multicast but the "
1495 		    "solicited flag is not zero\n"));
1496 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1497 		return;
1498 	}
1499 	target = na->nd_na_target;
1500 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1501 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
1502 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1503 		return;
1504 	}
1505 	if (len > sizeof (nd_neighbor_advert_t)) {
1506 		opt = (nd_opt_hdr_t *)&na[1];
1507 		if (!ndp_verify_optlen(opt,
1508 		    len - sizeof (nd_neighbor_advert_t))) {
1509 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1510 			return;
1511 		}
1512 		/* At this point we have a verified NA per spec */
1513 		len -= sizeof (nd_neighbor_advert_t);
1514 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1515 		if (opt != NULL) {
1516 			haddr = (uchar_t *)&opt[1];
1517 			if (hlen > opt->nd_opt_len * 8 ||
1518 			    hlen == 0) {
1519 				BUMP_MIB(mib,
1520 				    ipv6IfIcmpInBadNeighborAdvertisements);
1521 				return;
1522 			}
1523 		}
1524 	}
1525 
1526 	/*
1527 	 * If this interface is part of the group look at all the
1528 	 * ills in the group.
1529 	 */
1530 	rw_enter(&ill_g_lock, RW_READER);
1531 	if (ill->ill_group != NULL)
1532 		ill = ill->ill_group->illgrp_ill;
1533 
1534 	for (; ill != NULL; ill = ill->ill_group_next) {
1535 		mutex_enter(&ill->ill_lock);
1536 		if (!ILL_CAN_LOOKUP(ill)) {
1537 			mutex_exit(&ill->ill_lock);
1538 			continue;
1539 		}
1540 		ill_refhold_locked(ill);
1541 		mutex_exit(&ill->ill_lock);
1542 		dst_nce = ndp_lookup(ill, &target, B_FALSE);
1543 		/* We have to drop the lock since ndp_process calls put* */
1544 		rw_exit(&ill_g_lock);
1545 		if (dst_nce != NULL) {
1546 			if (na->nd_na_flags_reserved &
1547 			    ND_NA_FLAG_ROUTER) {
1548 				dst_nce->nce_flags |= NCE_F_ISROUTER;
1549 			}
1550 			/* B_TRUE indicates this an advertisement */
1551 			ndp_process(dst_nce, haddr,
1552 				na->nd_na_flags_reserved, B_TRUE);
1553 			NCE_REFRELE(dst_nce);
1554 		}
1555 		rw_enter(&ill_g_lock, RW_READER);
1556 		ill_refrele(ill);
1557 	}
1558 	rw_exit(&ill_g_lock);
1559 }
1560 
1561 /*
1562  * Process NDP neighbor solicitation/advertisement messages.
1563  * The checksum has already checked o.k before reaching here.
1564  */
1565 void
1566 ndp_input(ill_t *ill, mblk_t *mp)
1567 {
1568 	icmp6_t		*icmp_nd;
1569 	ip6_t		*ip6h;
1570 	int		len;
1571 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1572 
1573 
1574 	if (!pullupmsg(mp, -1)) {
1575 		ip1dbg(("ndp_input: pullupmsg failed\n"));
1576 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
1577 		goto done;
1578 	}
1579 	ip6h = (ip6_t *)mp->b_rptr;
1580 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
1581 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
1582 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
1583 		goto done;
1584 	}
1585 	/*
1586 	 * NDP does not accept any extension headers between the
1587 	 * IP header and the ICMP header since e.g. a routing
1588 	 * header could be dangerous.
1589 	 * This assumes that any AH or ESP headers are removed
1590 	 * by ip prior to passing the packet to ndp_input.
1591 	 */
1592 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
1593 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
1594 		    ip6h->ip6_nxt));
1595 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1596 		goto done;
1597 	}
1598 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1599 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
1600 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
1601 	if (icmp_nd->icmp6_code != 0) {
1602 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
1603 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1604 		goto done;
1605 	}
1606 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1607 	/*
1608 	 * Make sure packet length is large enough for either
1609 	 * a NS or a NA icmp packet.
1610 	 */
1611 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
1612 		ip1dbg(("ndp_input: packet too short\n"));
1613 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1614 		goto done;
1615 	}
1616 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
1617 		ndp_input_solicit(ill, mp);
1618 	} else {
1619 		ndp_input_advert(ill, mp);
1620 	}
1621 done:
1622 	freemsg(mp);
1623 }
1624 
/*
 * nce_xmit is called to form and transmit a ND solicitation or
 * advertisement ICMP packet.
 * If source address is unspecified, appropriate source address
 * and link layer address will be chosen here. This function
 * *always* sends the link layer option.
 * It returns B_FALSE only if it does a successful put() to the
 * corresponding ill's ill_wq otherwise returns B_TRUE.
 *
 * ill		- interface the packet is sent on
 * operation	- ND_NEIGHBOR_SOLICIT or ND_NEIGHBOR_ADVERT
 * hwaddr_ill	- ill supplying the link layer address option; may be NULL
 *		  only when sender is unspecified
 * use_nd_lla	- B_TRUE to copy ill_nd_lla into the option, B_FALSE to
 *		  copy ill_phys_addr
 * sender	- IPv6 source address; for an advertisement also the target
 * target	- address being solicited, or destination of the advert
 * flag		- NDP_* flags (NDP_UNICAST, NDP_ISROUTER, NDP_SOLICITED,
 *		  NDP_ORIDE)
 */
static boolean_t
nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
    boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
    int flag)
{
	uint32_t	len;
	icmp6_t 	*icmp6;
	mblk_t		*mp;
	ip6_t		*ip6h;
	nd_opt_hdr_t	*opt;
	uint_t		plen;
	ip6i_t		*ip6i;
	ipif_t		*src_ipif = NULL;

	/*
	 * If we have a unspecified source(sender) address, select a
	 * proper source address for the solicitation here itself so
	 * that we can initialize the h/w address correctly. This is
	 * needed for interface groups as source address can come from
	 * the whole group and the h/w address initialized from ill will
	 * be wrong if the source address comes from a different ill.
	 *
	 * Note that the NA never comes here with the unspecified source
	 * address. The following asserts that whenever the source
	 * address is specified, the haddr also should be specified.
	 */
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));

	if (IN6_IS_ADDR_UNSPECIFIED(sender)) {
		ASSERT(operation != ND_NEIGHBOR_ADVERT);
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface (or interface group).  We do this
		 * because the destination will create a neighbor cache
		 * entry for the source address of this packet, so the
		 * source address had better be a valid neighbor.
		 */
		src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
		    IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
		if (src_ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip0dbg(("nce_xmit: No source ipif for dst %s\n",
			    inet_ntop(AF_INET6, (char *)target, buf,
			    sizeof (buf))));
			return (B_TRUE);
		}
		/*
		 * sender and hwaddr_ill now point into src_ipif, so the
		 * ipif reference is kept until the packet has been built.
		 */
		sender = &src_ipif->ipif_v6src_addr;
		hwaddr_ill = src_ipif->ipif_ill;
	}

	/* Option length in 8-byte units, rounded up, header included. */
	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7)/8;
	/*
	 * Always make sure that the NS/NA packets don't get load
	 * spread. This is needed so that the probe packets sent
	 * by the in.mpathd daemon can really go out on the desired
	 * interface. Probe packets are made to go out on a desired
	 * interface by including a ip6i with ATTACH_IF flag. As these
	 * packets indirectly end up sending/receiving NS/NA packets
	 * (neighbor doing NUD), we have to make sure that NA
	 * also go out on the same interface.
	 */
	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
	    plen * 8;
	mp = allocb(len,  BPRI_LO);
	if (mp == NULL) {
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);
		return (B_TRUE);
	}
	/* Start from a zeroed packet; only non-zero fields are set below. */
	bzero((char *)mp->b_rptr, len);
	mp->b_wptr = mp->b_rptr + len;

	/* Leading ip6i_t pins the packet to this ill's interface index. */
	ip6i = (ip6i_t *)mp->b_rptr;
	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6i->ip6i_nxt = IPPROTO_RAW;
	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;

	/* The IPv6 header proper follows the ip6i_t. */
	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
	ip6h->ip6_nxt = IPPROTO_ICMPV6;
	ip6h->ip6_hops = IPV6_MAX_HOPS;
	ip6h->ip6_dst = *target;
	icmp6 = (icmp6_t *)&ip6h[1];

	/*
	 * The option is placed sizeof (nd_neighbor_advert_t) past the
	 * IPv6 header for both message types.
	 */
	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
	    sizeof (nd_neighbor_advert_t));

	if (operation == ND_NEIGHBOR_SOLICIT) {
		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;

		opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
		ip6h->ip6_src = *sender;
		ns->nd_ns_target = *target;
		if (!(flag & NDP_UNICAST)) {
			/* Form multicast address of the target */
			ip6h->ip6_dst = ipv6_solicited_node_mcast;
			ip6h->ip6_dst.s6_addr32[3] |=
			    ns->nd_ns_target.s6_addr32[3];
		}
	} else {
		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;

		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		ip6h->ip6_src = *sender;
		/* For an advertisement the sender is the advertised target. */
		na->nd_na_target = *sender;
		if (flag & NDP_ISROUTER)
			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
		if (flag & NDP_SOLICITED)
			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
		if (flag & NDP_ORIDE)
			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;

	}
	/* Fill in link layer address and option len */
	opt->nd_opt_len = (uint8_t)plen;
	mutex_enter(&hwaddr_ill->ill_lock);
	bcopy(use_nd_lla ? hwaddr_ill->ill_nd_lla : hwaddr_ill->ill_phys_addr,
	    &opt[1], hwaddr_ill->ill_nd_lla_len);
	mutex_exit(&hwaddr_ill->ill_lock);
	icmp6->icmp6_type = (uint8_t)operation;
	icmp6->icmp6_code = 0;
	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_wput_v6.
	 */
	icmp6->icmp6_cksum = ip6h->ip6_plen;

	/* The packet holds copies of everything taken from src_ipif. */
	if (src_ipif != NULL)
		ipif_refrele(src_ipif);
	if (canput(ill->ill_wq)) {
		put(ill->ill_wq, mp);
		return (B_FALSE);
	}
	/* The write queue cannot take the packet; report it as dropped. */
	freemsg(mp);
	return (B_TRUE);
}
1774 
1775 /*
1776  * Make a link layer address (does not include the SAP) from an nce.
1777  * To form the link layer address, use the last four bytes of ipv6
1778  * address passed in and the fixed offset stored in nce.
1779  */
1780 static void
1781 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
1782 {
1783 	uchar_t *mask, *to;
1784 	ill_t	*ill = nce->nce_ill;
1785 	int 	len;
1786 
1787 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
1788 		return;
1789 	ASSERT(nce->nce_res_mp != NULL);
1790 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1791 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
1792 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
1793 	ASSERT(addr != NULL);
1794 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1795 	    addrpos, ill->ill_nd_lla_len);
1796 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
1797 	    IPV6_ADDR_LEN);
1798 	mask = (uchar_t *)&nce->nce_extract_mask;
1799 	mask += (IPV6_ADDR_LEN - len);
1800 	addr += (IPV6_ADDR_LEN - len);
1801 	to = addrpos + nce->nce_ll_extract_start;
1802 	while (len-- > 0)
1803 		*to++ |= *mask++ & *addr++;
1804 }
1805 
/*
 * Pass a cache report back out via NDD.
 *
 * Writes a column header into mp and then has ndp_walk() call
 * nce_report1() with mp as its argument to add the per-entry lines.
 * Always returns 0.  The q, arg and ioc_cr parameters are unused.
 */
/* ARGSUSED */
int
ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
{
	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
			"     proto addr/mask");
	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp);
	return (0);
}
1818 
/*
 * convert a link level address of arbitrary length
 * to an ascii string.
 *
 * Each byte is written as two lower case hex digits, with ':' between
 * bytes, followed by a terminating NUL.  For addrlen >= 1 exactly
 * 3 * addrlen bytes of buf are written.  For addrlen <= 0 buf receives
 * just the empty string; the previous implementation stepped back one
 * byte to overwrite a trailing ':' and so wrote before the start of buf
 * in that case.
 *
 * The caller *must* have already verified that the string buffer
 * is large enough to hold the entire string, including trailing NULL.
 */
static void
lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf)
{
	static const char hexdigits[] = "0123456789abcdef";
	int	i;

	for (i = 0; i < addrlen; i++) {
		/* Separator goes before every byte except the first. */
		if (i != 0)
			*buf++ = ':';
		*buf++ = hexdigits[(lla[i] >> 4) & 0xf];
		*buf++ = hexdigits[lla[i] & 0xf];
	}
	*buf = '\0';
}
1842 
1843 /*
1844  * Add a single line to the NDP Cache Entry Report.
1845  */
1846 static void
1847 nce_report1(nce_t *nce, uchar_t *mp_arg)
1848 {
1849 	ill_t		*ill = nce->nce_ill;
1850 	char		local_buf[INET6_ADDRSTRLEN];
1851 	uchar_t		flags_buf[10];
1852 	uint32_t	flags = nce->nce_flags;
1853 	mblk_t		*mp = (mblk_t *)mp_arg;
1854 	uchar_t		*h;
1855 	uchar_t		*m = flags_buf;
1856 	in6_addr_t	v6addr;
1857 
1858 	/*
1859 	 * Lock the nce to protect nce_res_mp from being changed
1860 	 * if an external resolver address resolution completes
1861 	 * while nce_res_mp is being accessed here.
1862 	 *
1863 	 * Deal with all address formats, not just Ethernet-specific
1864 	 * In addition, make sure that the mblk has enough space
1865 	 * before writing to it. If is doesn't, allocate a new one.
1866 	 */
1867 	ASSERT(ill != NULL);
1868 	v6addr = nce->nce_mask;
1869 	if (flags & NCE_F_PERMANENT)
1870 		*m++ = 'P';
1871 	if (flags & NCE_F_ISROUTER)
1872 		*m++ = 'R';
1873 	if (flags & NCE_F_MAPPING)
1874 		*m++ = 'M';
1875 	*m = '\0';
1876 
1877 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1878 		size_t		addrlen;
1879 		uchar_t		*addr_buf;
1880 		dl_unitdata_req_t	*dl;
1881 
1882 		mutex_enter(&nce->nce_lock);
1883 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
1884 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1885 		if (ill->ill_flags & ILLF_XRESOLV)
1886 			addrlen = (3 * (dl->dl_dest_addr_length));
1887 		else
1888 			addrlen = (3 * (ill->ill_nd_lla_len));
1889 		if (addrlen <= 0) {
1890 			mutex_exit(&nce->nce_lock);
1891 			(void) mi_mpprintf(mp,
1892 			    "%8s %9s %5s %s/%d",
1893 			    ill->ill_name,
1894 			    "None",
1895 			    (uchar_t *)&flags_buf,
1896 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1897 				(char *)local_buf, sizeof (local_buf)),
1898 				ip_mask_to_plen_v6(&v6addr));
1899 		} else {
1900 			/*
1901 			 * Convert the hardware/lla address to ascii
1902 			 */
1903 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
1904 			if (addr_buf == NULL) {
1905 				mutex_exit(&nce->nce_lock);
1906 				return;
1907 			}
1908 			if (ill->ill_flags & ILLF_XRESOLV)
1909 				lla2ascii((uint8_t *)h, dl->dl_dest_addr_length,
1910 				    addr_buf);
1911 			else
1912 				lla2ascii((uint8_t *)h, ill->ill_nd_lla_len,
1913 				    addr_buf);
1914 			mutex_exit(&nce->nce_lock);
1915 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
1916 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
1917 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1918 				(char *)local_buf, sizeof (local_buf)),
1919 				ip_mask_to_plen_v6(&v6addr));
1920 			kmem_free(addr_buf, addrlen);
1921 		}
1922 	} else {
1923 		(void) mi_mpprintf(mp,
1924 		    "%8s %9s %5s %s/%d",
1925 		    ill->ill_name,
1926 		    "None",
1927 		    (uchar_t *)&flags_buf,
1928 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1929 			(char *)local_buf, sizeof (local_buf)),
1930 			ip_mask_to_plen_v6(&v6addr));
1931 	}
1932 }
1933 
1934 mblk_t *
1935 nce_udreq_alloc(ill_t *ill)
1936 {
1937 	mblk_t	*template_mp = NULL;
1938 	dl_unitdata_req_t *dlur;
1939 	int	sap_length;
1940 
1941 	sap_length = ill->ill_sap_length;
1942 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
1943 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
1944 	if (template_mp == NULL)
1945 		return (NULL);
1946 
1947 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
1948 	dlur->dl_priority.dl_min = 0;
1949 	dlur->dl_priority.dl_max = 0;
1950 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
1951 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
1952 
1953 	/* Copy in the SAP value. */
1954 	NCE_LL_SAP_COPY(ill, template_mp);
1955 
1956 	return (template_mp);
1957 }
1958 
1959 /*
1960  * NDP retransmit timer.
1961  * This timer goes off when:
1962  * a. It is time to retransmit NS for resolver.
1963  * b. It is time to send reachability probes.
1964  */
1965 void
1966 ndp_timer(void *arg)
1967 {
1968 	nce_t		*nce = arg;
1969 	ill_t		*ill = nce->nce_ill;
1970 	uint32_t	ms;
1971 	char		addrbuf[INET6_ADDRSTRLEN];
1972 	mblk_t		*mp;
1973 	boolean_t	dropped = B_FALSE;
1974 
1975 	/*
1976 	 * The timer has to be cancelled by ndp_delete before doing the final
1977 	 * refrele. So the NCE is guaranteed to exist when the timer runs
1978 	 * until it clears the timeout_id. Before clearing the timeout_id
1979 	 * bump up the refcnt so that we can continue to use the nce
1980 	 */
1981 	ASSERT(nce != NULL);
1982 
1983 	/*
1984 	 * Grab the ill_g_lock now itself to avoid lock order problems.
1985 	 * nce_solicit needs ill_g_lock to be able to traverse ills
1986 	 */
1987 	rw_enter(&ill_g_lock, RW_READER);
1988 	mutex_enter(&nce->nce_lock);
1989 	NCE_REFHOLD_LOCKED(nce);
1990 	nce->nce_timeout_id = 0;
1991 
1992 	/*
1993 	 * Check the reachability state first.
1994 	 */
1995 	switch (nce->nce_state) {
1996 	case ND_DELAY:
1997 		rw_exit(&ill_g_lock);
1998 		nce->nce_state = ND_PROBE;
1999 		mutex_exit(&nce->nce_lock);
2000 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2001 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2002 		if (ip_debug > 3) {
2003 			/* ip2dbg */
2004 			pr_addr_dbg("ndp_timer: state for %s changed "
2005 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2006 		}
2007 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2008 		NCE_REFRELE(nce);
2009 		return;
2010 	case ND_PROBE:
2011 		/* must be retransmit timer */
2012 		rw_exit(&ill_g_lock);
2013 		nce->nce_pcnt--;
2014 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2015 		    nce->nce_pcnt >= -1);
2016 		if (nce->nce_pcnt == 0) {
2017 			/* Wait RetransTimer, before deleting the entry */
2018 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2019 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2020 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2021 			mutex_exit(&nce->nce_lock);
2022 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2023 		} else {
2024 			/*
2025 			 * As per RFC2461, the nce gets deleted after
2026 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2027 			 * Note that the first unicast solicitation is sent
2028 			 * during the DELAY state.
2029 			 */
2030 			if (nce->nce_pcnt > 0) {
2031 				ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2032 				    nce->nce_pcnt, inet_ntop(AF_INET6,
2033 				    &nce->nce_addr,
2034 				    addrbuf, sizeof (addrbuf))));
2035 				mutex_exit(&nce->nce_lock);
2036 				dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT,
2037 				    NULL, B_FALSE, &ipv6_all_zeros,
2038 				    &nce->nce_addr, NDP_UNICAST);
2039 				if (dropped) {
2040 					mutex_enter(&nce->nce_lock);
2041 					nce->nce_pcnt++;
2042 					mutex_exit(&nce->nce_lock);
2043 				}
2044 				NDP_RESTART_TIMER(nce,
2045 				    ill->ill_reachable_retrans_time);
2046 			} else {
2047 				/* No hope, delete the nce */
2048 				nce->nce_state = ND_UNREACHABLE;
2049 				mutex_exit(&nce->nce_lock);
2050 				if (ip_debug > 2) {
2051 					/* ip1dbg */
2052 					pr_addr_dbg("ndp_timer: Delete IRE for"
2053 					    " dst %s\n", AF_INET6,
2054 					    &nce->nce_addr);
2055 				}
2056 				ndp_delete(nce);
2057 			}
2058 		}
2059 		NCE_REFRELE(nce);
2060 		return;
2061 	case ND_INCOMPLETE:
2062 		/*
2063 		 * Must be resolvers retransmit timer.
2064 		 */
2065 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2066 			ip6i_t	*ip6i;
2067 			ip6_t	*ip6h;
2068 			mblk_t *data_mp;
2069 
2070 			/*
2071 			 * Walk the list of packets queued, and see if there
2072 			 * are any multipathing probe packets. Such packets
2073 			 * are always queued at the head. Since this is a
2074 			 * retransmit timer firing, mark such packets as
2075 			 * delayed in ND resolution. This info will be used
2076 			 * in ip_wput_v6(). Multipathing probe packets will
2077 			 * always have an ip6i_t. Once we hit a packet without
2078 			 * it, we can break out of this loop.
2079 			 */
2080 			if (mp->b_datap->db_type == M_CTL)
2081 				data_mp = mp->b_cont;
2082 			else
2083 				data_mp = mp;
2084 
2085 			ip6h = (ip6_t *)data_mp->b_rptr;
2086 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2087 				break;
2088 
2089 			/*
2090 			 * This message should have been pulled up already in
2091 			 * ip_wput_v6. We can't do pullups here because the
2092 			 * b_next/b_prev is non-NULL.
2093 			 */
2094 			ip6i = (ip6i_t *)ip6h;
2095 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2096 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2097 
2098 			/* Mark this packet as delayed due to ND resolution */
2099 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2100 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2101 		}
2102 		if (nce->nce_qd_mp != NULL) {
2103 			ms = nce_solicit(nce, NULL);
2104 			rw_exit(&ill_g_lock);
2105 			if (ms == 0) {
2106 				if (nce->nce_state != ND_REACHABLE) {
2107 					mutex_exit(&nce->nce_lock);
2108 					nce_resolv_failed(nce);
2109 					ndp_delete(nce);
2110 				} else {
2111 					mutex_exit(&nce->nce_lock);
2112 				}
2113 			} else {
2114 				mutex_exit(&nce->nce_lock);
2115 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2116 			}
2117 			NCE_REFRELE(nce);
2118 			return;
2119 		}
2120 		mutex_exit(&nce->nce_lock);
2121 		rw_exit(&ill_g_lock);
2122 		NCE_REFRELE(nce);
2123 		break;
2124 	case ND_REACHABLE :
2125 		rw_exit(&ill_g_lock);
2126 		if (nce->nce_flags & NCE_F_UNSOL_ADV &&
2127 		    nce->nce_unsolicit_count != 0) {
2128 			nce->nce_unsolicit_count--;
2129 			mutex_exit(&nce->nce_lock);
2130 			dropped = nce_xmit(ill,
2131 			    ND_NEIGHBOR_ADVERT,
2132 			    ill,	/* ill to be used for hw addr */
2133 			    B_FALSE,	/* use ill_phys_addr */
2134 			    &nce->nce_addr,
2135 			    &ipv6_all_hosts_mcast,
2136 			    nce->nce_flags | NDP_ORIDE);
2137 			if (dropped) {
2138 				mutex_enter(&nce->nce_lock);
2139 				nce->nce_unsolicit_count++;
2140 				mutex_exit(&nce->nce_lock);
2141 			}
2142 			if (nce->nce_unsolicit_count != 0) {
2143 				NDP_RESTART_TIMER(nce,
2144 				    ip_ndp_unsolicit_interval);
2145 			}
2146 		} else {
2147 			mutex_exit(&nce->nce_lock);
2148 		}
2149 		NCE_REFRELE(nce);
2150 		break;
2151 	default:
2152 		rw_exit(&ill_g_lock);
2153 		mutex_exit(&nce->nce_lock);
2154 		NCE_REFRELE(nce);
2155 		break;
2156 	}
2157 }
2158 
2159 /*
2160  * Set a link layer address from the ll_addr passed in.
2161  * Copy SAP from ill.
2162  */
2163 static void
2164 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2165 {
2166 	ill_t	*ill = nce->nce_ill;
2167 	uchar_t	*woffset;
2168 
2169 	ASSERT(ll_addr != NULL);
2170 	/* Always called before fast_path_probe */
2171 	if (nce->nce_fp_mp != NULL)
2172 		return;
2173 	if (ill->ill_sap_length != 0) {
2174 		/*
2175 		 * Copy the SAP type specified in the
2176 		 * request into the xmit template.
2177 		 */
2178 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2179 	}
2180 	if (ill->ill_phys_addr_length > 0) {
2181 		/*
2182 		 * The bcopy() below used to be called for the physical address
2183 		 * length rather than the link layer address length. For
2184 		 * ethernet and many other media, the phys_addr and lla are
2185 		 * identical.
2186 		 * However, with xresolv interfaces being introduced, the
2187 		 * phys_addr and lla are no longer the same, and the physical
2188 		 * address may not have any useful meaning, so we use the lla
2189 		 * for IPv6 address resolution and destination addressing.
2190 		 *
2191 		 * For PPP or other interfaces with a zero length
2192 		 * physical address, don't do anything here.
2193 		 * The bcopy() with a zero phys_addr length was previously
2194 		 * a no-op for interfaces with a zero-length physical address.
2195 		 * Using the lla for them would change the way they operate.
2196 		 * Doing nothing in such cases preserves expected behavior.
2197 		 */
2198 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2199 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2200 	}
2201 }
2202 
2203 static boolean_t
2204 nce_cmp_ll_addr(nce_t *nce, char *ll_addr, uint32_t ll_addr_len)
2205 {
2206 	ill_t	*ill = nce->nce_ill;
2207 	uchar_t	*ll_offset;
2208 
2209 	ASSERT(nce->nce_res_mp != NULL);
2210 	if (ll_addr == NULL)
2211 		return (B_FALSE);
2212 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2213 	if (bcmp(ll_addr, (char *)ll_offset, ll_addr_len) != 0)
2214 		return (B_TRUE);
2215 	return (B_FALSE);
2216 }
2217 
2218 /*
2219  * Updates the link layer address or the reachability state of
2220  * a cache entry.  Reset probe counter if needed.
2221  */
2222 static void
2223 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2224 {
2225 	ill_t	*ill = nce->nce_ill;
2226 	boolean_t need_stop_timer = B_FALSE;
2227 	boolean_t need_fastpath_update = B_FALSE;
2228 
2229 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2230 	/*
2231 	 * If this interface does not do NUD, there is no point
2232 	 * in allowing an update to the cache entry.  Although
2233 	 * we will respond to NS.
2234 	 * The only time we accept an update for a resolver when
2235 	 * NUD is turned off is when it has just been created.
2236 	 * Non-Resolvers will always be created as REACHABLE.
2237 	 */
2238 	if (new_state != ND_UNCHANGED) {
2239 		if ((nce->nce_flags & NCE_F_NONUD) &&
2240 		    (nce->nce_state != ND_INCOMPLETE))
2241 			return;
2242 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2243 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2244 		need_stop_timer = B_TRUE;
2245 		if (new_state == ND_REACHABLE)
2246 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2247 		else {
2248 			/* We force NUD in this case */
2249 			nce->nce_last = 0;
2250 		}
2251 		nce->nce_state = new_state;
2252 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2253 	}
2254 	/*
2255 	 * In case of fast path we need to free the the fastpath
2256 	 * M_DATA and do another probe.  Otherwise we can just
2257 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2258 	 * whatever packets that happens to be transmitting at the time.
2259 	 */
2260 	if (new_ll_addr != NULL) {
2261 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2262 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2263 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2264 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2265 		if (nce->nce_fp_mp != NULL) {
2266 			freemsg(nce->nce_fp_mp);
2267 			nce->nce_fp_mp = NULL;
2268 			need_fastpath_update = B_TRUE;
2269 		}
2270 	}
2271 	mutex_exit(&nce->nce_lock);
2272 	if (need_stop_timer) {
2273 		(void) untimeout(nce->nce_timeout_id);
2274 		nce->nce_timeout_id = 0;
2275 	}
2276 	if (need_fastpath_update)
2277 		nce_fastpath(nce);
2278 	mutex_enter(&nce->nce_lock);
2279 }
2280 
2281 static void
2282 nce_queue_mp(nce_t *nce, mblk_t *mp)
2283 {
2284 	uint_t	count = 0;
2285 	mblk_t  **mpp;
2286 	boolean_t head_insert = B_FALSE;
2287 	ip6_t	*ip6h;
2288 	ip6i_t	*ip6i;
2289 	mblk_t *data_mp;
2290 
2291 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2292 
2293 	if (mp->b_datap->db_type == M_CTL)
2294 		data_mp = mp->b_cont;
2295 	else
2296 		data_mp = mp;
2297 	ip6h = (ip6_t *)data_mp->b_rptr;
2298 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2299 		/*
2300 		 * This message should have been pulled up already in
2301 		 * ip_wput_v6. We can't do pullups here because the message
2302 		 * could be from the nce_qd_mp which could have b_next/b_prev
2303 		 * non-NULL.
2304 		 */
2305 		ip6i = (ip6i_t *)ip6h;
2306 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2307 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2308 		/*
2309 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2310 		 * This has 2 aspects mentioned below.
2311 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2312 		 * This ensures that next retransmit of ND solicitation
2313 		 * will use the interface specified by the probe packet,
2314 		 * for both NS and NA. This corresponds to the src address
2315 		 * in the IPv6 packet. If we insert at tail, we will be
2316 		 * depending on the packet at the head for successful
2317 		 * ND resolution. This is not reliable, because the interface
2318 		 * on which the NA arrives could be different from the interface
2319 		 * on which the NS was sent, and if the receiving interface is
2320 		 * failed, it will appear that the sending interface is also
2321 		 * failed, causing in.mpathd to misdiagnose this as link
2322 		 * failure.
2323 		 * 2. Drop the original packet, if the ND resolution did not
2324 		 * succeed in the first attempt. However we will create the
2325 		 * nce and the ire, as soon as the ND resolution succeeds.
2326 		 * We don't gain anything by queueing multiple probe packets
2327 		 * and sending them back-to-back once resolution succeeds.
2328 		 * It is sufficient to send just 1 packet after ND resolution
2329 		 * succeeds. Since mpathd is sending down probe packets at a
2330 		 * constant rate, we don't need to send the queued packet. We
2331 		 * need to queue it only for NDP resolution. The benefit of
2332 		 * dropping the probe packets that were delayed in ND
2333 		 * resolution, is that in.mpathd will not see inflated
2334 		 * RTT. If the ND resolution does not succeed within
2335 		 * in.mpathd's failure detection time, mpathd may detect
2336 		 * a failure, and it does not matter whether the packet
2337 		 * was queued or dropped.
2338 		 */
2339 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2340 			head_insert = B_TRUE;
2341 	}
2342 
2343 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2344 	    mpp = &(*mpp)->b_next) {
2345 		if (++count >
2346 		    nce->nce_ill->ill_max_buf) {
2347 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2348 
2349 			nce->nce_qd_mp->b_next = NULL;
2350 			nce->nce_qd_mp->b_prev = NULL;
2351 			freemsg(nce->nce_qd_mp);
2352 			ip1dbg(("nce_queue_mp: pkt dropped\n"));
2353 			nce->nce_qd_mp = tmp;
2354 		}
2355 	}
2356 	/* put this on the list */
2357 	if (head_insert) {
2358 		mp->b_next = nce->nce_qd_mp;
2359 		nce->nce_qd_mp = mp;
2360 	} else {
2361 		*mpp = mp;
2362 	}
2363 }
2364 
2365 /*
2366  * Called when address resolution failed due to a timeout.
2367  * Send an ICMP unreachable in response to all queued packets.
2368  */
2369 void
2370 nce_resolv_failed(nce_t *nce)
2371 {
2372 	mblk_t	*mp, *nxt_mp, *first_mp;
2373 	char	buf[INET6_ADDRSTRLEN];
2374 	ip6_t *ip6h;
2375 	zoneid_t zoneid = GLOBAL_ZONEID;
2376 
2377 	ip1dbg(("nce_resolv_failed: dst %s\n",
2378 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
2379 	mutex_enter(&nce->nce_lock);
2380 	mp = nce->nce_qd_mp;
2381 	nce->nce_qd_mp = NULL;
2382 	mutex_exit(&nce->nce_lock);
2383 	while (mp != NULL) {
2384 		nxt_mp = mp->b_next;
2385 		mp->b_next = NULL;
2386 		mp->b_prev = NULL;
2387 
2388 		first_mp = mp;
2389 		if (mp->b_datap->db_type == M_CTL) {
2390 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
2391 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
2392 			zoneid = io->ipsec_out_zoneid;
2393 			ASSERT(zoneid != ALL_ZONES);
2394 			mp = mp->b_cont;
2395 		}
2396 
2397 		ip6h = (ip6_t *)mp->b_rptr;
2398 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
2399 			ip6i_t *ip6i;
2400 			/*
2401 			 * This message should have been pulled up already
2402 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
2403 			 * the header is pulled up.
2404 			 */
2405 			ip6i = (ip6i_t *)ip6h;
2406 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
2407 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2408 			mp->b_rptr += sizeof (ip6i_t);
2409 		}
2410 		/*
2411 		 * Ignore failure since icmp_unreachable_v6 will silently
2412 		 * drop packets with an unspecified source address.
2413 		 */
2414 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
2415 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
2416 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE);
2417 		mp = nxt_mp;
2418 	}
2419 }
2420 
2421 /*
2422  * Called by SIOCSNDP* ioctl to add/change an nce entry
2423  * and the corresponding attributes.
2424  * Disallow states other than ND_REACHABLE or ND_STALE.
2425  */
2426 int
2427 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2428 {
2429 	sin6_t		*sin6;
2430 	in6_addr_t	*addr;
2431 	nce_t		*nce;
2432 	int		err;
2433 	uint16_t	new_flags = 0;
2434 	uint16_t	old_flags = 0;
2435 	int		inflags = lnr->lnr_flags;
2436 
2437 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2438 	    (lnr->lnr_state_create != ND_STALE))
2439 		return (EINVAL);
2440 
2441 	sin6 = (sin6_t *)&lnr->lnr_addr;
2442 	addr = &sin6->sin6_addr;
2443 
2444 	mutex_enter(&ndp_g_lock);
2445 	/* We know it can not be mapping so just look in the hash table */
2446 	nce = nce_lookup_addr(ill, addr);
2447 	if (nce != NULL)
2448 		new_flags = nce->nce_flags;
2449 
2450 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2451 	case NDF_ISROUTER_ON:
2452 		new_flags |= NCE_F_ISROUTER;
2453 		break;
2454 	case NDF_ISROUTER_OFF:
2455 		new_flags &= ~NCE_F_ISROUTER;
2456 		break;
2457 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2458 		mutex_exit(&ndp_g_lock);
2459 		if (nce != NULL)
2460 			NCE_REFRELE(nce);
2461 		return (EINVAL);
2462 	}
2463 
2464 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2465 	case NDF_ANYCAST_ON:
2466 		new_flags |= NCE_F_ANYCAST;
2467 		break;
2468 	case NDF_ANYCAST_OFF:
2469 		new_flags &= ~NCE_F_ANYCAST;
2470 		break;
2471 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2472 		mutex_exit(&ndp_g_lock);
2473 		if (nce != NULL)
2474 			NCE_REFRELE(nce);
2475 		return (EINVAL);
2476 	}
2477 
2478 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
2479 	case NDF_PROXY_ON:
2480 		new_flags |= NCE_F_PROXY;
2481 		break;
2482 	case NDF_PROXY_OFF:
2483 		new_flags &= ~NCE_F_PROXY;
2484 		break;
2485 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
2486 		mutex_exit(&ndp_g_lock);
2487 		if (nce != NULL)
2488 			NCE_REFRELE(nce);
2489 		return (EINVAL);
2490 	}
2491 
2492 	if (nce == NULL) {
2493 		err = ndp_add(ill,
2494 		    (uchar_t *)lnr->lnr_hdw_addr,
2495 		    addr,
2496 		    &ipv6_all_ones,
2497 		    &ipv6_all_zeros,
2498 		    0,
2499 		    new_flags,
2500 		    lnr->lnr_state_create,
2501 		    &nce);
2502 		if (err != 0) {
2503 			mutex_exit(&ndp_g_lock);
2504 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
2505 			return (err);
2506 		}
2507 	}
2508 	old_flags = nce->nce_flags;
2509 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
2510 		/*
2511 		 * Router turned to host, delete all ires.
2512 		 * XXX Just delete the entry, but we need to add too.
2513 		 */
2514 		nce->nce_flags &= ~NCE_F_ISROUTER;
2515 		mutex_exit(&ndp_g_lock);
2516 		ndp_delete(nce);
2517 		NCE_REFRELE(nce);
2518 		return (0);
2519 	}
2520 	mutex_exit(&ndp_g_lock);
2521 
2522 	mutex_enter(&nce->nce_lock);
2523 	nce->nce_flags = new_flags;
2524 	mutex_exit(&nce->nce_lock);
2525 	/*
2526 	 * Note that we ignore the state at this point, which
2527 	 * should be either STALE or REACHABLE.  Instead we let
2528 	 * the link layer address passed in to determine the state
2529 	 * much like incoming packets.
2530 	 */
2531 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
2532 	NCE_REFRELE(nce);
2533 	return (0);
2534 }
2535 
2536 /*
2537  * If the device driver supports it, we make nce_fp_mp to have
2538  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
2539  * The caller insures there is hold on nce for this function.
2540  * Note that since ill_fastpath_probe() copies the mblk there is
2541  * no need for the hold beyond this function.
2542  */
2543 static void
2544 nce_fastpath(nce_t *nce)
2545 {
2546 	ill_t	*ill = nce->nce_ill;
2547 	int res;
2548 
2549 	ASSERT(ill != NULL);
2550 	if (nce->nce_fp_mp != NULL) {
2551 		/* Already contains fastpath info */
2552 		return;
2553 	}
2554 	if (nce->nce_res_mp != NULL) {
2555 		nce_fastpath_list_add(nce);
2556 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
2557 		/*
2558 		 * EAGAIN is an indication of a transient error
2559 		 * i.e. allocation failure etc. leave the nce in the list it
2560 		 * will be updated when another probe happens for another ire
2561 		 * if not it will be taken out of the list when the ire is
2562 		 * deleted.
2563 		 */
2564 
2565 		if (res != 0 && res != EAGAIN)
2566 			nce_fastpath_list_delete(nce);
2567 	}
2568 }
2569 
2570 /*
2571  * Drain the list of nce's waiting for fastpath response.
2572  */
2573 void
2574 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
2575     void *arg)
2576 {
2577 
2578 	nce_t *next_nce;
2579 	nce_t *current_nce;
2580 	nce_t *first_nce;
2581 	nce_t *prev_nce = NULL;
2582 
2583 	ASSERT(ill != NULL);
2584 
2585 	mutex_enter(&ill->ill_lock);
2586 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
2587 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
2588 		next_nce = current_nce->nce_fastpath;
2589 		/*
2590 		 * Take it off the list if we're flushing, or if the callback
2591 		 * routine tells us to do so.  Otherwise, leave the nce in the
2592 		 * fastpath list to handle any pending response from the lower
2593 		 * layer.  We can't drain the list when the callback routine
2594 		 * comparison failed, because the response is asynchronous in
2595 		 * nature, and may not arrive in the same order as the list
2596 		 * insertion.
2597 		 */
2598 		if (func == NULL || func(current_nce, arg)) {
2599 			current_nce->nce_fastpath = NULL;
2600 			if (current_nce == first_nce)
2601 				ill->ill_fastpath_list = first_nce = next_nce;
2602 			else
2603 				prev_nce->nce_fastpath = next_nce;
2604 		} else {
2605 			/* previous element that is still in the list */
2606 			prev_nce = current_nce;
2607 		}
2608 		current_nce = next_nce;
2609 	}
2610 	mutex_exit(&ill->ill_lock);
2611 }
2612 
2613 /*
2614  * Add nce to the nce fastpath list.
2615  */
2616 void
2617 nce_fastpath_list_add(nce_t *nce)
2618 {
2619 	ill_t *ill;
2620 
2621 	ill = nce->nce_ill;
2622 	ASSERT(ill != NULL);
2623 
2624 	mutex_enter(&ill->ill_lock);
2625 	mutex_enter(&nce->nce_lock);
2626 
2627 	/*
2628 	 * if nce has not been deleted and
2629 	 * is not already in the list add it.
2630 	 */
2631 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
2632 	    (nce->nce_fastpath == NULL)) {
2633 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
2634 		ill->ill_fastpath_list = nce;
2635 	}
2636 
2637 	mutex_exit(&nce->nce_lock);
2638 	mutex_exit(&ill->ill_lock);
2639 }
2640 
2641 /*
2642  * remove nce from the nce fastpath list.
2643  */
2644 void
2645 nce_fastpath_list_delete(nce_t *nce)
2646 {
2647 	nce_t *nce_ptr;
2648 
2649 	ill_t *ill;
2650 
2651 	ill = nce->nce_ill;
2652 	ASSERT(ill != NULL);
2653 
2654 	mutex_enter(&ill->ill_lock);
2655 	if (nce->nce_fastpath == NULL)
2656 		goto done;
2657 
2658 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
2659 
2660 	if (ill->ill_fastpath_list == nce) {
2661 		ill->ill_fastpath_list = nce->nce_fastpath;
2662 	} else {
2663 		nce_ptr = ill->ill_fastpath_list;
2664 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
2665 			if (nce_ptr->nce_fastpath == nce) {
2666 				nce_ptr->nce_fastpath = nce->nce_fastpath;
2667 				break;
2668 			}
2669 			nce_ptr = nce_ptr->nce_fastpath;
2670 		}
2671 	}
2672 
2673 	nce->nce_fastpath = NULL;
2674 done:
2675 	mutex_exit(&ill->ill_lock);
2676 }
2677 
2678 /*
2679  * Update all NCE's that are not in fastpath mode and
2680  * have an nce_fp_mp that matches mp. mp->b_cont contains
2681  * the fastpath header.
2682  *
2683  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
2684  */
2685 boolean_t
2686 ndp_fastpath_update(nce_t *nce, void *arg)
2687 {
2688 	mblk_t 	*mp, *fp_mp;
2689 	uchar_t	*mp_rptr, *ud_mp_rptr;
2690 	mblk_t	*ud_mp = nce->nce_res_mp;
2691 	ptrdiff_t	cmplen;
2692 
2693 	if (nce->nce_flags & NCE_F_MAPPING)
2694 		return (B_TRUE);
2695 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
2696 		return (B_TRUE);
2697 
2698 	ip2dbg(("ndp_fastpath_update: trying\n"));
2699 	mp = (mblk_t *)arg;
2700 	mp_rptr = mp->b_rptr;
2701 	cmplen = mp->b_wptr - mp_rptr;
2702 	ASSERT(cmplen >= 0);
2703 	ud_mp_rptr = ud_mp->b_rptr;
2704 	/*
2705 	 * The nce is locked here to prevent any other threads
2706 	 * from accessing and changing nce_res_mp when the IPv6 address
2707 	 * becomes resolved to an lla while we're in the middle
2708 	 * of looking at and comparing the hardware address (lla).
2709 	 * It is also locked to prevent multiple threads in nce_fastpath_update
2710 	 * from examining nce_res_mp atthe same time.
2711 	 */
2712 	mutex_enter(&nce->nce_lock);
2713 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
2714 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
2715 		mutex_exit(&nce->nce_lock);
2716 		/*
2717 		 * Don't take the ire off the fastpath list yet,
2718 		 * since the response may come later.
2719 		 */
2720 		return (B_FALSE);
2721 	}
2722 	/* Matched - install mp as the fastpath mp */
2723 	ip1dbg(("ndp_fastpath_update: match\n"));
2724 	fp_mp = dupb(mp->b_cont);
2725 	if (fp_mp != NULL) {
2726 		nce->nce_fp_mp = fp_mp;
2727 	}
2728 	mutex_exit(&nce->nce_lock);
2729 	return (B_TRUE);
2730 }
2731 
2732 /*
2733  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
2734  * driver.  Note that it assumes IP is exclusive...
2735  */
2736 /* ARGSUSED */
2737 void
2738 ndp_fastpath_flush(nce_t *nce, char *arg)
2739 {
2740 	if (nce->nce_flags & NCE_F_MAPPING)
2741 		return;
2742 	/* No fastpath info? */
2743 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
2744 		return;
2745 
2746 	/* Just delete the NCE... */
2747 	ndp_delete(nce);
2748 }
2749 
2750 /*
2751  * Return a pointer to a given option in the packet.
2752  * Assumes that option part of the packet have already been validated.
2753  */
2754 nd_opt_hdr_t *
2755 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
2756 {
2757 	while (optlen > 0) {
2758 		if (opt->nd_opt_type == opt_type)
2759 			return (opt);
2760 		optlen -= 8 * opt->nd_opt_len;
2761 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
2762 	}
2763 	return (NULL);
2764 }
2765 
2766 /*
2767  * Verify all option lengths present are > 0, also check to see
2768  * if the option lengths and packet length are consistent.
2769  */
2770 boolean_t
2771 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
2772 {
2773 	ASSERT(opt != NULL);
2774 	while (optlen > 0) {
2775 		if (opt->nd_opt_len == 0)
2776 			return (B_FALSE);
2777 		optlen -= 8 * opt->nd_opt_len;
2778 		if (optlen < 0)
2779 			return (B_FALSE);
2780 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
2781 	}
2782 	return (B_TRUE);
2783 }
2784 
2785 /*
2786  * ndp_walk function.
2787  * Free a fraction of the NCE cache entries.
2788  * A fraction of zero means to not free any in that category.
2789  */
2790 void
2791 ndp_cache_reclaim(nce_t *nce, char *arg)
2792 {
2793 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
2794 	uint_t	rand;
2795 
2796 	if (nce->nce_flags & NCE_F_PERMANENT)
2797 		return;
2798 
2799 	rand = (uint_t)lbolt +
2800 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
2801 	if (ncr->ncr_host != 0 &&
2802 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
2803 		ndp_delete(nce);
2804 		return;
2805 	}
2806 }
2807 
2808 /*
2809  * ndp_walk function.
2810  * Count the number of NCEs that can be deleted.
2811  * These would be hosts but not routers.
2812  */
2813 void
2814 ndp_cache_count(nce_t *nce, char *arg)
2815 {
2816 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
2817 
2818 	if (nce->nce_flags & NCE_F_PERMANENT)
2819 		return;
2820 
2821 	ncc->ncc_total++;
2822 	if (!(nce->nce_flags & NCE_F_ISROUTER))
2823 		ncc->ncc_host++;
2824 }
2825 
2826 #ifdef NCE_DEBUG
2827 th_trace_t *
2828 th_trace_nce_lookup(nce_t *nce)
2829 {
2830 	int bucket_id;
2831 	th_trace_t *th_trace;
2832 
2833 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2834 
2835 	bucket_id = IP_TR_HASH(curthread);
2836 	ASSERT(bucket_id < IP_TR_HASH_MAX);
2837 
2838 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
2839 	    th_trace = th_trace->th_next) {
2840 		if (th_trace->th_id == curthread)
2841 			return (th_trace);
2842 	}
2843 	return (NULL);
2844 }
2845 
2846 void
2847 nce_trace_ref(nce_t *nce)
2848 {
2849 	int bucket_id;
2850 	th_trace_t *th_trace;
2851 
2852 	/*
2853 	 * Attempt to locate the trace buffer for the curthread.
2854 	 * If it does not exist, then allocate a new trace buffer
2855 	 * and link it in list of trace bufs for this ipif, at the head
2856 	 */
2857 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2858 
2859 	if (nce->nce_trace_disable == B_TRUE)
2860 		return;
2861 
2862 	th_trace = th_trace_nce_lookup(nce);
2863 	if (th_trace == NULL) {
2864 		bucket_id = IP_TR_HASH(curthread);
2865 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
2866 		    KM_NOSLEEP);
2867 		if (th_trace == NULL) {
2868 			nce->nce_trace_disable = B_TRUE;
2869 			nce_trace_inactive(nce);
2870 			return;
2871 		}
2872 		th_trace->th_id = curthread;
2873 		th_trace->th_next = nce->nce_trace[bucket_id];
2874 		th_trace->th_prev = &nce->nce_trace[bucket_id];
2875 		if (th_trace->th_next != NULL)
2876 			th_trace->th_next->th_prev = &th_trace->th_next;
2877 		nce->nce_trace[bucket_id] = th_trace;
2878 	}
2879 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
2880 	th_trace->th_refcnt++;
2881 	th_trace_rrecord(th_trace);
2882 }
2883 
2884 void
2885 nce_untrace_ref(nce_t *nce)
2886 {
2887 	th_trace_t *th_trace;
2888 
2889 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2890 
2891 	if (nce->nce_trace_disable == B_TRUE)
2892 		return;
2893 
2894 	th_trace = th_trace_nce_lookup(nce);
2895 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
2896 
2897 	th_trace_rrecord(th_trace);
2898 	th_trace->th_refcnt--;
2899 }
2900 
2901 void
2902 nce_trace_inactive(nce_t *nce)
2903 {
2904 	th_trace_t *th_trace;
2905 	int i;
2906 
2907 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2908 
2909 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
2910 		while (nce->nce_trace[i] != NULL) {
2911 			th_trace = nce->nce_trace[i];
2912 
2913 			/* unlink th_trace and free it */
2914 			nce->nce_trace[i] = th_trace->th_next;
2915 			if (th_trace->th_next != NULL)
2916 				th_trace->th_next->th_prev =
2917 				    &nce->nce_trace[i];
2918 
2919 			th_trace->th_next = NULL;
2920 			th_trace->th_prev = NULL;
2921 			kmem_free(th_trace, sizeof (th_trace_t));
2922 		}
2923 	}
2924 
2925 }
2926 
2927 /* ARGSUSED */
2928 int
2929 nce_thread_exit(nce_t *nce, caddr_t arg)
2930 {
2931 	th_trace_t	*th_trace;
2932 
2933 	mutex_enter(&nce->nce_lock);
2934 	th_trace = th_trace_nce_lookup(nce);
2935 
2936 	if (th_trace == NULL) {
2937 		mutex_exit(&nce->nce_lock);
2938 		return (0);
2939 	}
2940 
2941 	ASSERT(th_trace->th_refcnt == 0);
2942 
2943 	/* unlink th_trace and free it */
2944 	*th_trace->th_prev = th_trace->th_next;
2945 	if (th_trace->th_next != NULL)
2946 		th_trace->th_next->th_prev = th_trace->th_prev;
2947 	th_trace->th_next = NULL;
2948 	th_trace->th_prev = NULL;
2949 	kmem_free(th_trace, sizeof (th_trace_t));
2950 	mutex_exit(&nce->nce_lock);
2951 	return (0);
2952 }
2953 #endif
2954