xref: /titanic_44/usr/src/uts/common/inet/ip/ip_ndp.c (revision e49962a00eea60555f3c78ebf58a9a641590802c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/sysmacros.h>
32 #include <sys/errno.h>
33 #include <sys/dlpi.h>
34 #include <sys/socket.h>
35 #include <sys/ddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 
42 #include <net/if.h>
43 #include <net/if_dl.h>
44 #include <net/route.h>
45 #include <netinet/in.h>
46 #include <netinet/ip6.h>
47 #include <netinet/icmp6.h>
48 
49 #include <inet/common.h>
50 #include <inet/mi.h>
51 #include <inet/mib2.h>
52 #include <inet/nd.h>
53 #include <inet/ip.h>
54 #include <inet/ip_if.h>
55 #include <inet/ip_ire.h>
56 #include <inet/ip_rts.h>
57 #include <inet/ip6.h>
58 #include <inet/ip_ndp.h>
59 #include <inet/ipsec_impl.h>
60 #include <inet/ipsec_info.h>
61 
62 /*
63  * Function names with nce_ prefix are static while function
64  * names with ndp_ prefix are used by rest of the IP.
65  */
66 
67 static	boolean_t nce_cmp_ll_addr(nce_t *nce, char *new_ll_addr,
68     uint32_t ll_addr_len);
69 static	void	nce_fastpath(nce_t *nce);
70 static	void	nce_ire_delete(nce_t *nce);
71 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
72 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
73 static	nce_t	*nce_lookup_addr(ill_t *ill, const in6_addr_t *addr);
74 static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
75 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
76     uchar_t *addr);
77 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
78 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
79 static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
80 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
81 static	void	nce_update(nce_t *nce, uint16_t new_state,
82     uchar_t *new_ll_addr);
83 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
84 static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
85     ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
86     const in6_addr_t *target, int flag);
87 static	void	lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf);
88 extern void	th_trace_rrecord(th_trace_t *);
89 
90 #ifdef NCE_DEBUG
91 void	nce_trace_inactive(nce_t *);
92 #endif
93 
94 /* NDP Cache Entry Hash Table */
95 #define	NCE_TABLE_SIZE	256
96 static	nce_t	*nce_hash_tbl[NCE_TABLE_SIZE];
97 static	nce_t	*nce_mask_entries;	/* mask not all ones */
98 static	int	ndp_g_walker = 0;	/* # of active thread */
99 					/* walking nce hash list */
100 /* ndp_g_walker_cleanup will be true, when deletion have to be defered */
101 static	boolean_t	ndp_g_walker_cleanup = B_FALSE;
102 
/*
 * Per-word constants used by IN6_IS_ADDR_MC_SOLICITEDNODE(), expressed as
 * they appear when the address is read through s6_addr32[] on this
 * byte order.
 */
#ifdef _BIG_ENDIAN
#define	NCE_SOLNODE_WORD0	0xff020000
#define	NCE_SOLNODE_WORD2	0x00000001
#define	NCE_SOLNODE_WORD3	0xff000000
#else	/* _BIG_ENDIAN */
#define	NCE_SOLNODE_WORD0	0x000002ff
#define	NCE_SOLNODE_WORD2	0x01000000
#define	NCE_SOLNODE_WORD3	0x000000ff
#endif	/* _BIG_ENDIAN */

/*
 * True when: all NCE_SOLNODE_WORD0 bits are set in word 0, word 1 is
 * zero, word 2 equals NCE_SOLNODE_WORD2 and all NCE_SOLNODE_WORD3 bits
 * are set in word 3.  Note that words 0 and 3 are bit-mask tests, not
 * equality tests.
 */
#define	IN6_IS_ADDR_MC_SOLICITEDNODE(addr) \
	((((addr)->s6_addr32[0] & NCE_SOLNODE_WORD0) == NCE_SOLNODE_WORD0) && \
	((addr)->s6_addr32[1] == 0x0) && \
	((addr)->s6_addr32[2] == NCE_SOLNODE_WORD2) && \
	(((addr)->s6_addr32[3] & NCE_SOLNODE_WORD3) == NCE_SOLNODE_WORD3))

/* Address of the hash bucket head that `addr' hashes to. */
#define	NCE_HASH_PTR(addr) \
	(&(nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
119 
120 /*
121  * NDP Cache Entry creation routine.
122  * Mapped entries will never do NUD .
123  * This routine must always be called with ndp_g_lock held.
124  * Prior to return, nce_refcnt is incremented.
125  */
126 int
127 ndp_add(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
128     const in6_addr_t *mask, const in6_addr_t *extract_mask,
129     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
130     nce_t **newnce)
131 {
132 static	nce_t		nce_nil;
133 	nce_t		*nce;
134 	mblk_t		*mp;
135 	mblk_t		*template;
136 	nce_t		**ncep;
137 	boolean_t	dropped = B_FALSE;
138 
139 	ASSERT(MUTEX_HELD(&ndp_g_lock));
140 	ASSERT(ill != NULL);
141 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
142 		ip0dbg(("ndp_add: no addr\n"));
143 		return (EINVAL);
144 	}
145 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
146 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
147 		return (EINVAL);
148 	}
149 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
150 	    (flags & NCE_F_MAPPING)) {
151 		ip0dbg(("ndp_add: extract mask zero for mapping"));
152 		return (EINVAL);
153 	}
154 	/*
155 	 * Allocate the mblk to hold the nce.
156 	 *
157 	 * XXX This can come out of a separate cache - nce_cache.
158 	 * We don't need the mp anymore as there are no more
159 	 * "qwriter"s
160 	 */
161 	mp = allocb(sizeof (nce_t), BPRI_MED);
162 	if (mp == NULL)
163 		return (ENOMEM);
164 
165 	nce = (nce_t *)mp->b_rptr;
166 	mp->b_wptr = (uchar_t *)&nce[1];
167 	*nce = nce_nil;
168 
169 	/*
170 	 * This one holds link layer address
171 	 */
172 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
173 		template = nce_udreq_alloc(ill);
174 	} else {
175 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
176 		ASSERT((ill->ill_resolver_mp != NULL));
177 		template = copyb(ill->ill_resolver_mp);
178 	}
179 	if (template == NULL) {
180 		freeb(mp);
181 		return (ENOMEM);
182 	}
183 	nce->nce_ill = ill;
184 	nce->nce_flags = flags;
185 	nce->nce_state = state;
186 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
187 	nce->nce_rcnt = ill->ill_xmit_count;
188 	nce->nce_addr = *addr;
189 	nce->nce_mask = *mask;
190 	nce->nce_extract_mask = *extract_mask;
191 	nce->nce_ll_extract_start = hw_extract_start;
192 	nce->nce_fp_mp = NULL;
193 	nce->nce_res_mp = template;
194 	if (state == ND_REACHABLE)
195 		nce->nce_last = TICK_TO_MSEC(lbolt64);
196 	else
197 		nce->nce_last = 0;
198 	nce->nce_qd_mp = NULL;
199 	nce->nce_mp = mp;
200 	if (hw_addr != NULL)
201 		nce_set_ll(nce, hw_addr);
202 	/* This one is for nce getting created */
203 	nce->nce_refcnt = 1;
204 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
205 	if (nce->nce_flags & NCE_F_MAPPING) {
206 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
207 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
208 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
209 		ncep = &nce_mask_entries;
210 	} else {
211 		ncep = ((nce_t **)NCE_HASH_PTR(*addr));
212 	}
213 
214 #ifdef NCE_DEBUG
215 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
216 #endif
217 	/*
218 	 * Atomically ensure that the ill is not CONDEMNED, before
219 	 * adding the NCE.
220 	 */
221 	mutex_enter(&ill->ill_lock);
222 	if (ill->ill_state_flags & ILL_CONDEMNED) {
223 		mutex_exit(&ill->ill_lock);
224 		freeb(mp);
225 		return (EINVAL);
226 	}
227 	if ((nce->nce_next = *ncep) != NULL)
228 		nce->nce_next->nce_ptpn = &nce->nce_next;
229 	*ncep = nce;
230 	nce->nce_ptpn = ncep;
231 	*newnce = nce;
232 	/* This one is for nce being used by an active thread */
233 	NCE_REFHOLD(*newnce);
234 
235 	/* Bump up the number of nce's referencing this ill */
236 	ill->ill_nce_cnt++;
237 	mutex_exit(&ill->ill_lock);
238 
239 	/*
240 	 * Before we insert the nce, honor the UNSOL_ADV flag.
241 	 * We cannot hold the ndp_g_lock and call nce_xmit
242 	 * which does a putnext.
243 	 */
244 	if (flags & NCE_F_UNSOL_ADV) {
245 		flags |= NDP_ORIDE;
246 		/*
247 		 * We account for the transmit below by assigning one
248 		 * less than the ndd variable. Subsequent decrements
249 		 * are done in ndp_timer.
250 		 */
251 		mutex_enter(&nce->nce_lock);
252 		mutex_exit(&ndp_g_lock);
253 		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
254 		mutex_exit(&nce->nce_lock);
255 		dropped = nce_xmit(ill,
256 		    ND_NEIGHBOR_ADVERT,
257 		    ill,	/* ill to be used for extracting ill_nd_lla */
258 		    B_TRUE,	/* use ill_nd_lla */
259 		    addr,	/* Source and target of the advertisement pkt */
260 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
261 		    flags);
262 		mutex_enter(&nce->nce_lock);
263 		if (dropped)
264 			nce->nce_unsolicit_count++;
265 		if (nce->nce_unsolicit_count != 0) {
266 			nce->nce_timeout_id = timeout(ndp_timer, nce,
267 			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
268 		}
269 		mutex_exit(&nce->nce_lock);
270 		mutex_enter(&ndp_g_lock);
271 	}
272 	/*
273 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
274 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
275 	 * We call nce_fastpath from nce_update if the link layer address of
276 	 * the peer changes from nce_update
277 	 */
278 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
279 		nce_fastpath(nce);
280 	return (0);
281 }
282 
283 int
284 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
285     const in6_addr_t *mask, const in6_addr_t *extract_mask,
286     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
287     nce_t **newnce)
288 {
289 	int	err = 0;
290 	nce_t	*nce;
291 
292 	mutex_enter(&ndp_g_lock);
293 	nce = nce_lookup_addr(ill, addr);
294 	if (nce == NULL) {
295 		err = ndp_add(ill,
296 		    hw_addr,
297 		    addr,
298 		    mask,
299 		    extract_mask,
300 		    hw_extract_start,
301 		    flags,
302 		    state,
303 		    newnce);
304 	} else {
305 		*newnce = nce;
306 		err = EEXIST;
307 	}
308 	mutex_exit(&ndp_g_lock);
309 	return (err);
310 }
311 
312 /*
313  * Remove all the CONDEMNED nces from the appropriate hash table.
314  * We create a private list of NCEs, these may have ires pointing
315  * to them, so the list will be passed through to clean up dependent
316  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
317  */
318 static void
319 nce_remove(nce_t *nce, nce_t **free_nce_list)
320 {
321 	nce_t *nce1;
322 	nce_t **ptpn;
323 
324 	ASSERT(MUTEX_HELD(&ndp_g_lock));
325 	ASSERT(ndp_g_walker == 0);
326 	for (; nce; nce = nce1) {
327 		nce1 = nce->nce_next;
328 		mutex_enter(&nce->nce_lock);
329 		if (nce->nce_flags & NCE_F_CONDEMNED) {
330 			ptpn = nce->nce_ptpn;
331 			nce1 = nce->nce_next;
332 			if (nce1 != NULL)
333 				nce1->nce_ptpn = ptpn;
334 			*ptpn = nce1;
335 			nce->nce_ptpn = NULL;
336 			nce->nce_next = NULL;
337 			nce->nce_next = *free_nce_list;
338 			*free_nce_list = nce;
339 		}
340 		mutex_exit(&nce->nce_lock);
341 	}
342 }
343 
344 /*
345  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
346  *    will return this NCE. Also no new IREs will be created that
347  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
348  *    be started (See NDP_RESTART_TIMER).
349  * 2. Cancel any currently running timeouts.
350  * 3. If there is an ndp walker, return. The walker will do the cleanup.
351  *    This ensures that walkers see a consistent list of NCEs while walking.
352  * 4. Otherwise remove the NCE from the list of NCEs
353  * 5. Delete all IREs pointing to this NCE.
354  */
355 void
356 ndp_delete(nce_t *nce)
357 {
358 	nce_t	**ptpn;
359 	nce_t	*nce1;
360 
361 	/* Serialize deletes */
362 	mutex_enter(&nce->nce_lock);
363 	if (nce->nce_flags & NCE_F_CONDEMNED) {
364 		/* Some other thread is doing the delete */
365 		mutex_exit(&nce->nce_lock);
366 		return;
367 	}
368 	/*
369 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
370 	 * refcnt has to be >= 2
371 	 */
372 	ASSERT(nce->nce_refcnt >= 2);
373 	nce->nce_flags |= NCE_F_CONDEMNED;
374 	mutex_exit(&nce->nce_lock);
375 
376 	nce_fastpath_list_delete(nce);
377 
378 	/*
379 	 * Cancel any running timer. Timeout can't be restarted
380 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
381 	 * Passing invalid timeout id is fine.
382 	 */
383 	if (nce->nce_timeout_id != 0) {
384 		(void) untimeout(nce->nce_timeout_id);
385 		nce->nce_timeout_id = 0;
386 	}
387 
388 	mutex_enter(&ndp_g_lock);
389 	if (nce->nce_ptpn == NULL) {
390 		/*
391 		 * The last ndp walker has already removed this nce from
392 		 * the list after we marked the nce CONDEMNED and before
393 		 * we grabbed the ndp_g_lock.
394 		 */
395 		mutex_exit(&ndp_g_lock);
396 		return;
397 	}
398 	if (ndp_g_walker > 0) {
399 		/*
400 		 * Can't unlink. The walker will clean up
401 		 */
402 		ndp_g_walker_cleanup = B_TRUE;
403 		mutex_exit(&ndp_g_lock);
404 		return;
405 	}
406 
407 	/*
408 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
409 	 * the timer since it is marked CONDEMNED.
410 	 */
411 	ptpn = nce->nce_ptpn;
412 	nce1 = nce->nce_next;
413 	if (nce1 != NULL)
414 		nce1->nce_ptpn = ptpn;
415 	*ptpn = nce1;
416 	nce->nce_ptpn = NULL;
417 	nce->nce_next = NULL;
418 	mutex_exit(&ndp_g_lock);
419 
420 	nce_ire_delete(nce);
421 }
422 
423 void
424 ndp_inactive(nce_t *nce)
425 {
426 	mblk_t		**mpp;
427 	ill_t		*ill;
428 
429 	ASSERT(nce->nce_refcnt == 0);
430 	ASSERT(MUTEX_HELD(&nce->nce_lock));
431 	ASSERT(nce->nce_fastpath == NULL);
432 
433 	/* Free all nce allocated messages */
434 	mpp = &nce->nce_first_mp_to_free;
435 	do {
436 		while (*mpp != NULL) {
437 			mblk_t  *mp;
438 
439 			mp = *mpp;
440 			*mpp = mp->b_next;
441 			mp->b_next = NULL;
442 			mp->b_prev = NULL;
443 			freemsg(mp);
444 		}
445 	} while (mpp++ != &nce->nce_last_mp_to_free);
446 
447 #ifdef NCE_DEBUG
448 	nce_trace_inactive(nce);
449 #endif
450 
451 	ill = nce->nce_ill;
452 	mutex_enter(&ill->ill_lock);
453 	ill->ill_nce_cnt--;
454 	/*
455 	 * If the number of nce's associated with this ill have dropped
456 	 * to zero, check whether we need to restart any operation that
457 	 * is waiting for this to happen.
458 	 */
459 	if (ill->ill_nce_cnt == 0) {
460 		/* ipif_ill_refrele_tail drops the ill_lock */
461 		ipif_ill_refrele_tail(ill);
462 	} else {
463 		mutex_exit(&ill->ill_lock);
464 	}
465 	mutex_destroy(&nce->nce_lock);
466 	freeb(nce->nce_mp);
467 }
468 
469 /*
470  * ndp_walk routine.  Delete the nce if it is associated with the ill
471  * that is going away.  Always called as a writer.
472  */
473 void
474 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
475 {
476 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
477 		ndp_delete(nce);
478 	}
479 }
480 
481 /*
482  * Walk a list of to be inactive NCEs and blow away all the ires.
483  */
484 static void
485 nce_ire_delete_list(nce_t *nce)
486 {
487 	nce_t *nce_next;
488 
489 	ASSERT(nce != NULL);
490 	while (nce != NULL) {
491 		nce_next = nce->nce_next;
492 		nce->nce_next = NULL;
493 
494 		/*
495 		 * It is possible for the last ndp walker (this thread)
496 		 * to come here after ndp_delete has marked the nce CONDEMNED
497 		 * and before it has removed the nce from the fastpath list
498 		 * or called untimeout. So we need to do it here. It is safe
499 		 * for both ndp_delete and this thread to do it twice or
500 		 * even simultaneously since each of the threads has a
501 		 * reference on the nce.
502 		 */
503 		nce_fastpath_list_delete(nce);
504 		/*
505 		 * Cancel any running timer. Timeout can't be restarted
506 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
507 		 * Passing invalid timeout id is fine.
508 		 */
509 		if (nce->nce_timeout_id != 0) {
510 			(void) untimeout(nce->nce_timeout_id);
511 			nce->nce_timeout_id = 0;
512 		}
513 
514 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
515 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
516 		NCE_REFRELE_NOTR(nce);
517 		nce = nce_next;
518 	}
519 }
520 
521 /*
522  * Delete an ire when the nce goes away.
523  */
524 /* ARGSUSED */
525 static void
526 nce_ire_delete(nce_t *nce)
527 {
528 	ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
529 	    nce_ire_delete1, (char *)nce, nce->nce_ill);
530 	NCE_REFRELE_NOTR(nce);
531 }
532 
533 /*
534  * ire_walk routine used to delete every IRE that shares this nce
535  */
536 static void
537 nce_ire_delete1(ire_t *ire, char *nce_arg)
538 {
539 	nce_t	*nce = (nce_t *)nce_arg;
540 
541 	ASSERT(ire->ire_type == IRE_CACHE);
542 
543 	if (ire->ire_nce == nce)
544 		ire_delete(ire);
545 }
546 
547 /*
548  * Cache entry lookup.  Try to find an nce matching the parameters passed.
549  * If one is found, the refcnt on the nce will be incremented.
550  */
551 nce_t *
552 ndp_lookup(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
553 {
554 	nce_t	*nce;
555 
556 	if (!caller_holds_lock)
557 		mutex_enter(&ndp_g_lock);
558 	nce = nce_lookup_addr(ill, addr);
559 	if (nce == NULL)
560 		nce = nce_lookup_mapping(ill, addr);
561 	if (!caller_holds_lock)
562 		mutex_exit(&ndp_g_lock);
563 	return (nce);
564 }
565 
566 /*
567  * Cache entry lookup.  Try to find an nce matching the parameters passed.
568  * Look only for exact entries (no mappings).  If an nce is found, increment
569  * the hold count on that nce.
570  */
571 static nce_t *
572 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
573 {
574 	nce_t	*nce;
575 
576 	ASSERT(ill != NULL);
577 	ASSERT(MUTEX_HELD(&ndp_g_lock));
578 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
579 		return (NULL);
580 	nce = *((nce_t **)NCE_HASH_PTR(*addr));
581 	for (; nce != NULL; nce = nce->nce_next) {
582 		if (nce->nce_ill == ill) {
583 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
584 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
585 			    &ipv6_all_ones)) {
586 				mutex_enter(&nce->nce_lock);
587 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
588 					NCE_REFHOLD_LOCKED(nce);
589 					mutex_exit(&nce->nce_lock);
590 					break;
591 				}
592 				mutex_exit(&nce->nce_lock);
593 			}
594 		}
595 	}
596 	return (nce);
597 }
598 
599 /*
600  * Cache entry lookup.  Try to find an nce matching the parameters passed.
601  * Look only for mappings.
602  */
603 static nce_t *
604 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
605 {
606 	nce_t	*nce;
607 
608 	ASSERT(ill != NULL);
609 	ASSERT(MUTEX_HELD(&ndp_g_lock));
610 	if (!IN6_IS_ADDR_MULTICAST(addr))
611 		return (NULL);
612 	nce = nce_mask_entries;
613 	for (; nce != NULL; nce = nce->nce_next)
614 		if (nce->nce_ill == ill &&
615 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
616 			mutex_enter(&nce->nce_lock);
617 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
618 				NCE_REFHOLD_LOCKED(nce);
619 				mutex_exit(&nce->nce_lock);
620 				break;
621 			}
622 			mutex_exit(&nce->nce_lock);
623 		}
624 	return (nce);
625 }
626 
627 /*
628  * Process passed in parameters either from an incoming packet or via
629  * user ioctl.
630  */
631 void
632 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
633 {
634 	ill_t	*ill = nce->nce_ill;
635 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
636 	mblk_t	*mp;
637 	boolean_t ll_updated = B_FALSE;
638 	boolean_t ll_changed;
639 
640 	/*
641 	 * No updates of link layer address or the neighbor state is
642 	 * allowed, when the cache is in NONUD state.  This still
643 	 * allows for responding to reachability solicitation.
644 	 */
645 	mutex_enter(&nce->nce_lock);
646 	if (nce->nce_state == ND_INCOMPLETE) {
647 		if (hw_addr == NULL) {
648 			mutex_exit(&nce->nce_lock);
649 			return;
650 		}
651 		nce_set_ll(nce, hw_addr);
652 		/*
653 		 * Update nce state and send the queued packets
654 		 * back to ip this time ire will be added.
655 		 */
656 		if (flag & ND_NA_FLAG_SOLICITED) {
657 			nce_update(nce, ND_REACHABLE, NULL);
658 		} else {
659 			nce_update(nce, ND_STALE, NULL);
660 		}
661 		mutex_exit(&nce->nce_lock);
662 		nce_fastpath(nce);
663 		mutex_enter(&nce->nce_lock);
664 		mp = nce->nce_qd_mp;
665 		nce->nce_qd_mp = NULL;
666 		mutex_exit(&nce->nce_lock);
667 		while (mp != NULL) {
668 			mblk_t *nxt_mp;
669 
670 			nxt_mp = mp->b_next;
671 			mp->b_next = NULL;
672 			if (mp->b_prev != NULL) {
673 				ill_t   *inbound_ill;
674 				queue_t *fwdq = NULL;
675 				uint_t ifindex;
676 
677 				ifindex = (uint_t)(uintptr_t)mp->b_prev;
678 				inbound_ill = ill_lookup_on_ifindex(ifindex,
679 				    B_TRUE, NULL, NULL, NULL, NULL);
680 				if (inbound_ill == NULL) {
681 					mp->b_prev = NULL;
682 					freemsg(mp);
683 					return;
684 				} else {
685 					fwdq = inbound_ill->ill_rq;
686 				}
687 				mp->b_prev = NULL;
688 				/*
689 				 * Send a forwarded packet back into ip_rput_v6
690 				 * just as in ire_send_v6().
691 				 * Extract the queue from b_prev (set in
692 				 * ip_rput_data_v6).
693 				 */
694 				if (fwdq != NULL) {
695 					/*
696 					 * Forwarded packets hop count will
697 					 * get decremented in ip_rput_data_v6
698 					 */
699 					put(fwdq, mp);
700 				} else {
701 					/*
702 					 * Send locally originated packets back
703 					 * into * ip_wput_v6.
704 					 */
705 					put(ill->ill_wq, mp);
706 				}
707 				ill_refrele(inbound_ill);
708 			} else {
709 				put(ill->ill_wq, mp);
710 			}
711 			mp = nxt_mp;
712 		}
713 		return;
714 	}
715 	ll_changed = nce_cmp_ll_addr(nce, (char *)hw_addr, hw_addr_len);
716 	if (!is_adv) {
717 		/* If this is a SOLICITATION request only */
718 		if (ll_changed)
719 			nce_update(nce, ND_STALE, hw_addr);
720 		mutex_exit(&nce->nce_lock);
721 		return;
722 	}
723 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
724 		/* If in any other state than REACHABLE, ignore */
725 		if (nce->nce_state == ND_REACHABLE) {
726 			nce_update(nce, ND_STALE, NULL);
727 		}
728 		mutex_exit(&nce->nce_lock);
729 		return;
730 	} else {
731 		if (ll_changed) {
732 			nce_update(nce, ND_UNCHANGED, hw_addr);
733 			ll_updated = B_TRUE;
734 		}
735 		if (flag & ND_NA_FLAG_SOLICITED) {
736 			nce_update(nce, ND_REACHABLE, NULL);
737 		} else {
738 			if (ll_updated) {
739 				nce_update(nce, ND_STALE, NULL);
740 			}
741 		}
742 		mutex_exit(&nce->nce_lock);
743 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
744 		    NCE_F_ISROUTER)) {
745 			ire_t *ire;
746 
747 			/*
748 			 * Router turned to host.  We need to remove the
749 			 * entry as well as any default route that may be
750 			 * using this as a next hop.  This is required by
751 			 * section 7.2.5 of RFC 2461.
752 			 */
753 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
754 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
755 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
756 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
757 			    MATCH_IRE_DEFAULT);
758 			if (ire != NULL) {
759 				ip_rts_rtmsg(RTM_DELETE, ire, 0);
760 				ire_delete(ire);
761 				ire_refrele(ire);
762 			}
763 			ndp_delete(nce);
764 		}
765 	}
766 }
767 
768 /*
769  * Pass arg1 to the pfi supplied, along with each nce in existence.
770  * ndp_walk() places a REFHOLD on the nce and drops the lock when
771  * walking the hash list.
772  */
773 void
774 ndp_walk_impl(ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace)
775 {
776 
777 	nce_t	*nce;
778 	nce_t	*nce1;
779 	nce_t	**ncep;
780 	nce_t	*free_nce_list = NULL;
781 
782 	mutex_enter(&ndp_g_lock);
783 	ndp_g_walker++;	/* Prevent ndp_delete from unlink and free of NCE */
784 	mutex_exit(&ndp_g_lock);
785 	for (ncep = nce_hash_tbl; ncep < A_END(nce_hash_tbl); ncep++) {
786 		for (nce = *ncep; nce; nce = nce1) {
787 			nce1 = nce->nce_next;
788 			if (ill == NULL || nce->nce_ill == ill) {
789 				if (trace) {
790 					NCE_REFHOLD(nce);
791 					(*pfi)(nce, arg1);
792 					NCE_REFRELE(nce);
793 				} else {
794 					NCE_REFHOLD_NOTR(nce);
795 					(*pfi)(nce, arg1);
796 					NCE_REFRELE_NOTR(nce);
797 				}
798 			}
799 		}
800 	}
801 	for (nce = nce_mask_entries; nce; nce = nce1) {
802 		nce1 = nce->nce_next;
803 		if (ill == NULL || nce->nce_ill == ill) {
804 			if (trace) {
805 				NCE_REFHOLD(nce);
806 				(*pfi)(nce, arg1);
807 				NCE_REFRELE(nce);
808 			} else {
809 				NCE_REFHOLD_NOTR(nce);
810 				(*pfi)(nce, arg1);
811 				NCE_REFRELE_NOTR(nce);
812 			}
813 		}
814 	}
815 	mutex_enter(&ndp_g_lock);
816 	ndp_g_walker--;
817 	/*
818 	 * While NCE's are removed from global list they are placed
819 	 * in a private list, to be passed to nce_ire_delete_list().
820 	 * The reason is, there may be ires pointing to this nce
821 	 * which needs to cleaned up.
822 	 */
823 	if (ndp_g_walker_cleanup && ndp_g_walker == 0) {
824 		/* Time to delete condemned entries */
825 		for (ncep = nce_hash_tbl; ncep < A_END(nce_hash_tbl); ncep++) {
826 			nce = *ncep;
827 			if (nce != NULL) {
828 				nce_remove(nce, &free_nce_list);
829 			}
830 		}
831 		nce = nce_mask_entries;
832 		if (nce != NULL) {
833 			nce_remove(nce, &free_nce_list);
834 		}
835 		ndp_g_walker_cleanup = B_FALSE;
836 	}
837 	mutex_exit(&ndp_g_lock);
838 
839 	if (free_nce_list != NULL) {
840 		nce_ire_delete_list(free_nce_list);
841 	}
842 }
843 
844 void
845 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1)
846 {
847 	ndp_walk_impl(ill, pfi, arg1, B_TRUE);
848 }
849 
850 /*
851  * Prepend the zoneid using an ipsec_out_t for later use by functions like
852  * ip_rput_v6() after neighbor discovery has taken place.  If the message
853  * block already has a M_CTL at the front of it, then simply set the zoneid
854  * appropriately.
855  */
856 static mblk_t *
857 ndp_prepend_zone(mblk_t *mp, zoneid_t zoneid)
858 {
859 	mblk_t		*first_mp;
860 	ipsec_out_t	*io;
861 
862 	ASSERT(zoneid != ALL_ZONES);
863 	if (mp->b_datap->db_type == M_CTL) {
864 		io = (ipsec_out_t *)mp->b_rptr;
865 		ASSERT(io->ipsec_out_type == IPSEC_OUT);
866 		io->ipsec_out_zoneid = zoneid;
867 		return (mp);
868 	}
869 
870 	first_mp = ipsec_alloc_ipsec_out();
871 	if (first_mp == NULL)
872 		return (NULL);
873 	io = (ipsec_out_t *)first_mp->b_rptr;
874 	/* This is not a secure packet */
875 	io->ipsec_out_secure = B_FALSE;
876 	io->ipsec_out_zoneid = zoneid;
877 	first_mp->b_cont = mp;
878 	return (first_mp);
879 }
880 
881 /*
882  * Process resolve requests.  Handles both mapped entries
883  * as well as cases that needs to be send out on the wire.
884  * Lookup a NCE for a given IRE.  Regardless of whether one exists
885  * or one is created, we defer making ire point to nce until the
886  * ire is actually added at which point the nce_refcnt on the nce is
887  * incremented.  This is done primarily to have symmetry between ire_add()
888  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
889  */
890 int
891 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
892 {
893 	nce_t		*nce;
894 	int		err = 0;
895 	uint32_t	ms;
896 	mblk_t		*mp_nce = NULL;
897 
898 	ASSERT(ill != NULL);
899 	if (IN6_IS_ADDR_MULTICAST(dst)) {
900 		err = nce_set_multicast(ill, dst);
901 		return (err);
902 	}
903 	err = ndp_lookup_then_add(ill,
904 	    NULL,	/* No hardware address */
905 	    dst,
906 	    &ipv6_all_ones,
907 	    &ipv6_all_zeros,
908 	    0,
909 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
910 	    ND_INCOMPLETE,
911 	    &nce);
912 
913 	switch (err) {
914 	case 0:
915 		/*
916 		 * New cache entry was created. Make sure that the state
917 		 * is not ND_INCOMPLETE. It can be in some other state
918 		 * even before we send out the solicitation as we could
919 		 * get un-solicited advertisements.
920 		 *
921 		 * If this is an XRESOLV interface, simply return 0,
922 		 * since we don't want to solicit just yet.
923 		 */
924 		if (ill->ill_flags & ILLF_XRESOLV) {
925 			NCE_REFRELE(nce);
926 			return (0);
927 		}
928 		rw_enter(&ill_g_lock, RW_READER);
929 		mutex_enter(&nce->nce_lock);
930 		if (nce->nce_state != ND_INCOMPLETE) {
931 			mutex_exit(&nce->nce_lock);
932 			rw_exit(&ill_g_lock);
933 			NCE_REFRELE(nce);
934 			return (0);
935 		}
936 		mp_nce = ndp_prepend_zone(mp, zoneid);
937 		if (mp_nce == NULL) {
938 			/* The caller will free mp */
939 			mutex_exit(&nce->nce_lock);
940 			rw_exit(&ill_g_lock);
941 			ndp_delete(nce);
942 			NCE_REFRELE(nce);
943 			return (ENOMEM);
944 		}
945 		ms = nce_solicit(nce, mp_nce);
946 		rw_exit(&ill_g_lock);
947 		if (ms == 0) {
948 			/* The caller will free mp */
949 			if (mp_nce != mp)
950 				freeb(mp_nce);
951 			mutex_exit(&nce->nce_lock);
952 			ndp_delete(nce);
953 			NCE_REFRELE(nce);
954 			return (EBUSY);
955 		}
956 		mutex_exit(&nce->nce_lock);
957 		NDP_RESTART_TIMER(nce, (clock_t)ms);
958 		NCE_REFRELE(nce);
959 		return (EINPROGRESS);
960 	case EEXIST:
961 		/* Resolution in progress just queue the packet */
962 		mutex_enter(&nce->nce_lock);
963 		if (nce->nce_state == ND_INCOMPLETE) {
964 			mp_nce = ndp_prepend_zone(mp, zoneid);
965 			if (mp_nce == NULL) {
966 				err = ENOMEM;
967 			} else {
968 				nce_queue_mp(nce, mp_nce);
969 				err = EINPROGRESS;
970 			}
971 		} else {
972 			/*
973 			 * Any other state implies we have
974 			 * a nce but IRE needs to be added ...
975 			 * ire_add_v6() will take care of the
976 			 * the case when the nce becomes CONDEMNED
977 			 * before the ire is added to the table.
978 			 */
979 			err = 0;
980 		}
981 		mutex_exit(&nce->nce_lock);
982 		NCE_REFRELE(nce);
983 		break;
984 	default:
985 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
986 		break;
987 	}
988 	return (err);
989 }
990 
991 /*
992  * When there is no resolver, the link layer template is passed in
993  * the IRE.
994  * Lookup a NCE for a given IRE.  Regardless of whether one exists
995  * or one is created, we defer making ire point to nce until the
996  * ire is actually added at which point the nce_refcnt on the nce is
997  * incremented.  This is done primarily to have symmetry between ire_add()
998  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
999  */
1000 int
1001 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1002 {
1003 	nce_t		*nce;
1004 	int		err = 0;
1005 
1006 	ASSERT(ill != NULL);
1007 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1008 		err = nce_set_multicast(ill, dst);
1009 		return (err);
1010 	}
1011 
1012 	err = ndp_lookup_then_add(ill,
1013 	    NULL,	/* hardware address */
1014 	    dst,
1015 	    &ipv6_all_ones,
1016 	    &ipv6_all_zeros,
1017 	    0,
1018 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1019 	    ND_REACHABLE,
1020 	    &nce);
1021 
1022 	switch (err) {
1023 	case 0:
1024 		/*
1025 		 * Cache entry with a proper resolver cookie was
1026 		 * created.
1027 		 */
1028 		NCE_REFRELE(nce);
1029 		break;
1030 	case EEXIST:
1031 		err = 0;
1032 		NCE_REFRELE(nce);
1033 		break;
1034 	default:
1035 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1036 		break;
1037 	}
1038 	return (err);
1039 }
1040 
1041 /*
1042  * For each interface an entry is added for the unspecified multicast group.
1043  * Here that mapping is used to form the multicast cache entry for a particular
1044  * multicast destination.
1045  */
1046 static int
1047 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1048 {
1049 	nce_t		*mnce;	/* Multicast mapping entry */
1050 	nce_t		*nce;
1051 	uchar_t		*hw_addr = NULL;
1052 	int		err = 0;
1053 
1054 	ASSERT(ill != NULL);
1055 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1056 
1057 	mutex_enter(&ndp_g_lock);
1058 	nce = nce_lookup_addr(ill, dst);
1059 	if (nce != NULL) {
1060 		mutex_exit(&ndp_g_lock);
1061 		NCE_REFRELE(nce);
1062 		return (0);
1063 	}
1064 	/* No entry, now lookup for a mapping this should never fail */
1065 	mnce = nce_lookup_mapping(ill, dst);
1066 	if (mnce == NULL) {
1067 		/* Something broken for the interface. */
1068 		mutex_exit(&ndp_g_lock);
1069 		return (ESRCH);
1070 	}
1071 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1072 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1073 		/*
1074 		 * For IRE_IF_RESOLVER a hardware mapping can be
1075 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1076 		 * in the ill is copied in ndp_add().
1077 		 */
1078 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1079 		if (hw_addr == NULL) {
1080 			mutex_exit(&ndp_g_lock);
1081 			NCE_REFRELE(mnce);
1082 			return (ENOMEM);
1083 		}
1084 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1085 	}
1086 	NCE_REFRELE(mnce);
1087 	/*
1088 	 * IRE_IF_NORESOLVER type simply copies the resolution
1089 	 * cookie passed in.  So no hw_addr is needed.
1090 	 */
1091 	err = ndp_add(ill,
1092 	    hw_addr,
1093 	    dst,
1094 	    &ipv6_all_ones,
1095 	    &ipv6_all_zeros,
1096 	    0,
1097 	    NCE_F_NONUD,
1098 	    ND_REACHABLE,
1099 	    &nce);
1100 	mutex_exit(&ndp_g_lock);
1101 	if (hw_addr != NULL)
1102 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1103 	if (err != 0) {
1104 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1105 		return (err);
1106 	}
1107 	NCE_REFRELE(nce);
1108 	return (0);
1109 }
1110 
1111 /*
1112  * Return the link layer address, and any flags of a nce.
1113  */
1114 int
1115 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1116 {
1117 	nce_t		*nce;
1118 	in6_addr_t	*addr;
1119 	sin6_t		*sin6;
1120 	dl_unitdata_req_t	*dl;
1121 
1122 	ASSERT(ill != NULL);
1123 	sin6 = (sin6_t *)&lnr->lnr_addr;
1124 	addr =  &sin6->sin6_addr;
1125 
1126 	nce = ndp_lookup(ill, addr, B_FALSE);
1127 	if (nce == NULL)
1128 		return (ESRCH);
1129 	/* If in INCOMPLETE state, no link layer address is available yet */
1130 	if (nce->nce_state == ND_INCOMPLETE)
1131 		goto done;
1132 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1133 	if (ill->ill_flags & ILLF_XRESOLV)
1134 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1135 	else
1136 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1137 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1138 	    sizeof (lnr->lnr_hdw_addr));
1139 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1140 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1141 	if (nce->nce_flags & NCE_F_ISROUTER)
1142 		lnr->lnr_flags = NDF_ISROUTER_ON;
1143 	if (nce->nce_flags & NCE_F_PROXY)
1144 		lnr->lnr_flags |= NDF_PROXY_ON;
1145 	if (nce->nce_flags & NCE_F_ANYCAST)
1146 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1147 done:
1148 	NCE_REFRELE(nce);
1149 	return (0);
1150 }
1151 
1152 /*
1153  * Send Enable/Disable multicast reqs to driver.
1154  */
1155 int
1156 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1157     uint32_t hw_addr_offset, mblk_t *mp)
1158 {
1159 	nce_t		*nce;
1160 	uchar_t		*hw_addr;
1161 
1162 	ASSERT(ill != NULL);
1163 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1164 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1165 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1166 		freemsg(mp);
1167 		return (EINVAL);
1168 	}
1169 	mutex_enter(&ndp_g_lock);
1170 	nce = nce_lookup_mapping(ill, addr);
1171 	if (nce == NULL) {
1172 		mutex_exit(&ndp_g_lock);
1173 		freemsg(mp);
1174 		return (ESRCH);
1175 	}
1176 	mutex_exit(&ndp_g_lock);
1177 	/*
1178 	 * Update dl_addr_length and dl_addr_offset for primitives that
1179 	 * have physical addresses as opposed to full saps
1180 	 */
1181 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1182 	case DL_ENABMULTI_REQ:
1183 		/* Track the state if this is the first enabmulti */
1184 		if (ill->ill_dlpi_multicast_state == IDMS_UNKNOWN)
1185 			ill->ill_dlpi_multicast_state = IDMS_INPROGRESS;
1186 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1187 		break;
1188 	case DL_DISABMULTI_REQ:
1189 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1190 		break;
1191 	default:
1192 		NCE_REFRELE(nce);
1193 		ip1dbg(("ndp_mcastreq: default\n"));
1194 		return (EINVAL);
1195 	}
1196 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1197 	NCE_REFRELE(nce);
1198 	putnext(ill->ill_wq, mp);
1199 	return (0);
1200 }
1201 
1202 /*
1203  * Send a neighbor solicitation.
1204  * Returns number of milliseconds after which we should either rexmit or abort.
1205  * Return of zero means we should abort.
1206  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1207  *
1208  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1209  * the packet.
1210  * NOTE: This routine does not consume mp.
1211  */
1212 uint32_t
1213 nce_solicit(nce_t *nce, mblk_t *mp)
1214 {
1215 	ill_t		*ill;
1216 	ill_t		*src_ill;
1217 	ip6_t		*ip6h;
1218 	in6_addr_t	src;
1219 	in6_addr_t	dst;
1220 	ipif_t		*ipif;
1221 	ip6i_t		*ip6i;
1222 	boolean_t	dropped = B_FALSE;
1223 
1224 	ASSERT(RW_READ_HELD(&ill_g_lock));
1225 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1226 	ill = nce->nce_ill;
1227 	ASSERT(ill != NULL);
1228 
1229 	if (nce->nce_rcnt == 0) {
1230 		return (0);
1231 	}
1232 
1233 	if (mp == NULL) {
1234 		ASSERT(nce->nce_qd_mp != NULL);
1235 		mp = nce->nce_qd_mp;
1236 	} else {
1237 		nce_queue_mp(nce, mp);
1238 	}
1239 
1240 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1241 	if (mp->b_datap->db_type == M_CTL)
1242 		mp = mp->b_cont;
1243 
1244 	ip6h = (ip6_t *)mp->b_rptr;
1245 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1246 		/*
1247 		 * This message should have been pulled up already in
1248 		 * ip_wput_v6. We can't do pullups here because the message
1249 		 * could be from the nce_qd_mp which could have b_next/b_prev
1250 		 * non-NULL.
1251 		 */
1252 		ip6i = (ip6i_t *)ip6h;
1253 		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
1254 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
1255 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1256 	}
1257 	src = ip6h->ip6_src;
1258 	/*
1259 	 * If the src of outgoing packet is one of the assigned interface
1260 	 * addresses use it, otherwise we will pick the source address below.
1261 	 */
1262 	src_ill = ill;
1263 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1264 		if (ill->ill_group != NULL)
1265 			src_ill = ill->ill_group->illgrp_ill;
1266 		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
1267 			for (ipif = src_ill->ill_ipif; ipif != NULL;
1268 			    ipif = ipif->ipif_next) {
1269 				if (IN6_ARE_ADDR_EQUAL(&src,
1270 				    &ipif->ipif_v6lcl_addr)) {
1271 					break;
1272 				}
1273 			}
1274 			if (ipif != NULL)
1275 				break;
1276 		}
1277 		if (src_ill == NULL) {
1278 			/* May be a forwarding packet */
1279 			src_ill = ill;
1280 			src = ipv6_all_zeros;
1281 		}
1282 	}
1283 	dst = nce->nce_addr;
1284 	/*
1285 	 * If source address is unspecified, nce_xmit will choose
1286 	 * one for us and initialize the hardware address also
1287 	 * appropriately.
1288 	 */
1289 	if (IN6_IS_ADDR_UNSPECIFIED(&src))
1290 		src_ill  = NULL;
1291 	nce->nce_rcnt--;
1292 	mutex_exit(&nce->nce_lock);
1293 	rw_exit(&ill_g_lock);
1294 	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
1295 	    &dst, 0);
1296 	rw_enter(&ill_g_lock, RW_READER);
1297 	mutex_enter(&nce->nce_lock);
1298 	if (dropped)
1299 		nce->nce_rcnt++;
1300 	return (ill->ill_reachable_retrans_time);
1301 }
1302 
1303 void
1304 ndp_input_solicit(ill_t *ill, mblk_t *mp)
1305 {
1306 	nd_neighbor_solicit_t *ns;
1307 	uint32_t	hlen = ill->ill_nd_lla_len;
1308 	uchar_t		*haddr = NULL;
1309 	icmp6_t		*icmp_nd;
1310 	ip6_t		*ip6h;
1311 	nce_t		*our_nce = NULL;
1312 	in6_addr_t	target;
1313 	in6_addr_t	src;
1314 	int		len;
1315 	int		flag = 0;
1316 	nd_opt_hdr_t	*opt = NULL;
1317 	boolean_t	bad_solicit = B_FALSE;
1318 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1319 
1320 	ip6h = (ip6_t *)mp->b_rptr;
1321 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1322 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1323 	src = ip6h->ip6_src;
1324 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1325 	target = ns->nd_ns_target;
1326 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1327 		if (ip_debug > 2) {
1328 			/* ip1dbg */
1329 			pr_addr_dbg("ndp_input_solicit: Target is"
1330 			    " multicast! %s\n", AF_INET6, &target);
1331 		}
1332 		bad_solicit = B_TRUE;
1333 		goto done;
1334 	}
1335 	if (len > sizeof (nd_neighbor_solicit_t)) {
1336 		/* Options present */
1337 		opt = (nd_opt_hdr_t *)&ns[1];
1338 		len -= sizeof (nd_neighbor_solicit_t);
1339 		if (!ndp_verify_optlen(opt, len)) {
1340 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1341 			bad_solicit = B_TRUE;
1342 			goto done;
1343 		}
1344 	}
1345 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1346 		/* Check to see if this is a valid DAD solicitation */
1347 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1348 			if (ip_debug > 2) {
1349 				/* ip1dbg */
1350 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1351 				    "Destination is not solicited node "
1352 				    "multicast %s\n", AF_INET6,
1353 				    &ip6h->ip6_dst);
1354 			}
1355 			bad_solicit = B_TRUE;
1356 			goto done;
1357 		}
1358 	}
1359 
1360 	our_nce = ndp_lookup(ill, &target, B_FALSE);
1361 	/*
1362 	 * If this is a valid Solicitation, a permanent
1363 	 * entry should exist in the cache
1364 	 */
1365 	if (our_nce == NULL ||
1366 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1367 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1368 		    "ifname=%s ", ill->ill_name));
1369 		if (ip_debug > 2) {
1370 			/* ip1dbg */
1371 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1372 		}
1373 		bad_solicit = B_TRUE;
1374 		goto done;
1375 	}
1376 
1377 	/* At this point we should have a verified NS per spec */
1378 	if (opt != NULL) {
1379 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1380 		if (opt != NULL) {
1381 			/*
1382 			 * No source link layer address option should
1383 			 * be present in a valid DAD request.
1384 			 */
1385 			if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1386 				ip1dbg(("ndp_input_solicit: source link-layer "
1387 				    "address option present with an "
1388 				    "unspecified source. \n"));
1389 				bad_solicit = B_TRUE;
1390 				goto done;
1391 			}
1392 			haddr = (uchar_t *)&opt[1];
1393 			if (hlen > opt->nd_opt_len * 8 ||
1394 			    hlen == 0) {
1395 				bad_solicit = B_TRUE;
1396 				goto done;
1397 			}
1398 		}
1399 	}
1400 	/*
1401 	 * haddr can be NULL if no options are present,
1402 	 * or no Source link layer address is present in,
1403 	 * recvd NDP options of solicitation message.
1404 	 */
1405 	if (haddr == NULL) {
1406 		nce_t   *nnce;
1407 		mutex_enter(&ndp_g_lock);
1408 		nnce = nce_lookup_addr(ill, &src);
1409 		mutex_exit(&ndp_g_lock);
1410 
1411 		if (nnce == NULL) {
1412 			in6_addr_t dst = ipv6_solicited_node_mcast;
1413 
1414 			/* Form solicited node multicast address */
1415 			dst.s6_addr32[3] |= src.s6_addr32[3];
1416 			(void) nce_xmit(ill,
1417 				ND_NEIGHBOR_SOLICIT,
1418 				ill,
1419 				B_TRUE,
1420 				&target,
1421 				&dst,
1422 				flag);
1423 			bad_solicit = B_TRUE;
1424 			goto done;
1425 		}
1426 	}
1427 	/* Set override flag, it will be reset later if need be. */
1428 	flag |= NDP_ORIDE;
1429 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1430 		flag |= NDP_UNICAST;
1431 	}
1432 
1433 	/*
1434 	 * Create/update the entry for the soliciting node.
1435 	 * or respond to outstanding queries, don't if
1436 	 * the source is unspecified address.
1437 	 */
1438 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1439 		int	err = 0;
1440 		nce_t	*nnce;
1441 
1442 		err = ndp_lookup_then_add(ill,
1443 		    haddr,
1444 		    &src,	/* Soliciting nodes address */
1445 		    &ipv6_all_ones,
1446 		    &ipv6_all_zeros,
1447 		    0,
1448 		    0,
1449 		    ND_STALE,
1450 		    &nnce);
1451 		switch (err) {
1452 		case 0:
1453 			/* done with this entry */
1454 			NCE_REFRELE(nnce);
1455 			break;
1456 		case EEXIST:
1457 			/*
1458 			 * B_FALSE indicates this is not an
1459 			 * an advertisement.
1460 			 */
1461 			ndp_process(nnce, haddr, 0, B_FALSE);
1462 			NCE_REFRELE(nnce);
1463 			break;
1464 		default:
1465 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1466 			    err));
1467 			goto done;
1468 		}
1469 		flag |= NDP_SOLICITED;
1470 	} else {
1471 		/*
1472 		 * This is a DAD req, multicast the advertisement
1473 		 * to the all-nodes address.
1474 		 */
1475 		src = ipv6_all_hosts_mcast;
1476 	}
1477 	if (our_nce->nce_flags & NCE_F_ISROUTER)
1478 		flag |= NDP_ISROUTER;
1479 	if (our_nce->nce_flags & NCE_F_PROXY)
1480 		flag &= ~NDP_ORIDE;
1481 	/* Response to a solicitation */
1482 	(void) nce_xmit(ill,
1483 	    ND_NEIGHBOR_ADVERT,
1484 	    ill,	/* ill to be used for extracting ill_nd_lla */
1485 	    B_TRUE,	/* use ill_nd_lla */
1486 	    &target,	/* Source and target of the advertisement pkt */
1487 	    &src,	/* IP Destination (source of original pkt) */
1488 	    flag);
1489 done:
1490 	if (bad_solicit)
1491 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1492 	if (our_nce != NULL)
1493 		NCE_REFRELE(our_nce);
1494 }
1495 
1496 void
1497 ndp_input_advert(ill_t *ill, mblk_t *mp)
1498 {
1499 	nd_neighbor_advert_t *na;
1500 	uint32_t	hlen = ill->ill_nd_lla_len;
1501 	uchar_t		*haddr = NULL;
1502 	icmp6_t		*icmp_nd;
1503 	ip6_t		*ip6h;
1504 	nce_t		*dst_nce = NULL;
1505 	in6_addr_t	target;
1506 	nd_opt_hdr_t	*opt = NULL;
1507 	int		len;
1508 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1509 
1510 	ip6h = (ip6_t *)mp->b_rptr;
1511 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1512 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1513 	na = (nd_neighbor_advert_t *)icmp_nd;
1514 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1515 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1516 		ip1dbg(("ndp_input_advert: Target is multicast but the "
1517 		    "solicited flag is not zero\n"));
1518 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1519 		return;
1520 	}
1521 	target = na->nd_na_target;
1522 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1523 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
1524 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1525 		return;
1526 	}
1527 	if (len > sizeof (nd_neighbor_advert_t)) {
1528 		opt = (nd_opt_hdr_t *)&na[1];
1529 		if (!ndp_verify_optlen(opt,
1530 		    len - sizeof (nd_neighbor_advert_t))) {
1531 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1532 			return;
1533 		}
1534 		/* At this point we have a verified NA per spec */
1535 		len -= sizeof (nd_neighbor_advert_t);
1536 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1537 		if (opt != NULL) {
1538 			haddr = (uchar_t *)&opt[1];
1539 			if (hlen > opt->nd_opt_len * 8 ||
1540 			    hlen == 0) {
1541 				BUMP_MIB(mib,
1542 				    ipv6IfIcmpInBadNeighborAdvertisements);
1543 				return;
1544 			}
1545 		}
1546 	}
1547 
1548 	/*
1549 	 * If this interface is part of the group look at all the
1550 	 * ills in the group.
1551 	 */
1552 	rw_enter(&ill_g_lock, RW_READER);
1553 	if (ill->ill_group != NULL)
1554 		ill = ill->ill_group->illgrp_ill;
1555 
1556 	for (; ill != NULL; ill = ill->ill_group_next) {
1557 		mutex_enter(&ill->ill_lock);
1558 		if (!ILL_CAN_LOOKUP(ill)) {
1559 			mutex_exit(&ill->ill_lock);
1560 			continue;
1561 		}
1562 		ill_refhold_locked(ill);
1563 		mutex_exit(&ill->ill_lock);
1564 		dst_nce = ndp_lookup(ill, &target, B_FALSE);
1565 		/* We have to drop the lock since ndp_process calls put* */
1566 		rw_exit(&ill_g_lock);
1567 		if (dst_nce != NULL) {
1568 			if (na->nd_na_flags_reserved &
1569 			    ND_NA_FLAG_ROUTER) {
1570 				dst_nce->nce_flags |= NCE_F_ISROUTER;
1571 			}
1572 			/* B_TRUE indicates this an advertisement */
1573 			ndp_process(dst_nce, haddr,
1574 				na->nd_na_flags_reserved, B_TRUE);
1575 			NCE_REFRELE(dst_nce);
1576 		}
1577 		rw_enter(&ill_g_lock, RW_READER);
1578 		ill_refrele(ill);
1579 	}
1580 	rw_exit(&ill_g_lock);
1581 }
1582 
1583 /*
1584  * Process NDP neighbor solicitation/advertisement messages.
1585  * The checksum has already checked o.k before reaching here.
1586  */
1587 void
1588 ndp_input(ill_t *ill, mblk_t *mp)
1589 {
1590 	icmp6_t		*icmp_nd;
1591 	ip6_t		*ip6h;
1592 	int		len;
1593 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1594 
1595 
1596 	if (!pullupmsg(mp, -1)) {
1597 		ip1dbg(("ndp_input: pullupmsg failed\n"));
1598 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
1599 		goto done;
1600 	}
1601 	ip6h = (ip6_t *)mp->b_rptr;
1602 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
1603 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
1604 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
1605 		goto done;
1606 	}
1607 	/*
1608 	 * NDP does not accept any extension headers between the
1609 	 * IP header and the ICMP header since e.g. a routing
1610 	 * header could be dangerous.
1611 	 * This assumes that any AH or ESP headers are removed
1612 	 * by ip prior to passing the packet to ndp_input.
1613 	 */
1614 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
1615 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
1616 		    ip6h->ip6_nxt));
1617 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1618 		goto done;
1619 	}
1620 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1621 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
1622 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
1623 	if (icmp_nd->icmp6_code != 0) {
1624 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
1625 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1626 		goto done;
1627 	}
1628 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1629 	/*
1630 	 * Make sure packet length is large enough for either
1631 	 * a NS or a NA icmp packet.
1632 	 */
1633 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
1634 		ip1dbg(("ndp_input: packet too short\n"));
1635 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
1636 		goto done;
1637 	}
1638 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
1639 		ndp_input_solicit(ill, mp);
1640 	} else {
1641 		ndp_input_advert(ill, mp);
1642 	}
1643 done:
1644 	freemsg(mp);
1645 }
1646 
1647 /*
1648  * nce_xmit is called to form and transmit a ND solicitation or
1649  * advertisement ICMP packet.
1650  * If source address is unspecified, appropriate source address
1651  * and link layer address will be chosen here. This function
1652  * *always* sends the link layer option.
1653  * It returns B_FALSE only if it does a successful put() to the
1654  * corresponding ill's ill_wq otherwise returns B_TRUE.
1655  */
1656 static boolean_t
1657 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
1658     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
1659     int flag)
1660 {
1661 	uint32_t	len;
1662 	icmp6_t 	*icmp6;
1663 	mblk_t		*mp;
1664 	ip6_t		*ip6h;
1665 	nd_opt_hdr_t	*opt;
1666 	uint_t		plen;
1667 	ip6i_t		*ip6i;
1668 	ipif_t		*src_ipif = NULL;
1669 
1670 	/*
1671 	 * If we have a unspecified source(sender) address, select a
1672 	 * proper source address for the solicitation here itself so
1673 	 * that we can initialize the h/w address correctly. This is
1674 	 * needed for interface groups as source address can come from
1675 	 * the whole group and the h/w address initialized from ill will
1676 	 * be wrong if the source address comes from a different ill.
1677 	 *
1678 	 * Note that the NA never comes here with the unspecified source
1679 	 * address. The following asserts that whenever the source
1680 	 * address is specified, the haddr also should be specified.
1681 	 */
1682 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
1683 
1684 	if (IN6_IS_ADDR_UNSPECIFIED(sender)) {
1685 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
1686 		/*
1687 		 * Pick a source address for this solicitation, but
1688 		 * restrict the selection to addresses assigned to the
1689 		 * output interface (or interface group).  We do this
1690 		 * because the destination will create a neighbor cache
1691 		 * entry for the source address of this packet, so the
1692 		 * source address had better be a valid neighbor.
1693 		 */
1694 		src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
1695 		    IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
1696 		if (src_ipif == NULL) {
1697 			char buf[INET6_ADDRSTRLEN];
1698 
1699 			ip0dbg(("nce_xmit: No source ipif for dst %s\n",
1700 			    inet_ntop(AF_INET6, (char *)target, buf,
1701 			    sizeof (buf))));
1702 			return (B_TRUE);
1703 		}
1704 		sender = &src_ipif->ipif_v6src_addr;
1705 		hwaddr_ill = src_ipif->ipif_ill;
1706 	}
1707 
1708 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7)/8;
1709 	/*
1710 	 * Always make sure that the NS/NA packets don't get load
1711 	 * spread. This is needed so that the probe packets sent
1712 	 * by the in.mpathd daemon can really go out on the desired
1713 	 * interface. Probe packets are made to go out on a desired
1714 	 * interface by including a ip6i with ATTACH_IF flag. As these
1715 	 * packets indirectly end up sending/receiving NS/NA packets
1716 	 * (neighbor doing NUD), we have to make sure that NA
1717 	 * also go out on the same interface.
1718 	 */
1719 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
1720 	    plen * 8;
1721 	mp = allocb(len,  BPRI_LO);
1722 	if (mp == NULL) {
1723 		if (src_ipif != NULL)
1724 			ipif_refrele(src_ipif);
1725 		return (B_TRUE);
1726 	}
1727 	bzero((char *)mp->b_rptr, len);
1728 	mp->b_wptr = mp->b_rptr + len;
1729 
1730 	ip6i = (ip6i_t *)mp->b_rptr;
1731 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
1732 	ip6i->ip6i_nxt = IPPROTO_RAW;
1733 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
1734 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
1735 
1736 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1737 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
1738 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
1739 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
1740 	ip6h->ip6_hops = IPV6_MAX_HOPS;
1741 	ip6h->ip6_dst = *target;
1742 	icmp6 = (icmp6_t *)&ip6h[1];
1743 
1744 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
1745 	    sizeof (nd_neighbor_advert_t));
1746 
1747 	if (operation == ND_NEIGHBOR_SOLICIT) {
1748 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1749 
1750 		opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
1751 		ip6h->ip6_src = *sender;
1752 		ns->nd_ns_target = *target;
1753 		if (!(flag & NDP_UNICAST)) {
1754 			/* Form multicast address of the target */
1755 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
1756 			ip6h->ip6_dst.s6_addr32[3] |=
1757 			    ns->nd_ns_target.s6_addr32[3];
1758 		}
1759 	} else {
1760 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
1761 
1762 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
1763 		ip6h->ip6_src = *sender;
1764 		na->nd_na_target = *sender;
1765 		if (flag & NDP_ISROUTER)
1766 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
1767 		if (flag & NDP_SOLICITED)
1768 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
1769 		if (flag & NDP_ORIDE)
1770 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
1771 
1772 	}
1773 	/* Fill in link layer address and option len */
1774 	opt->nd_opt_len = (uint8_t)plen;
1775 	mutex_enter(&hwaddr_ill->ill_lock);
1776 	bcopy(use_nd_lla ? hwaddr_ill->ill_nd_lla : hwaddr_ill->ill_phys_addr,
1777 	    &opt[1], hwaddr_ill->ill_nd_lla_len);
1778 	mutex_exit(&hwaddr_ill->ill_lock);
1779 	icmp6->icmp6_type = (uint8_t)operation;
1780 	icmp6->icmp6_code = 0;
1781 	/*
1782 	 * Prepare for checksum by putting icmp length in the icmp
1783 	 * checksum field. The checksum is calculated in ip_wput_v6.
1784 	 */
1785 	icmp6->icmp6_cksum = ip6h->ip6_plen;
1786 
1787 	if (src_ipif != NULL)
1788 		ipif_refrele(src_ipif);
1789 	if (canput(ill->ill_wq)) {
1790 		put(ill->ill_wq, mp);
1791 		return (B_FALSE);
1792 	}
1793 	freemsg(mp);
1794 	return (B_TRUE);
1795 }
1796 
1797 /*
1798  * Make a link layer address (does not include the SAP) from an nce.
1799  * To form the link layer address, use the last four bytes of ipv6
1800  * address passed in and the fixed offset stored in nce.
1801  */
1802 static void
1803 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
1804 {
1805 	uchar_t *mask, *to;
1806 	ill_t	*ill = nce->nce_ill;
1807 	int 	len;
1808 
1809 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
1810 		return;
1811 	ASSERT(nce->nce_res_mp != NULL);
1812 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1813 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
1814 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
1815 	ASSERT(addr != NULL);
1816 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1817 	    addrpos, ill->ill_nd_lla_len);
1818 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
1819 	    IPV6_ADDR_LEN);
1820 	mask = (uchar_t *)&nce->nce_extract_mask;
1821 	mask += (IPV6_ADDR_LEN - len);
1822 	addr += (IPV6_ADDR_LEN - len);
1823 	to = addrpos + nce->nce_ll_extract_start;
1824 	while (len-- > 0)
1825 		*to++ |= *mask++ & *addr++;
1826 }
1827 
1828 /*
1829  * Pass a cache report back out via NDD.
1830  */
1831 /* ARGSUSED */
1832 int
1833 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
1834 {
1835 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
1836 			"     proto addr/mask");
1837 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp);
1838 	return (0);
1839 }
1840 
1841 /*
1842  * convert a link level address of arbitrary length
1843  * to an ascii string.
1844  * The caller *must* have already verified that the string buffer
1845  * is large enough to hold the entire string, including trailing NULL.
1846  */
1847 static void
1848 lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf)
1849 {
1850 	uchar_t	addrbyte[8];	/* needs to hold ascii for a byte plus a NULL */
1851 	int	i;
1852 	size_t	len;
1853 
1854 	buf[0] = '\0';
1855 	for (i = 0; i < addrlen; i++) {
1856 		addrbyte[0] = '\0';
1857 		(void) sprintf((char *)addrbyte, "%02x:", (lla[i] & 0xff));
1858 		len = strlen((const char *)addrbyte);
1859 		bcopy(addrbyte, buf, len);
1860 		buf = buf + len;
1861 	}
1862 	*--buf = '\0';
1863 }
1864 
1865 /*
1866  * Add a single line to the NDP Cache Entry Report.
1867  */
1868 static void
1869 nce_report1(nce_t *nce, uchar_t *mp_arg)
1870 {
1871 	ill_t		*ill = nce->nce_ill;
1872 	char		local_buf[INET6_ADDRSTRLEN];
1873 	uchar_t		flags_buf[10];
1874 	uint32_t	flags = nce->nce_flags;
1875 	mblk_t		*mp = (mblk_t *)mp_arg;
1876 	uchar_t		*h;
1877 	uchar_t		*m = flags_buf;
1878 	in6_addr_t	v6addr;
1879 
1880 	/*
1881 	 * Lock the nce to protect nce_res_mp from being changed
1882 	 * if an external resolver address resolution completes
1883 	 * while nce_res_mp is being accessed here.
1884 	 *
1885 	 * Deal with all address formats, not just Ethernet-specific
1886 	 * In addition, make sure that the mblk has enough space
1887 	 * before writing to it. If is doesn't, allocate a new one.
1888 	 */
1889 	ASSERT(ill != NULL);
1890 	v6addr = nce->nce_mask;
1891 	if (flags & NCE_F_PERMANENT)
1892 		*m++ = 'P';
1893 	if (flags & NCE_F_ISROUTER)
1894 		*m++ = 'R';
1895 	if (flags & NCE_F_MAPPING)
1896 		*m++ = 'M';
1897 	*m = '\0';
1898 
1899 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1900 		size_t		addrlen;
1901 		uchar_t		*addr_buf;
1902 		dl_unitdata_req_t	*dl;
1903 
1904 		mutex_enter(&nce->nce_lock);
1905 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
1906 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1907 		if (ill->ill_flags & ILLF_XRESOLV)
1908 			addrlen = (3 * (dl->dl_dest_addr_length));
1909 		else
1910 			addrlen = (3 * (ill->ill_nd_lla_len));
1911 		if (addrlen <= 0) {
1912 			mutex_exit(&nce->nce_lock);
1913 			(void) mi_mpprintf(mp,
1914 			    "%8s %9s %5s %s/%d",
1915 			    ill->ill_name,
1916 			    "None",
1917 			    (uchar_t *)&flags_buf,
1918 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1919 				(char *)local_buf, sizeof (local_buf)),
1920 				ip_mask_to_plen_v6(&v6addr));
1921 		} else {
1922 			/*
1923 			 * Convert the hardware/lla address to ascii
1924 			 */
1925 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
1926 			if (addr_buf == NULL) {
1927 				mutex_exit(&nce->nce_lock);
1928 				return;
1929 			}
1930 			if (ill->ill_flags & ILLF_XRESOLV)
1931 				lla2ascii((uint8_t *)h, dl->dl_dest_addr_length,
1932 				    addr_buf);
1933 			else
1934 				lla2ascii((uint8_t *)h, ill->ill_nd_lla_len,
1935 				    addr_buf);
1936 			mutex_exit(&nce->nce_lock);
1937 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
1938 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
1939 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1940 				(char *)local_buf, sizeof (local_buf)),
1941 				ip_mask_to_plen_v6(&v6addr));
1942 			kmem_free(addr_buf, addrlen);
1943 		}
1944 	} else {
1945 		(void) mi_mpprintf(mp,
1946 		    "%8s %9s %5s %s/%d",
1947 		    ill->ill_name,
1948 		    "None",
1949 		    (uchar_t *)&flags_buf,
1950 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
1951 			(char *)local_buf, sizeof (local_buf)),
1952 			ip_mask_to_plen_v6(&v6addr));
1953 	}
1954 }
1955 
1956 mblk_t *
1957 nce_udreq_alloc(ill_t *ill)
1958 {
1959 	mblk_t	*template_mp = NULL;
1960 	dl_unitdata_req_t *dlur;
1961 	int	sap_length;
1962 
1963 	sap_length = ill->ill_sap_length;
1964 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
1965 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
1966 	if (template_mp == NULL)
1967 		return (NULL);
1968 
1969 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
1970 	dlur->dl_priority.dl_min = 0;
1971 	dlur->dl_priority.dl_max = 0;
1972 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
1973 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
1974 
1975 	/* Copy in the SAP value. */
1976 	NCE_LL_SAP_COPY(ill, template_mp);
1977 
1978 	return (template_mp);
1979 }
1980 
1981 /*
1982  * NDP retransmit timer.
1983  * This timer goes off when:
1984  * a. It is time to retransmit NS for resolver.
1985  * b. It is time to send reachability probes.
1986  */
1987 void
1988 ndp_timer(void *arg)
1989 {
1990 	nce_t		*nce = arg;
1991 	ill_t		*ill = nce->nce_ill;
1992 	uint32_t	ms;
1993 	char		addrbuf[INET6_ADDRSTRLEN];
1994 	mblk_t		*mp;
1995 	boolean_t	dropped = B_FALSE;
1996 
1997 	/*
1998 	 * The timer has to be cancelled by ndp_delete before doing the final
1999 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2000 	 * until it clears the timeout_id. Before clearing the timeout_id
2001 	 * bump up the refcnt so that we can continue to use the nce
2002 	 */
2003 	ASSERT(nce != NULL);
2004 
2005 	/*
2006 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2007 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2008 	 */
2009 	rw_enter(&ill_g_lock, RW_READER);
2010 	mutex_enter(&nce->nce_lock);
2011 	NCE_REFHOLD_LOCKED(nce);
2012 	nce->nce_timeout_id = 0;
2013 
2014 	/*
2015 	 * Check the reachability state first.
2016 	 */
2017 	switch (nce->nce_state) {
2018 	case ND_DELAY:
2019 		rw_exit(&ill_g_lock);
2020 		nce->nce_state = ND_PROBE;
2021 		mutex_exit(&nce->nce_lock);
2022 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2023 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2024 		if (ip_debug > 3) {
2025 			/* ip2dbg */
2026 			pr_addr_dbg("ndp_timer: state for %s changed "
2027 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2028 		}
2029 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2030 		NCE_REFRELE(nce);
2031 		return;
2032 	case ND_PROBE:
2033 		/* must be retransmit timer */
2034 		rw_exit(&ill_g_lock);
2035 		nce->nce_pcnt--;
2036 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2037 		    nce->nce_pcnt >= -1);
2038 		if (nce->nce_pcnt == 0) {
2039 			/* Wait RetransTimer, before deleting the entry */
2040 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2041 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2042 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2043 			mutex_exit(&nce->nce_lock);
2044 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2045 		} else {
2046 			/*
2047 			 * As per RFC2461, the nce gets deleted after
2048 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2049 			 * Note that the first unicast solicitation is sent
2050 			 * during the DELAY state.
2051 			 */
2052 			if (nce->nce_pcnt > 0) {
2053 				ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2054 				    nce->nce_pcnt, inet_ntop(AF_INET6,
2055 				    &nce->nce_addr,
2056 				    addrbuf, sizeof (addrbuf))));
2057 				mutex_exit(&nce->nce_lock);
2058 				dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT,
2059 				    NULL, B_FALSE, &ipv6_all_zeros,
2060 				    &nce->nce_addr, NDP_UNICAST);
2061 				if (dropped) {
2062 					mutex_enter(&nce->nce_lock);
2063 					nce->nce_pcnt++;
2064 					mutex_exit(&nce->nce_lock);
2065 				}
2066 				NDP_RESTART_TIMER(nce,
2067 				    ill->ill_reachable_retrans_time);
2068 			} else {
2069 				/* No hope, delete the nce */
2070 				nce->nce_state = ND_UNREACHABLE;
2071 				mutex_exit(&nce->nce_lock);
2072 				if (ip_debug > 2) {
2073 					/* ip1dbg */
2074 					pr_addr_dbg("ndp_timer: Delete IRE for"
2075 					    " dst %s\n", AF_INET6,
2076 					    &nce->nce_addr);
2077 				}
2078 				ndp_delete(nce);
2079 			}
2080 		}
2081 		NCE_REFRELE(nce);
2082 		return;
2083 	case ND_INCOMPLETE:
2084 		/*
2085 		 * Must be resolvers retransmit timer.
2086 		 */
2087 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2088 			ip6i_t	*ip6i;
2089 			ip6_t	*ip6h;
2090 			mblk_t *data_mp;
2091 
2092 			/*
2093 			 * Walk the list of packets queued, and see if there
2094 			 * are any multipathing probe packets. Such packets
2095 			 * are always queued at the head. Since this is a
2096 			 * retransmit timer firing, mark such packets as
2097 			 * delayed in ND resolution. This info will be used
2098 			 * in ip_wput_v6(). Multipathing probe packets will
2099 			 * always have an ip6i_t. Once we hit a packet without
2100 			 * it, we can break out of this loop.
2101 			 */
2102 			if (mp->b_datap->db_type == M_CTL)
2103 				data_mp = mp->b_cont;
2104 			else
2105 				data_mp = mp;
2106 
2107 			ip6h = (ip6_t *)data_mp->b_rptr;
2108 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2109 				break;
2110 
2111 			/*
2112 			 * This message should have been pulled up already in
2113 			 * ip_wput_v6. We can't do pullups here because the
2114 			 * b_next/b_prev is non-NULL.
2115 			 */
2116 			ip6i = (ip6i_t *)ip6h;
2117 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2118 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2119 
2120 			/* Mark this packet as delayed due to ND resolution */
2121 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2122 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2123 		}
2124 		if (nce->nce_qd_mp != NULL) {
2125 			ms = nce_solicit(nce, NULL);
2126 			rw_exit(&ill_g_lock);
2127 			if (ms == 0) {
2128 				if (nce->nce_state != ND_REACHABLE) {
2129 					mutex_exit(&nce->nce_lock);
2130 					nce_resolv_failed(nce);
2131 					ndp_delete(nce);
2132 				} else {
2133 					mutex_exit(&nce->nce_lock);
2134 				}
2135 			} else {
2136 				mutex_exit(&nce->nce_lock);
2137 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2138 			}
2139 			NCE_REFRELE(nce);
2140 			return;
2141 		}
2142 		mutex_exit(&nce->nce_lock);
2143 		rw_exit(&ill_g_lock);
2144 		NCE_REFRELE(nce);
2145 		break;
2146 	case ND_REACHABLE :
2147 		rw_exit(&ill_g_lock);
2148 		if (nce->nce_flags & NCE_F_UNSOL_ADV &&
2149 		    nce->nce_unsolicit_count != 0) {
2150 			nce->nce_unsolicit_count--;
2151 			mutex_exit(&nce->nce_lock);
2152 			dropped = nce_xmit(ill,
2153 			    ND_NEIGHBOR_ADVERT,
2154 			    ill,	/* ill to be used for hw addr */
2155 			    B_FALSE,	/* use ill_phys_addr */
2156 			    &nce->nce_addr,
2157 			    &ipv6_all_hosts_mcast,
2158 			    nce->nce_flags | NDP_ORIDE);
2159 			if (dropped) {
2160 				mutex_enter(&nce->nce_lock);
2161 				nce->nce_unsolicit_count++;
2162 				mutex_exit(&nce->nce_lock);
2163 			}
2164 			if (nce->nce_unsolicit_count != 0) {
2165 				NDP_RESTART_TIMER(nce,
2166 				    ip_ndp_unsolicit_interval);
2167 			}
2168 		} else {
2169 			mutex_exit(&nce->nce_lock);
2170 		}
2171 		NCE_REFRELE(nce);
2172 		break;
2173 	default:
2174 		rw_exit(&ill_g_lock);
2175 		mutex_exit(&nce->nce_lock);
2176 		NCE_REFRELE(nce);
2177 		break;
2178 	}
2179 }
2180 
2181 /*
2182  * Set a link layer address from the ll_addr passed in.
2183  * Copy SAP from ill.
2184  */
2185 static void
2186 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2187 {
2188 	ill_t	*ill = nce->nce_ill;
2189 	uchar_t	*woffset;
2190 
2191 	ASSERT(ll_addr != NULL);
2192 	/* Always called before fast_path_probe */
2193 	ASSERT(nce->nce_fp_mp == NULL);
2194 	if (ill->ill_sap_length != 0) {
2195 		/*
2196 		 * Copy the SAP type specified in the
2197 		 * request into the xmit template.
2198 		 */
2199 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2200 	}
2201 	if (ill->ill_phys_addr_length > 0) {
2202 		/*
2203 		 * The bcopy() below used to be called for the physical address
2204 		 * length rather than the link layer address length. For
2205 		 * ethernet and many other media, the phys_addr and lla are
2206 		 * identical.
2207 		 * However, with xresolv interfaces being introduced, the
2208 		 * phys_addr and lla are no longer the same, and the physical
2209 		 * address may not have any useful meaning, so we use the lla
2210 		 * for IPv6 address resolution and destination addressing.
2211 		 *
2212 		 * For PPP or other interfaces with a zero length
2213 		 * physical address, don't do anything here.
2214 		 * The bcopy() with a zero phys_addr length was previously
2215 		 * a no-op for interfaces with a zero-length physical address.
2216 		 * Using the lla for them would change the way they operate.
2217 		 * Doing nothing in such cases preserves expected behavior.
2218 		 */
2219 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2220 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2221 	}
2222 }
2223 
2224 static boolean_t
2225 nce_cmp_ll_addr(nce_t *nce, char *ll_addr, uint32_t ll_addr_len)
2226 {
2227 	ill_t	*ill = nce->nce_ill;
2228 	uchar_t	*ll_offset;
2229 
2230 	ASSERT(nce->nce_res_mp != NULL);
2231 	if (ll_addr == NULL)
2232 		return (B_FALSE);
2233 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2234 	if (bcmp(ll_addr, (char *)ll_offset, ll_addr_len) != 0)
2235 		return (B_TRUE);
2236 	return (B_FALSE);
2237 }
2238 
2239 /*
2240  * Updates the link layer address or the reachability state of
2241  * a cache entry.  Reset probe counter if needed.
2242  */
2243 static void
2244 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2245 {
2246 	ill_t	*ill = nce->nce_ill;
2247 	boolean_t need_stop_timer = B_FALSE;
2248 	boolean_t need_fastpath_update = B_FALSE;
2249 
2250 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2251 	/*
2252 	 * If this interface does not do NUD, there is no point
2253 	 * in allowing an update to the cache entry.  Although
2254 	 * we will respond to NS.
2255 	 * The only time we accept an update for a resolver when
2256 	 * NUD is turned off is when it has just been created.
2257 	 * Non-Resolvers will always be created as REACHABLE.
2258 	 */
2259 	if (new_state != ND_UNCHANGED) {
2260 		if ((nce->nce_flags & NCE_F_NONUD) &&
2261 		    (nce->nce_state != ND_INCOMPLETE))
2262 			return;
2263 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2264 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2265 		need_stop_timer = B_TRUE;
2266 		if (new_state == ND_REACHABLE)
2267 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2268 		else {
2269 			/* We force NUD in this case */
2270 			nce->nce_last = 0;
2271 		}
2272 		nce->nce_state = new_state;
2273 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2274 	}
2275 	/*
2276 	 * In case of fast path we need to free the the fastpath
2277 	 * M_DATA and do another probe.  Otherwise we can just
2278 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2279 	 * whatever packets that happens to be transmitting at the time.
2280 	 */
2281 	if (new_ll_addr != NULL) {
2282 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2283 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2284 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2285 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2286 		if (nce->nce_fp_mp != NULL) {
2287 			freemsg(nce->nce_fp_mp);
2288 			nce->nce_fp_mp = NULL;
2289 		}
2290 		need_fastpath_update = B_TRUE;
2291 	}
2292 	mutex_exit(&nce->nce_lock);
2293 	if (need_stop_timer) {
2294 		(void) untimeout(nce->nce_timeout_id);
2295 		nce->nce_timeout_id = 0;
2296 	}
2297 	if (need_fastpath_update)
2298 		nce_fastpath(nce);
2299 	mutex_enter(&nce->nce_lock);
2300 }
2301 
2302 static void
2303 nce_queue_mp(nce_t *nce, mblk_t *mp)
2304 {
2305 	uint_t	count = 0;
2306 	mblk_t  **mpp;
2307 	boolean_t head_insert = B_FALSE;
2308 	ip6_t	*ip6h;
2309 	ip6i_t	*ip6i;
2310 	mblk_t *data_mp;
2311 
2312 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2313 
2314 	if (mp->b_datap->db_type == M_CTL)
2315 		data_mp = mp->b_cont;
2316 	else
2317 		data_mp = mp;
2318 	ip6h = (ip6_t *)data_mp->b_rptr;
2319 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2320 		/*
2321 		 * This message should have been pulled up already in
2322 		 * ip_wput_v6. We can't do pullups here because the message
2323 		 * could be from the nce_qd_mp which could have b_next/b_prev
2324 		 * non-NULL.
2325 		 */
2326 		ip6i = (ip6i_t *)ip6h;
2327 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2328 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2329 		/*
2330 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2331 		 * This has 2 aspects mentioned below.
2332 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2333 		 * This ensures that next retransmit of ND solicitation
2334 		 * will use the interface specified by the probe packet,
2335 		 * for both NS and NA. This corresponds to the src address
2336 		 * in the IPv6 packet. If we insert at tail, we will be
2337 		 * depending on the packet at the head for successful
2338 		 * ND resolution. This is not reliable, because the interface
2339 		 * on which the NA arrives could be different from the interface
2340 		 * on which the NS was sent, and if the receiving interface is
2341 		 * failed, it will appear that the sending interface is also
2342 		 * failed, causing in.mpathd to misdiagnose this as link
2343 		 * failure.
2344 		 * 2. Drop the original packet, if the ND resolution did not
2345 		 * succeed in the first attempt. However we will create the
2346 		 * nce and the ire, as soon as the ND resolution succeeds.
2347 		 * We don't gain anything by queueing multiple probe packets
2348 		 * and sending them back-to-back once resolution succeeds.
2349 		 * It is sufficient to send just 1 packet after ND resolution
2350 		 * succeeds. Since mpathd is sending down probe packets at a
2351 		 * constant rate, we don't need to send the queued packet. We
2352 		 * need to queue it only for NDP resolution. The benefit of
2353 		 * dropping the probe packets that were delayed in ND
2354 		 * resolution, is that in.mpathd will not see inflated
2355 		 * RTT. If the ND resolution does not succeed within
2356 		 * in.mpathd's failure detection time, mpathd may detect
2357 		 * a failure, and it does not matter whether the packet
2358 		 * was queued or dropped.
2359 		 */
2360 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2361 			head_insert = B_TRUE;
2362 	}
2363 
2364 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2365 	    mpp = &(*mpp)->b_next) {
2366 		if (++count >
2367 		    nce->nce_ill->ill_max_buf) {
2368 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2369 
2370 			nce->nce_qd_mp->b_next = NULL;
2371 			nce->nce_qd_mp->b_prev = NULL;
2372 			freemsg(nce->nce_qd_mp);
2373 			ip1dbg(("nce_queue_mp: pkt dropped\n"));
2374 			nce->nce_qd_mp = tmp;
2375 		}
2376 	}
2377 	/* put this on the list */
2378 	if (head_insert) {
2379 		mp->b_next = nce->nce_qd_mp;
2380 		nce->nce_qd_mp = mp;
2381 	} else {
2382 		*mpp = mp;
2383 	}
2384 }
2385 
2386 /*
2387  * Called when address resolution failed due to a timeout.
2388  * Send an ICMP unreachable in response to all queued packets.
2389  */
2390 void
2391 nce_resolv_failed(nce_t *nce)
2392 {
2393 	mblk_t	*mp, *nxt_mp, *first_mp;
2394 	char	buf[INET6_ADDRSTRLEN];
2395 	ip6_t *ip6h;
2396 	zoneid_t zoneid = GLOBAL_ZONEID;
2397 
2398 	ip1dbg(("nce_resolv_failed: dst %s\n",
2399 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
2400 	mutex_enter(&nce->nce_lock);
2401 	mp = nce->nce_qd_mp;
2402 	nce->nce_qd_mp = NULL;
2403 	mutex_exit(&nce->nce_lock);
2404 	while (mp != NULL) {
2405 		nxt_mp = mp->b_next;
2406 		mp->b_next = NULL;
2407 		mp->b_prev = NULL;
2408 
2409 		first_mp = mp;
2410 		if (mp->b_datap->db_type == M_CTL) {
2411 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
2412 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
2413 			zoneid = io->ipsec_out_zoneid;
2414 			ASSERT(zoneid != ALL_ZONES);
2415 			mp = mp->b_cont;
2416 		}
2417 
2418 		ip6h = (ip6_t *)mp->b_rptr;
2419 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
2420 			ip6i_t *ip6i;
2421 			/*
2422 			 * This message should have been pulled up already
2423 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
2424 			 * the header is pulled up.
2425 			 */
2426 			ip6i = (ip6i_t *)ip6h;
2427 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
2428 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2429 			mp->b_rptr += sizeof (ip6i_t);
2430 		}
2431 		/*
2432 		 * Ignore failure since icmp_unreachable_v6 will silently
2433 		 * drop packets with an unspecified source address.
2434 		 */
2435 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
2436 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
2437 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE);
2438 		mp = nxt_mp;
2439 	}
2440 }
2441 
2442 /*
2443  * Called by SIOCSNDP* ioctl to add/change an nce entry
2444  * and the corresponding attributes.
2445  * Disallow states other than ND_REACHABLE or ND_STALE.
2446  */
2447 int
2448 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2449 {
2450 	sin6_t		*sin6;
2451 	in6_addr_t	*addr;
2452 	nce_t		*nce;
2453 	int		err;
2454 	uint16_t	new_flags = 0;
2455 	uint16_t	old_flags = 0;
2456 	int		inflags = lnr->lnr_flags;
2457 
2458 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2459 	    (lnr->lnr_state_create != ND_STALE))
2460 		return (EINVAL);
2461 
2462 	sin6 = (sin6_t *)&lnr->lnr_addr;
2463 	addr = &sin6->sin6_addr;
2464 
2465 	mutex_enter(&ndp_g_lock);
2466 	/* We know it can not be mapping so just look in the hash table */
2467 	nce = nce_lookup_addr(ill, addr);
2468 	if (nce != NULL)
2469 		new_flags = nce->nce_flags;
2470 
2471 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2472 	case NDF_ISROUTER_ON:
2473 		new_flags |= NCE_F_ISROUTER;
2474 		break;
2475 	case NDF_ISROUTER_OFF:
2476 		new_flags &= ~NCE_F_ISROUTER;
2477 		break;
2478 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2479 		mutex_exit(&ndp_g_lock);
2480 		if (nce != NULL)
2481 			NCE_REFRELE(nce);
2482 		return (EINVAL);
2483 	}
2484 
2485 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2486 	case NDF_ANYCAST_ON:
2487 		new_flags |= NCE_F_ANYCAST;
2488 		break;
2489 	case NDF_ANYCAST_OFF:
2490 		new_flags &= ~NCE_F_ANYCAST;
2491 		break;
2492 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2493 		mutex_exit(&ndp_g_lock);
2494 		if (nce != NULL)
2495 			NCE_REFRELE(nce);
2496 		return (EINVAL);
2497 	}
2498 
2499 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
2500 	case NDF_PROXY_ON:
2501 		new_flags |= NCE_F_PROXY;
2502 		break;
2503 	case NDF_PROXY_OFF:
2504 		new_flags &= ~NCE_F_PROXY;
2505 		break;
2506 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
2507 		mutex_exit(&ndp_g_lock);
2508 		if (nce != NULL)
2509 			NCE_REFRELE(nce);
2510 		return (EINVAL);
2511 	}
2512 
2513 	if (nce == NULL) {
2514 		err = ndp_add(ill,
2515 		    (uchar_t *)lnr->lnr_hdw_addr,
2516 		    addr,
2517 		    &ipv6_all_ones,
2518 		    &ipv6_all_zeros,
2519 		    0,
2520 		    new_flags,
2521 		    lnr->lnr_state_create,
2522 		    &nce);
2523 		if (err != 0) {
2524 			mutex_exit(&ndp_g_lock);
2525 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
2526 			return (err);
2527 		}
2528 	}
2529 	old_flags = nce->nce_flags;
2530 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
2531 		/*
2532 		 * Router turned to host, delete all ires.
2533 		 * XXX Just delete the entry, but we need to add too.
2534 		 */
2535 		nce->nce_flags &= ~NCE_F_ISROUTER;
2536 		mutex_exit(&ndp_g_lock);
2537 		ndp_delete(nce);
2538 		NCE_REFRELE(nce);
2539 		return (0);
2540 	}
2541 	mutex_exit(&ndp_g_lock);
2542 
2543 	mutex_enter(&nce->nce_lock);
2544 	nce->nce_flags = new_flags;
2545 	mutex_exit(&nce->nce_lock);
2546 	/*
2547 	 * Note that we ignore the state at this point, which
2548 	 * should be either STALE or REACHABLE.  Instead we let
2549 	 * the link layer address passed in to determine the state
2550 	 * much like incoming packets.
2551 	 */
2552 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
2553 	NCE_REFRELE(nce);
2554 	return (0);
2555 }
2556 
2557 /*
2558  * If the device driver supports it, we make nce_fp_mp to have
2559  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
2560  * The caller insures there is hold on nce for this function.
2561  * Note that since ill_fastpath_probe() copies the mblk there is
2562  * no need for the hold beyond this function.
2563  */
2564 static void
2565 nce_fastpath(nce_t *nce)
2566 {
2567 	ill_t	*ill = nce->nce_ill;
2568 	int res;
2569 
2570 	ASSERT(ill != NULL);
2571 	if (nce->nce_fp_mp != NULL) {
2572 		/* Already contains fastpath info */
2573 		return;
2574 	}
2575 	if (nce->nce_res_mp != NULL) {
2576 		nce_fastpath_list_add(nce);
2577 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
2578 		/*
2579 		 * EAGAIN is an indication of a transient error
2580 		 * i.e. allocation failure etc. leave the nce in the list it
2581 		 * will be updated when another probe happens for another ire
2582 		 * if not it will be taken out of the list when the ire is
2583 		 * deleted.
2584 		 */
2585 
2586 		if (res != 0 && res != EAGAIN)
2587 			nce_fastpath_list_delete(nce);
2588 	}
2589 }
2590 
2591 /*
2592  * Drain the list of nce's waiting for fastpath response.
2593  */
2594 void
2595 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
2596     void *arg)
2597 {
2598 
2599 	nce_t *next_nce;
2600 	nce_t *current_nce;
2601 	nce_t *first_nce;
2602 	nce_t *prev_nce = NULL;
2603 
2604 	ASSERT(ill != NULL);
2605 
2606 	mutex_enter(&ill->ill_lock);
2607 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
2608 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
2609 		next_nce = current_nce->nce_fastpath;
2610 		/*
2611 		 * Take it off the list if we're flushing, or if the callback
2612 		 * routine tells us to do so.  Otherwise, leave the nce in the
2613 		 * fastpath list to handle any pending response from the lower
2614 		 * layer.  We can't drain the list when the callback routine
2615 		 * comparison failed, because the response is asynchronous in
2616 		 * nature, and may not arrive in the same order as the list
2617 		 * insertion.
2618 		 */
2619 		if (func == NULL || func(current_nce, arg)) {
2620 			current_nce->nce_fastpath = NULL;
2621 			if (current_nce == first_nce)
2622 				ill->ill_fastpath_list = first_nce = next_nce;
2623 			else
2624 				prev_nce->nce_fastpath = next_nce;
2625 		} else {
2626 			/* previous element that is still in the list */
2627 			prev_nce = current_nce;
2628 		}
2629 		current_nce = next_nce;
2630 	}
2631 	mutex_exit(&ill->ill_lock);
2632 }
2633 
2634 /*
2635  * Add nce to the nce fastpath list.
2636  */
2637 void
2638 nce_fastpath_list_add(nce_t *nce)
2639 {
2640 	ill_t *ill;
2641 
2642 	ill = nce->nce_ill;
2643 	ASSERT(ill != NULL);
2644 
2645 	mutex_enter(&ill->ill_lock);
2646 	mutex_enter(&nce->nce_lock);
2647 
2648 	/*
2649 	 * if nce has not been deleted and
2650 	 * is not already in the list add it.
2651 	 */
2652 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
2653 	    (nce->nce_fastpath == NULL)) {
2654 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
2655 		ill->ill_fastpath_list = nce;
2656 	}
2657 
2658 	mutex_exit(&nce->nce_lock);
2659 	mutex_exit(&ill->ill_lock);
2660 }
2661 
2662 /*
2663  * remove nce from the nce fastpath list.
2664  */
2665 void
2666 nce_fastpath_list_delete(nce_t *nce)
2667 {
2668 	nce_t *nce_ptr;
2669 
2670 	ill_t *ill;
2671 
2672 	ill = nce->nce_ill;
2673 	ASSERT(ill != NULL);
2674 
2675 	mutex_enter(&ill->ill_lock);
2676 	if (nce->nce_fastpath == NULL)
2677 		goto done;
2678 
2679 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
2680 
2681 	if (ill->ill_fastpath_list == nce) {
2682 		ill->ill_fastpath_list = nce->nce_fastpath;
2683 	} else {
2684 		nce_ptr = ill->ill_fastpath_list;
2685 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
2686 			if (nce_ptr->nce_fastpath == nce) {
2687 				nce_ptr->nce_fastpath = nce->nce_fastpath;
2688 				break;
2689 			}
2690 			nce_ptr = nce_ptr->nce_fastpath;
2691 		}
2692 	}
2693 
2694 	nce->nce_fastpath = NULL;
2695 done:
2696 	mutex_exit(&ill->ill_lock);
2697 }
2698 
2699 /*
2700  * Update all NCE's that are not in fastpath mode and
2701  * have an nce_fp_mp that matches mp. mp->b_cont contains
2702  * the fastpath header.
2703  *
2704  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
2705  */
2706 boolean_t
2707 ndp_fastpath_update(nce_t *nce, void *arg)
2708 {
2709 	mblk_t 	*mp, *fp_mp;
2710 	uchar_t	*mp_rptr, *ud_mp_rptr;
2711 	mblk_t	*ud_mp = nce->nce_res_mp;
2712 	ptrdiff_t	cmplen;
2713 
2714 	if (nce->nce_flags & NCE_F_MAPPING)
2715 		return (B_TRUE);
2716 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
2717 		return (B_TRUE);
2718 
2719 	ip2dbg(("ndp_fastpath_update: trying\n"));
2720 	mp = (mblk_t *)arg;
2721 	mp_rptr = mp->b_rptr;
2722 	cmplen = mp->b_wptr - mp_rptr;
2723 	ASSERT(cmplen >= 0);
2724 	ud_mp_rptr = ud_mp->b_rptr;
2725 	/*
2726 	 * The nce is locked here to prevent any other threads
2727 	 * from accessing and changing nce_res_mp when the IPv6 address
2728 	 * becomes resolved to an lla while we're in the middle
2729 	 * of looking at and comparing the hardware address (lla).
2730 	 * It is also locked to prevent multiple threads in nce_fastpath_update
2731 	 * from examining nce_res_mp atthe same time.
2732 	 */
2733 	mutex_enter(&nce->nce_lock);
2734 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
2735 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
2736 		mutex_exit(&nce->nce_lock);
2737 		/*
2738 		 * Don't take the ire off the fastpath list yet,
2739 		 * since the response may come later.
2740 		 */
2741 		return (B_FALSE);
2742 	}
2743 	/* Matched - install mp as the fastpath mp */
2744 	ip1dbg(("ndp_fastpath_update: match\n"));
2745 	fp_mp = dupb(mp->b_cont);
2746 	if (fp_mp != NULL) {
2747 		nce->nce_fp_mp = fp_mp;
2748 	}
2749 	mutex_exit(&nce->nce_lock);
2750 	return (B_TRUE);
2751 }
2752 
2753 /*
2754  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
2755  * driver.  Note that it assumes IP is exclusive...
2756  */
2757 /* ARGSUSED */
2758 void
2759 ndp_fastpath_flush(nce_t *nce, char *arg)
2760 {
2761 	if (nce->nce_flags & NCE_F_MAPPING)
2762 		return;
2763 	/* No fastpath info? */
2764 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
2765 		return;
2766 
2767 	/* Just delete the NCE... */
2768 	ndp_delete(nce);
2769 }
2770 
2771 /*
2772  * Return a pointer to a given option in the packet.
2773  * Assumes that option part of the packet have already been validated.
2774  */
2775 nd_opt_hdr_t *
2776 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
2777 {
2778 	while (optlen > 0) {
2779 		if (opt->nd_opt_type == opt_type)
2780 			return (opt);
2781 		optlen -= 8 * opt->nd_opt_len;
2782 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
2783 	}
2784 	return (NULL);
2785 }
2786 
2787 /*
2788  * Verify all option lengths present are > 0, also check to see
2789  * if the option lengths and packet length are consistent.
2790  */
2791 boolean_t
2792 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
2793 {
2794 	ASSERT(opt != NULL);
2795 	while (optlen > 0) {
2796 		if (opt->nd_opt_len == 0)
2797 			return (B_FALSE);
2798 		optlen -= 8 * opt->nd_opt_len;
2799 		if (optlen < 0)
2800 			return (B_FALSE);
2801 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
2802 	}
2803 	return (B_TRUE);
2804 }
2805 
2806 /*
2807  * ndp_walk function.
2808  * Free a fraction of the NCE cache entries.
2809  * A fraction of zero means to not free any in that category.
2810  */
2811 void
2812 ndp_cache_reclaim(nce_t *nce, char *arg)
2813 {
2814 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
2815 	uint_t	rand;
2816 
2817 	if (nce->nce_flags & NCE_F_PERMANENT)
2818 		return;
2819 
2820 	rand = (uint_t)lbolt +
2821 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
2822 	if (ncr->ncr_host != 0 &&
2823 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
2824 		ndp_delete(nce);
2825 		return;
2826 	}
2827 }
2828 
2829 /*
2830  * ndp_walk function.
2831  * Count the number of NCEs that can be deleted.
2832  * These would be hosts but not routers.
2833  */
2834 void
2835 ndp_cache_count(nce_t *nce, char *arg)
2836 {
2837 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
2838 
2839 	if (nce->nce_flags & NCE_F_PERMANENT)
2840 		return;
2841 
2842 	ncc->ncc_total++;
2843 	if (!(nce->nce_flags & NCE_F_ISROUTER))
2844 		ncc->ncc_host++;
2845 }
2846 
2847 #ifdef NCE_DEBUG
2848 th_trace_t *
2849 th_trace_nce_lookup(nce_t *nce)
2850 {
2851 	int bucket_id;
2852 	th_trace_t *th_trace;
2853 
2854 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2855 
2856 	bucket_id = IP_TR_HASH(curthread);
2857 	ASSERT(bucket_id < IP_TR_HASH_MAX);
2858 
2859 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
2860 	    th_trace = th_trace->th_next) {
2861 		if (th_trace->th_id == curthread)
2862 			return (th_trace);
2863 	}
2864 	return (NULL);
2865 }
2866 
2867 void
2868 nce_trace_ref(nce_t *nce)
2869 {
2870 	int bucket_id;
2871 	th_trace_t *th_trace;
2872 
2873 	/*
2874 	 * Attempt to locate the trace buffer for the curthread.
2875 	 * If it does not exist, then allocate a new trace buffer
2876 	 * and link it in list of trace bufs for this ipif, at the head
2877 	 */
2878 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2879 
2880 	if (nce->nce_trace_disable == B_TRUE)
2881 		return;
2882 
2883 	th_trace = th_trace_nce_lookup(nce);
2884 	if (th_trace == NULL) {
2885 		bucket_id = IP_TR_HASH(curthread);
2886 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
2887 		    KM_NOSLEEP);
2888 		if (th_trace == NULL) {
2889 			nce->nce_trace_disable = B_TRUE;
2890 			nce_trace_inactive(nce);
2891 			return;
2892 		}
2893 		th_trace->th_id = curthread;
2894 		th_trace->th_next = nce->nce_trace[bucket_id];
2895 		th_trace->th_prev = &nce->nce_trace[bucket_id];
2896 		if (th_trace->th_next != NULL)
2897 			th_trace->th_next->th_prev = &th_trace->th_next;
2898 		nce->nce_trace[bucket_id] = th_trace;
2899 	}
2900 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
2901 	th_trace->th_refcnt++;
2902 	th_trace_rrecord(th_trace);
2903 }
2904 
2905 void
2906 nce_untrace_ref(nce_t *nce)
2907 {
2908 	th_trace_t *th_trace;
2909 
2910 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2911 
2912 	if (nce->nce_trace_disable == B_TRUE)
2913 		return;
2914 
2915 	th_trace = th_trace_nce_lookup(nce);
2916 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
2917 
2918 	th_trace_rrecord(th_trace);
2919 	th_trace->th_refcnt--;
2920 }
2921 
2922 void
2923 nce_trace_inactive(nce_t *nce)
2924 {
2925 	th_trace_t *th_trace;
2926 	int i;
2927 
2928 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2929 
2930 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
2931 		while (nce->nce_trace[i] != NULL) {
2932 			th_trace = nce->nce_trace[i];
2933 
2934 			/* unlink th_trace and free it */
2935 			nce->nce_trace[i] = th_trace->th_next;
2936 			if (th_trace->th_next != NULL)
2937 				th_trace->th_next->th_prev =
2938 				    &nce->nce_trace[i];
2939 
2940 			th_trace->th_next = NULL;
2941 			th_trace->th_prev = NULL;
2942 			kmem_free(th_trace, sizeof (th_trace_t));
2943 		}
2944 	}
2945 
2946 }
2947 
2948 /* ARGSUSED */
2949 int
2950 nce_thread_exit(nce_t *nce, caddr_t arg)
2951 {
2952 	th_trace_t	*th_trace;
2953 
2954 	mutex_enter(&nce->nce_lock);
2955 	th_trace = th_trace_nce_lookup(nce);
2956 
2957 	if (th_trace == NULL) {
2958 		mutex_exit(&nce->nce_lock);
2959 		return (0);
2960 	}
2961 
2962 	ASSERT(th_trace->th_refcnt == 0);
2963 
2964 	/* unlink th_trace and free it */
2965 	*th_trace->th_prev = th_trace->th_next;
2966 	if (th_trace->th_next != NULL)
2967 		th_trace->th_next->th_prev = th_trace->th_prev;
2968 	th_trace->th_next = NULL;
2969 	th_trace->th_prev = NULL;
2970 	kmem_free(th_trace, sizeof (th_trace_t));
2971 	mutex_exit(&nce->nce_lock);
2972 	return (0);
2973 }
2974 #endif
2975