xref: /titanic_52/usr/src/uts/common/inet/ip/ip_ndp.c (revision fb2f18f820d90b001aea4fb27dd654bc1263c440)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_rts.h>
63 #include <inet/ip6.h>
64 #include <inet/ip_ndp.h>
65 #include <inet/ipsec_impl.h>
66 #include <inet/ipsec_info.h>
67 #include <inet/sctp_ip.h>
68 
69 /*
70  * Function names with nce_ prefix are static while function
71  * names with ndp_ prefix are used by rest of the IP.
72  *
73  * Lock ordering:
74  *
75  *	ndp_g_lock -> ill_lock -> nce_lock
76  *
77  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
78  * nce_next.  Nce_lock protects the contents of the NCE (particularly
79  * nce_refcnt).
80  */
81 
/*
 * Forward declarations for routines private to this file.  Besides the
 * nce_ helpers, the version-specific ndp_*_v4/ndp_*_v6 routines are also
 * static; they are reached through the version-independent wrappers
 * ndp_add() and ndp_lookup_then_add().
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
    ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
/* Defined outside this file. */
extern void	th_trace_rrecord(th_trace_t *);
static	int	ndp_lookup_then_add_v6(ill_t *, uchar_t *,
    const in6_addr_t *, const in6_addr_t *, const in6_addr_t *,
    uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
static	int	ndp_lookup_then_add_v4(ill_t *, uchar_t *,
    const in_addr_t *, const in_addr_t *, const in_addr_t *,
    uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
static	int	ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
    nce_t **);
static	int	ndp_add_v4(ill_t *, uchar_t *, const in_addr_t *,
    const in_addr_t *, const in_addr_t *, uint32_t, uint16_t, uint16_t,
    nce_t **, mblk_t *, mblk_t *);


/* Only declared (and only called, from ndp_inactive) in NCE_DEBUG builds. */
#ifdef NCE_DEBUG
void	nce_trace_inactive(nce_t *);
#endif

/*
 * Global NDP state, one instance per IP version: ndp4 for IPv4 entries and
 * ndp6 for IPv6 entries.  Each carries its own ndp_g_lock, hash table,
 * mask-entry list and walker bookkeeping.
 */
ndp_g_t ndp4, ndp6;
121 
/*
 * Hash bucket selectors.  Each macro expands to the address of the
 * nce_hash_tbl slot that `addr' hashes to, in the IPv4 table (ndp4) or the
 * IPv6 table (ndp6) respectively.  Callers in this file pass the address
 * by value (e.g. NCE_HASH_PTR_V6(*addr)).
 */
#define	NCE_HASH_PTR_V4(addr) \
	(&(ndp4.nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

#define	NCE_HASH_PTR_V6(addr) \
	(&(ndp6.nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
127 
128 /*
129  * Compute default flags to use for an advertisement of this nce's address.
130  */
131 static int
132 nce_advert_flags(const nce_t *nce)
133 {
134 	int flag = 0;
135 
136 	if (nce->nce_flags & NCE_F_ISROUTER)
137 		flag |= NDP_ISROUTER;
138 	if (!(nce->nce_flags & NCE_F_PROXY))
139 		flag |= NDP_ORIDE;
140 	return (flag);
141 }
142 
143 int
144 ndp_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
145     const void *mask, const void *extract_mask,
146     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
147     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
148 {
149 	int status;
150 
151 	if (ill->ill_isv6)
152 		status = ndp_add_v6(ill, hw_addr, (in6_addr_t *)addr,
153 		    (in6_addr_t *)mask, (in6_addr_t *)extract_mask,
154 		    hw_extract_start, flags, state, newnce);
155 	else
156 		status = ndp_add_v4(ill, hw_addr, (in_addr_t *)addr,
157 		    (in_addr_t *)mask, (in_addr_t *)extract_mask,
158 		    hw_extract_start, flags, state, newnce, fp_mp, res_mp);
159 	return (status);
160 }
161 
/*
 * Non-tunable probe interval, based on link capabilities: 150 when the
 * ill has ill_note_link set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER by the DAD probe paths in this file (presumably in
 * milliseconds -- confirm against NDP_RESTART_TIMER).
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
164 
165 /*
166  * NDP Cache Entry creation routine.
167  * Mapped entries will never do NUD .
168  * This routine must always be called with ndp6.ndp_g_lock held.
169  * Prior to return, nce_refcnt is incremented.
170  */
171 static int
172 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
173     const in6_addr_t *mask, const in6_addr_t *extract_mask,
174     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
175     nce_t **newnce)
176 {
177 	static	nce_t		nce_nil;
178 	nce_t		*nce;
179 	mblk_t		*mp;
180 	mblk_t		*template;
181 	nce_t		**ncep;
182 	int		err;
183 	boolean_t	dropped = B_FALSE;
184 
185 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
186 	ASSERT(ill != NULL && ill->ill_isv6);
187 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
188 		ip0dbg(("ndp_add: no addr\n"));
189 		return (EINVAL);
190 	}
191 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
192 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
193 		return (EINVAL);
194 	}
195 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
196 	    (flags & NCE_F_MAPPING)) {
197 		ip0dbg(("ndp_add: extract mask zero for mapping"));
198 		return (EINVAL);
199 	}
200 	/*
201 	 * Allocate the mblk to hold the nce.
202 	 *
203 	 * XXX This can come out of a separate cache - nce_cache.
204 	 * We don't need the mp anymore as there are no more
205 	 * "qwriter"s
206 	 */
207 	mp = allocb(sizeof (nce_t), BPRI_MED);
208 	if (mp == NULL)
209 		return (ENOMEM);
210 
211 	nce = (nce_t *)mp->b_rptr;
212 	mp->b_wptr = (uchar_t *)&nce[1];
213 	*nce = nce_nil;
214 
215 	/*
216 	 * This one holds link layer address
217 	 */
218 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
219 		template = nce_udreq_alloc(ill);
220 	} else {
221 		if (ill->ill_resolver_mp == NULL) {
222 			freeb(mp);
223 			return (EINVAL);
224 		}
225 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
226 		template = copyb(ill->ill_resolver_mp);
227 	}
228 	if (template == NULL) {
229 		freeb(mp);
230 		return (ENOMEM);
231 	}
232 	nce->nce_ill = ill;
233 	nce->nce_ipversion = IPV6_VERSION;
234 	nce->nce_flags = flags;
235 	nce->nce_state = state;
236 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
237 	nce->nce_rcnt = ill->ill_xmit_count;
238 	nce->nce_addr = *addr;
239 	nce->nce_mask = *mask;
240 	nce->nce_extract_mask = *extract_mask;
241 	nce->nce_ll_extract_start = hw_extract_start;
242 	nce->nce_fp_mp = NULL;
243 	nce->nce_res_mp = template;
244 	if (state == ND_REACHABLE)
245 		nce->nce_last = TICK_TO_MSEC(lbolt64);
246 	else
247 		nce->nce_last = 0;
248 	nce->nce_qd_mp = NULL;
249 	nce->nce_mp = mp;
250 	if (hw_addr != NULL)
251 		nce_set_ll(nce, hw_addr);
252 	/* This one is for nce getting created */
253 	nce->nce_refcnt = 1;
254 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
255 	if (nce->nce_flags & NCE_F_MAPPING) {
256 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
257 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
258 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
259 		ncep = &ndp6.nce_mask_entries;
260 	} else {
261 		ncep = ((nce_t **)NCE_HASH_PTR_V6(*addr));
262 	}
263 
264 #ifdef NCE_DEBUG
265 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
266 #endif
267 	/*
268 	 * Atomically ensure that the ill is not CONDEMNED, before
269 	 * adding the NCE.
270 	 */
271 	mutex_enter(&ill->ill_lock);
272 	if (ill->ill_state_flags & ILL_CONDEMNED) {
273 		mutex_exit(&ill->ill_lock);
274 		freeb(mp);
275 		freeb(template);
276 		return (EINVAL);
277 	}
278 	if ((nce->nce_next = *ncep) != NULL)
279 		nce->nce_next->nce_ptpn = &nce->nce_next;
280 	*ncep = nce;
281 	nce->nce_ptpn = ncep;
282 	*newnce = nce;
283 	/* This one is for nce being used by an active thread */
284 	NCE_REFHOLD(*newnce);
285 
286 	/* Bump up the number of nce's referencing this ill */
287 	ill->ill_nce_cnt++;
288 	mutex_exit(&ill->ill_lock);
289 
290 	err = 0;
291 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
292 		mutex_enter(&nce->nce_lock);
293 		mutex_exit(&ndp6.ndp_g_lock);
294 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
295 		mutex_exit(&nce->nce_lock);
296 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
297 		    &ipv6_all_zeros, addr, NDP_PROBE);
298 		if (dropped) {
299 			mutex_enter(&nce->nce_lock);
300 			nce->nce_pcnt++;
301 			mutex_exit(&nce->nce_lock);
302 		}
303 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
304 		mutex_enter(&ndp6.ndp_g_lock);
305 		err = EINPROGRESS;
306 	} else if (flags & NCE_F_UNSOL_ADV) {
307 		/*
308 		 * We account for the transmit below by assigning one
309 		 * less than the ndd variable. Subsequent decrements
310 		 * are done in ndp_timer.
311 		 */
312 		mutex_enter(&nce->nce_lock);
313 		mutex_exit(&ndp6.ndp_g_lock);
314 		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
315 		mutex_exit(&nce->nce_lock);
316 		dropped = nce_xmit(ill,
317 		    ND_NEIGHBOR_ADVERT,
318 		    ill,	/* ill to be used for extracting ill_nd_lla */
319 		    B_TRUE,	/* use ill_nd_lla */
320 		    addr,	/* Source and target of the advertisement pkt */
321 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
322 		    nce_advert_flags(nce));
323 		mutex_enter(&nce->nce_lock);
324 		if (dropped)
325 			nce->nce_unsolicit_count++;
326 		if (nce->nce_unsolicit_count != 0) {
327 			nce->nce_timeout_id = timeout(ndp_timer, nce,
328 			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
329 		}
330 		mutex_exit(&nce->nce_lock);
331 		mutex_enter(&ndp6.ndp_g_lock);
332 	}
333 	/*
334 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
335 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
336 	 * We call nce_fastpath from nce_update if the link layer address of
337 	 * the peer changes from nce_update
338 	 */
339 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
340 		nce_fastpath(nce);
341 	return (err);
342 }
343 
344 int
345 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
346     const void *mask, const void *extract_mask,
347     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
348     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
349 {
350 	int status;
351 
352 	if (ill->ill_isv6) {
353 		status = ndp_lookup_then_add_v6(ill, hw_addr,
354 		    (in6_addr_t *)addr, (in6_addr_t *)mask,
355 		    (in6_addr_t *)extract_mask, hw_extract_start, flags,
356 		    state, newnce, fp_mp, res_mp);
357 	} else  {
358 		status = ndp_lookup_then_add_v4(ill, hw_addr,
359 		    (in_addr_t *)addr, (in_addr_t *)mask,
360 		    (in_addr_t *)extract_mask, hw_extract_start, flags,
361 		    state, newnce, fp_mp, res_mp);
362 	}
363 
364 	return (status);
365 }
366 
367 static int
368 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
369     const in6_addr_t *mask, const in6_addr_t *extract_mask,
370     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
371     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
372 {
373 	int	err = 0;
374 	nce_t	*nce;
375 
376 	ASSERT(ill != NULL && ill->ill_isv6);
377 	mutex_enter(&ndp6.ndp_g_lock);
378 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
379 	nce = nce_lookup_addr(ill, addr, nce);
380 	if (nce == NULL) {
381 		err = ndp_add(ill,
382 		    hw_addr,
383 		    addr,
384 		    mask,
385 		    extract_mask,
386 		    hw_extract_start,
387 		    flags,
388 		    state,
389 		    newnce,
390 		    fp_mp,
391 		    res_mp);
392 	} else {
393 		*newnce = nce;
394 		err = EEXIST;
395 	}
396 	mutex_exit(&ndp6.ndp_g_lock);
397 	return (err);
398 }
399 
400 /*
401  * Remove all the CONDEMNED nces from the appropriate hash table.
402  * We create a private list of NCEs, these may have ires pointing
403  * to them, so the list will be passed through to clean up dependent
404  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
405  */
406 static void
407 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
408 {
409 	nce_t *nce1;
410 	nce_t **ptpn;
411 
412 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
413 	ASSERT(ndp->ndp_g_walker == 0);
414 	for (; nce; nce = nce1) {
415 		nce1 = nce->nce_next;
416 		mutex_enter(&nce->nce_lock);
417 		if (nce->nce_flags & NCE_F_CONDEMNED) {
418 			ptpn = nce->nce_ptpn;
419 			nce1 = nce->nce_next;
420 			if (nce1 != NULL)
421 				nce1->nce_ptpn = ptpn;
422 			*ptpn = nce1;
423 			nce->nce_ptpn = NULL;
424 			nce->nce_next = NULL;
425 			nce->nce_next = *free_nce_list;
426 			*free_nce_list = nce;
427 		}
428 		mutex_exit(&nce->nce_lock);
429 	}
430 }
431 
432 /*
433  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
434  *    will return this NCE. Also no new IREs will be created that
435  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
436  *    be started (See NDP_RESTART_TIMER).
437  * 2. Cancel any currently running timeouts.
438  * 3. If there is an ndp walker, return. The walker will do the cleanup.
439  *    This ensures that walkers see a consistent list of NCEs while walking.
440  * 4. Otherwise remove the NCE from the list of NCEs
441  * 5. Delete all IREs pointing to this NCE.
442  */
443 void
444 ndp_delete(nce_t *nce)
445 {
446 	nce_t	**ptpn;
447 	nce_t	*nce1;
448 	int	ipversion = nce->nce_ipversion;
449 	ndp_g_t *ndp = (ipversion == IPV4_VERSION ? &ndp4 : &ndp6);
450 
451 	/* Serialize deletes */
452 	mutex_enter(&nce->nce_lock);
453 	if (nce->nce_flags & NCE_F_CONDEMNED) {
454 		/* Some other thread is doing the delete */
455 		mutex_exit(&nce->nce_lock);
456 		return;
457 	}
458 	/*
459 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
460 	 * refcnt has to be >= 2
461 	 */
462 	ASSERT(nce->nce_refcnt >= 2);
463 	nce->nce_flags |= NCE_F_CONDEMNED;
464 	mutex_exit(&nce->nce_lock);
465 
466 	nce_fastpath_list_delete(nce);
467 
468 	/*
469 	 * Cancel any running timer. Timeout can't be restarted
470 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
471 	 * Passing invalid timeout id is fine.
472 	 */
473 	if (nce->nce_timeout_id != 0) {
474 		(void) untimeout(nce->nce_timeout_id);
475 		nce->nce_timeout_id = 0;
476 	}
477 
478 	mutex_enter(&ndp->ndp_g_lock);
479 	if (nce->nce_ptpn == NULL) {
480 		/*
481 		 * The last ndp walker has already removed this nce from
482 		 * the list after we marked the nce CONDEMNED and before
483 		 * we grabbed the global lock.
484 		 */
485 		mutex_exit(&ndp->ndp_g_lock);
486 		return;
487 	}
488 	if (ndp->ndp_g_walker > 0) {
489 		/*
490 		 * Can't unlink. The walker will clean up
491 		 */
492 		ndp->ndp_g_walker_cleanup = B_TRUE;
493 		mutex_exit(&ndp->ndp_g_lock);
494 		return;
495 	}
496 
497 	/*
498 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
499 	 * the timer since it is marked CONDEMNED.
500 	 */
501 	ptpn = nce->nce_ptpn;
502 	nce1 = nce->nce_next;
503 	if (nce1 != NULL)
504 		nce1->nce_ptpn = ptpn;
505 	*ptpn = nce1;
506 	nce->nce_ptpn = NULL;
507 	nce->nce_next = NULL;
508 	mutex_exit(&ndp->ndp_g_lock);
509 
510 	nce_ire_delete(nce);
511 }
512 
513 void
514 ndp_inactive(nce_t *nce)
515 {
516 	mblk_t		**mpp;
517 	ill_t		*ill;
518 
519 	ASSERT(nce->nce_refcnt == 0);
520 	ASSERT(MUTEX_HELD(&nce->nce_lock));
521 	ASSERT(nce->nce_fastpath == NULL);
522 
523 	/* Free all nce allocated messages */
524 	mpp = &nce->nce_first_mp_to_free;
525 	do {
526 		while (*mpp != NULL) {
527 			mblk_t  *mp;
528 
529 			mp = *mpp;
530 			*mpp = mp->b_next;
531 
532 			inet_freemsg(mp);
533 		}
534 	} while (mpp++ != &nce->nce_last_mp_to_free);
535 
536 #ifdef NCE_DEBUG
537 	nce_trace_inactive(nce);
538 #endif
539 
540 	ill = nce->nce_ill;
541 	mutex_enter(&ill->ill_lock);
542 	ill->ill_nce_cnt--;
543 	/*
544 	 * If the number of nce's associated with this ill have dropped
545 	 * to zero, check whether we need to restart any operation that
546 	 * is waiting for this to happen.
547 	 */
548 	if (ill->ill_nce_cnt == 0) {
549 		/* ipif_ill_refrele_tail drops the ill_lock */
550 		ipif_ill_refrele_tail(ill);
551 	} else {
552 		mutex_exit(&ill->ill_lock);
553 	}
554 	mutex_destroy(&nce->nce_lock);
555 	if (nce->nce_mp != NULL)
556 		inet_freemsg(nce->nce_mp);
557 }
558 
559 /*
560  * ndp_walk routine.  Delete the nce if it is associated with the ill
561  * that is going away.  Always called as a writer.
562  */
563 void
564 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
565 {
566 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
567 		ndp_delete(nce);
568 	}
569 }
570 
571 /*
572  * Walk a list of to be inactive NCEs and blow away all the ires.
573  */
574 static void
575 nce_ire_delete_list(nce_t *nce)
576 {
577 	nce_t *nce_next;
578 
579 	ASSERT(nce != NULL);
580 	while (nce != NULL) {
581 		nce_next = nce->nce_next;
582 		nce->nce_next = NULL;
583 
584 		/*
585 		 * It is possible for the last ndp walker (this thread)
586 		 * to come here after ndp_delete has marked the nce CONDEMNED
587 		 * and before it has removed the nce from the fastpath list
588 		 * or called untimeout. So we need to do it here. It is safe
589 		 * for both ndp_delete and this thread to do it twice or
590 		 * even simultaneously since each of the threads has a
591 		 * reference on the nce.
592 		 */
593 		nce_fastpath_list_delete(nce);
594 		/*
595 		 * Cancel any running timer. Timeout can't be restarted
596 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
597 		 * Passing invalid timeout id is fine.
598 		 */
599 		if (nce->nce_timeout_id != 0) {
600 			(void) untimeout(nce->nce_timeout_id);
601 			nce->nce_timeout_id = 0;
602 		}
603 		/*
604 		 * We might hit this func thus in the v4 case:
605 		 * ipif_down->ipif_ndp_down->ndp_walk
606 		 */
607 
608 		if (nce->nce_ipversion == IPV4_VERSION) {
609 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
610 			    IRE_CACHE, nce_ire_delete1,
611 			    (char *)nce, nce->nce_ill);
612 		} else {
613 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
614 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
615 			    IRE_CACHE, nce_ire_delete1,
616 			    (char *)nce, nce->nce_ill);
617 		}
618 		NCE_REFRELE_NOTR(nce);
619 		nce = nce_next;
620 	}
621 }
622 
623 /*
624  * Delete an ire when the nce goes away.
625  */
626 /* ARGSUSED */
627 static void
628 nce_ire_delete(nce_t *nce)
629 {
630 	if (nce->nce_ipversion == IPV6_VERSION) {
631 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
632 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
633 		NCE_REFRELE_NOTR(nce);
634 	} else {
635 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
636 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
637 		NCE_REFRELE_NOTR(nce);
638 	}
639 }
640 
641 /*
642  * ire_walk routine used to delete every IRE that shares this nce
643  */
644 static void
645 nce_ire_delete1(ire_t *ire, char *nce_arg)
646 {
647 	nce_t	*nce = (nce_t *)nce_arg;
648 
649 	ASSERT(ire->ire_type == IRE_CACHE);
650 
651 	if (ire->ire_nce == nce) {
652 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
653 		ire_delete(ire);
654 	}
655 }
656 
657 /*
658  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
659  */
660 boolean_t
661 ndp_restart_dad(nce_t *nce)
662 {
663 	boolean_t started;
664 	boolean_t dropped;
665 
666 	if (nce == NULL)
667 		return (B_FALSE);
668 	mutex_enter(&nce->nce_lock);
669 	if (nce->nce_state == ND_PROBE) {
670 		mutex_exit(&nce->nce_lock);
671 		started = B_TRUE;
672 	} else if (nce->nce_state == ND_REACHABLE) {
673 		nce->nce_state = ND_PROBE;
674 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
675 		mutex_exit(&nce->nce_lock);
676 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
677 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
678 		if (dropped) {
679 			mutex_enter(&nce->nce_lock);
680 			nce->nce_pcnt++;
681 			mutex_exit(&nce->nce_lock);
682 		}
683 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
684 		started = B_TRUE;
685 	} else {
686 		mutex_exit(&nce->nce_lock);
687 		started = B_FALSE;
688 	}
689 	return (started);
690 }
691 
692 /*
693  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
694  * If one is found, the refcnt on the nce will be incremented.
695  */
696 nce_t *
697 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
698 {
699 	nce_t	*nce;
700 
701 	ASSERT(ill != NULL && ill->ill_isv6);
702 	if (!caller_holds_lock) {
703 		mutex_enter(&ndp6.ndp_g_lock);
704 	}
705 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
706 	nce = nce_lookup_addr(ill, addr, nce);
707 	if (nce == NULL)
708 		nce = nce_lookup_mapping(ill, addr);
709 	if (!caller_holds_lock)
710 		mutex_exit(&ndp6.ndp_g_lock);
711 	return (nce);
712 }
713 /*
714  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
715  * If one is found, the refcnt on the nce will be incremented.
716  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
717  * so we skip the nce_lookup_mapping call.
718  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
719  */
720 nce_t *
721 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
722 {
723 	nce_t	*nce;
724 	in6_addr_t addr6;
725 
726 	if (!caller_holds_lock) {
727 		mutex_enter(&ndp4.ndp_g_lock);
728 	}
729 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr)); /* head of v6 hash table */
730 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
731 	nce = nce_lookup_addr(ill, &addr6, nce);
732 	if (!caller_holds_lock)
733 		mutex_exit(&ndp4.ndp_g_lock);
734 	return (nce);
735 }
736 
737 /*
738  * Cache entry lookup.  Try to find an nce matching the parameters passed.
739  * Look only for exact entries (no mappings).  If an nce is found, increment
740  * the hold count on that nce. The caller passes in the start of the
741  * appropriate hash table, and must be holding the appropriate global
742  * lock (ndp_g_lock).
743  */
744 static nce_t *
745 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
746 {
747 	ndp_g_t *ndp = (ill->ill_isv6 ? &ndp6 : &ndp4);
748 
749 	ASSERT(ill != NULL);
750 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
751 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
752 		return (NULL);
753 	for (; nce != NULL; nce = nce->nce_next) {
754 		if (nce->nce_ill == ill) {
755 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
756 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
757 			    &ipv6_all_ones)) {
758 				mutex_enter(&nce->nce_lock);
759 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
760 					NCE_REFHOLD_LOCKED(nce);
761 					mutex_exit(&nce->nce_lock);
762 					break;
763 				}
764 				mutex_exit(&nce->nce_lock);
765 			}
766 		}
767 	}
768 	return (nce);
769 }
770 
771 /*
772  * Cache entry lookup.  Try to find an nce matching the parameters passed.
773  * Look only for mappings.
774  */
775 static nce_t *
776 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
777 {
778 	nce_t	*nce;
779 
780 	ASSERT(ill != NULL && ill->ill_isv6);
781 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
782 	if (!IN6_IS_ADDR_MULTICAST(addr))
783 		return (NULL);
784 	nce = ndp6.nce_mask_entries;
785 	for (; nce != NULL; nce = nce->nce_next)
786 		if (nce->nce_ill == ill &&
787 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
788 			mutex_enter(&nce->nce_lock);
789 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
790 				NCE_REFHOLD_LOCKED(nce);
791 				mutex_exit(&nce->nce_lock);
792 				break;
793 			}
794 			mutex_exit(&nce->nce_lock);
795 		}
796 	return (nce);
797 }
798 
799 /*
800  * Process passed in parameters either from an incoming packet or via
801  * user ioctl.
802  */
803 void
804 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
805 {
806 	ill_t	*ill = nce->nce_ill;
807 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
808 	mblk_t	*mp;
809 	boolean_t ll_updated = B_FALSE;
810 	boolean_t ll_changed;
811 
812 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
813 	/*
814 	 * No updates of link layer address or the neighbor state is
815 	 * allowed, when the cache is in NONUD state.  This still
816 	 * allows for responding to reachability solicitation.
817 	 */
818 	mutex_enter(&nce->nce_lock);
819 	if (nce->nce_state == ND_INCOMPLETE) {
820 		if (hw_addr == NULL) {
821 			mutex_exit(&nce->nce_lock);
822 			return;
823 		}
824 		nce_set_ll(nce, hw_addr);
825 		/*
826 		 * Update nce state and send the queued packets
827 		 * back to ip this time ire will be added.
828 		 */
829 		if (flag & ND_NA_FLAG_SOLICITED) {
830 			nce_update(nce, ND_REACHABLE, NULL);
831 		} else {
832 			nce_update(nce, ND_STALE, NULL);
833 		}
834 		mutex_exit(&nce->nce_lock);
835 		nce_fastpath(nce);
836 		mutex_enter(&nce->nce_lock);
837 		mp = nce->nce_qd_mp;
838 		nce->nce_qd_mp = NULL;
839 		mutex_exit(&nce->nce_lock);
840 		while (mp != NULL) {
841 			mblk_t *nxt_mp, *data_mp;
842 
843 			nxt_mp = mp->b_next;
844 			mp->b_next = NULL;
845 
846 			if (mp->b_datap->db_type == M_CTL)
847 				data_mp = mp->b_cont;
848 			else
849 				data_mp = mp;
850 			if (data_mp->b_prev != NULL) {
851 				ill_t   *inbound_ill;
852 				queue_t *fwdq = NULL;
853 				uint_t ifindex;
854 
855 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
856 				inbound_ill = ill_lookup_on_ifindex(ifindex,
857 				    B_TRUE, NULL, NULL, NULL, NULL);
858 				if (inbound_ill == NULL) {
859 					data_mp->b_prev = NULL;
860 					freemsg(mp);
861 					return;
862 				} else {
863 					fwdq = inbound_ill->ill_rq;
864 				}
865 				data_mp->b_prev = NULL;
866 				/*
867 				 * Send a forwarded packet back into ip_rput_v6
868 				 * just as in ire_send_v6().
869 				 * Extract the queue from b_prev (set in
870 				 * ip_rput_data_v6).
871 				 */
872 				if (fwdq != NULL) {
873 					/*
874 					 * Forwarded packets hop count will
875 					 * get decremented in ip_rput_data_v6
876 					 */
877 					if (data_mp != mp)
878 						freeb(mp);
879 					put(fwdq, data_mp);
880 				} else {
881 					/*
882 					 * Send locally originated packets back
883 					 * into * ip_wput_v6.
884 					 */
885 					put(ill->ill_wq, mp);
886 				}
887 				ill_refrele(inbound_ill);
888 			} else {
889 				put(ill->ill_wq, mp);
890 			}
891 			mp = nxt_mp;
892 		}
893 		return;
894 	}
895 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
896 	if (!is_adv) {
897 		/* If this is a SOLICITATION request only */
898 		if (ll_changed)
899 			nce_update(nce, ND_STALE, hw_addr);
900 		mutex_exit(&nce->nce_lock);
901 		return;
902 	}
903 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
904 		/* If in any other state than REACHABLE, ignore */
905 		if (nce->nce_state == ND_REACHABLE) {
906 			nce_update(nce, ND_STALE, NULL);
907 		}
908 		mutex_exit(&nce->nce_lock);
909 		return;
910 	} else {
911 		if (ll_changed) {
912 			nce_update(nce, ND_UNCHANGED, hw_addr);
913 			ll_updated = B_TRUE;
914 		}
915 		if (flag & ND_NA_FLAG_SOLICITED) {
916 			nce_update(nce, ND_REACHABLE, NULL);
917 		} else {
918 			if (ll_updated) {
919 				nce_update(nce, ND_STALE, NULL);
920 			}
921 		}
922 		mutex_exit(&nce->nce_lock);
923 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
924 		    NCE_F_ISROUTER)) {
925 			ire_t *ire;
926 
927 			/*
928 			 * Router turned to host.  We need to remove the
929 			 * entry as well as any default route that may be
930 			 * using this as a next hop.  This is required by
931 			 * section 7.2.5 of RFC 2461.
932 			 */
933 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
934 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
935 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
936 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
937 			    MATCH_IRE_DEFAULT);
938 			if (ire != NULL) {
939 				ip_rts_rtmsg(RTM_DELETE, ire, 0);
940 				ire_delete(ire);
941 				ire_refrele(ire);
942 			}
943 			ndp_delete(nce);
944 		}
945 	}
946 }
947 
948 /*
949  * Pass arg1 to the pfi supplied, along with each nce in existence.
950  * ndp_walk() places a REFHOLD on the nce and drops the lock when
951  * walking the hash list.
952  */
953 void
954 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
955     boolean_t trace)
956 {
957 
958 	nce_t	*nce;
959 	nce_t	*nce1;
960 	nce_t	**ncep;
961 	nce_t	*free_nce_list = NULL;
962 
963 	mutex_enter(&ndp->ndp_g_lock);
964 	/* Prevent ndp_delete from unlink and free of NCE */
965 	ndp->ndp_g_walker++;
966 	mutex_exit(&ndp->ndp_g_lock);
967 	for (ncep = ndp->nce_hash_tbl;
968 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
969 		for (nce = *ncep; nce != NULL; nce = nce1) {
970 			nce1 = nce->nce_next;
971 			if (ill == NULL || nce->nce_ill == ill) {
972 				if (trace) {
973 					NCE_REFHOLD(nce);
974 					(*pfi)(nce, arg1);
975 					NCE_REFRELE(nce);
976 				} else {
977 					NCE_REFHOLD_NOTR(nce);
978 					(*pfi)(nce, arg1);
979 					NCE_REFRELE_NOTR(nce);
980 				}
981 			}
982 		}
983 	}
984 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
985 		nce1 = nce->nce_next;
986 		if (ill == NULL || nce->nce_ill == ill) {
987 			if (trace) {
988 				NCE_REFHOLD(nce);
989 				(*pfi)(nce, arg1);
990 				NCE_REFRELE(nce);
991 			} else {
992 				NCE_REFHOLD_NOTR(nce);
993 				(*pfi)(nce, arg1);
994 				NCE_REFRELE_NOTR(nce);
995 			}
996 		}
997 	}
998 	mutex_enter(&ndp->ndp_g_lock);
999 	ndp->ndp_g_walker--;
1000 	/*
1001 	 * While NCE's are removed from global list they are placed
1002 	 * in a private list, to be passed to nce_ire_delete_list().
1003 	 * The reason is, there may be ires pointing to this nce
1004 	 * which needs to cleaned up.
1005 	 */
1006 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
1007 		/* Time to delete condemned entries */
1008 		for (ncep = ndp->nce_hash_tbl;
1009 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1010 			nce = *ncep;
1011 			if (nce != NULL) {
1012 				nce_remove(ndp, nce, &free_nce_list);
1013 			}
1014 		}
1015 		nce = ndp->nce_mask_entries;
1016 		if (nce != NULL) {
1017 			nce_remove(ndp, nce, &free_nce_list);
1018 		}
1019 		ndp->ndp_g_walker_cleanup = B_FALSE;
1020 	}
1021 	mutex_exit(&ndp->ndp_g_lock);
1022 
1023 	if (free_nce_list != NULL) {
1024 		nce_ire_delete_list(free_nce_list);
1025 	}
1026 }
1027 
/*
 * Apply pfi(nce, arg1) to neighbor cache entries by running
 * ndp_walk_common() over the ndp4 table first and then the ndp6 table.
 * ill, pfi and arg1 are handed through unchanged; B_TRUE is passed as the
 * final argument of both calls.
 */
void
ndp_walk(ill_t *ill, pfi_t pfi, void *arg1)
{
	ndp_walk_common(&ndp4, ill, pfi, arg1, B_TRUE);
	ndp_walk_common(&ndp6, ill, pfi, arg1, B_TRUE);
}
1034 
/*
 * Process resolve requests.  Handles both mapped entries
 * as well as cases that need to be sent out on the wire.
 * Lookup a NCE for a given IRE.  Regardless of whether one exists
 * or one is created, we defer making ire point to nce until the
 * ire is actually added at which point the nce_refcnt on the nce is
 * incremented.  This is done primarily to have symmetry between ire_add()
 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
 *
 * Return values produced below:
 *	0		the nce is usable without queueing mp here (it exists
 *			and is not ND_INCOMPLETE, or the ill is XRESOLV); for
 *			a multicast dst this is whatever nce_set_multicast()
 *			returned.
 *	EINPROGRESS	mp (possibly with a zoneid block prepended) has been
 *			queued on the nce pending resolution.
 *	ENOMEM		ip_prepend_zoneid() failed; mp was not queued.
 *	EBUSY		nce_solicit() returned 0; mp was not queued.
 *	other		error from ndp_lookup_then_add(), passed through.
 *
 * The reference returned in nce by ndp_lookup_then_add() is always released
 * before returning.
 */
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
	nce_t		*nce;
	int		err = 0;
	uint32_t	ms;
	mblk_t		*mp_nce = NULL;

	ASSERT(ill != NULL);
	ASSERT(ill->ill_isv6);
	/* Multicast destinations are mapped, never solicited. */
	if (IN6_IS_ADDR_MULTICAST(dst)) {
		err = nce_set_multicast(ill, dst);
		return (err);
	}
	err = ndp_lookup_then_add(ill,
	    NULL,	/* No hardware address */
	    dst,
	    &ipv6_all_ones,
	    &ipv6_all_zeros,
	    0,
	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
	    ND_INCOMPLETE,
	    &nce,
	    NULL, /* let ndp_add figure out fastpath mp and dlureq_mp for v6 */
	    NULL);

	switch (err) {
	case 0:
		/*
		 * New cache entry was created.  Check below whether the
		 * state is still ND_INCOMPLETE: it can already be in some
		 * other state even before we send out the solicitation, as
		 * we could get un-solicited advertisements.  If it has
		 * moved on, there is nothing to solicit.
		 *
		 * If this is an XRESOLV interface, simply return 0,
		 * since we don't want to solicit just yet.
		 */
		if (ill->ill_flags & ILLF_XRESOLV) {
			NCE_REFRELE(nce);
			return (0);
		}
		/*
		 * nce_solicit() asserts that ill_g_lock is read-held and
		 * nce_lock is held, so take both (in that order) here.
		 */
		rw_enter(&ill_g_lock, RW_READER);
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state != ND_INCOMPLETE) {
			mutex_exit(&nce->nce_lock);
			rw_exit(&ill_g_lock);
			NCE_REFRELE(nce);
			return (0);
		}
		mp_nce = ip_prepend_zoneid(mp, zoneid);
		if (mp_nce == NULL) {
			/* The caller will free mp */
			mutex_exit(&nce->nce_lock);
			rw_exit(&ill_g_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (ENOMEM);
		}
		ms = nce_solicit(nce, mp_nce);
		rw_exit(&ill_g_lock);
		if (ms == 0) {
			/*
			 * The caller will free mp.  Only the block added by
			 * ip_prepend_zoneid() (if any) is released here.
			 */
			if (mp_nce != mp)
				freeb(mp_nce);
			mutex_exit(&nce->nce_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (EBUSY);
		}
		mutex_exit(&nce->nce_lock);
		/* ms is the retransmit interval returned by nce_solicit(). */
		NDP_RESTART_TIMER(nce, (clock_t)ms);
		NCE_REFRELE(nce);
		return (EINPROGRESS);
	case EEXIST:
		/* Resolution in progress just queue the packet */
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state == ND_INCOMPLETE) {
			mp_nce = ip_prepend_zoneid(mp, zoneid);
			if (mp_nce == NULL) {
				err = ENOMEM;
			} else {
				nce_queue_mp(nce, mp_nce);
				err = EINPROGRESS;
			}
		} else {
			/*
			 * Any other state implies we have
			 * a nce but IRE needs to be added ...
			 * ire_add_v6() will take care of the
			 * the case when the nce becomes CONDEMNED
			 * before the ire is added to the table.
			 */
			err = 0;
		}
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	default:
		/* No nce reference was obtained, so nothing to release. */
		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
		break;
	}
	return (err);
}
1147 
1148 /*
1149  * When there is no resolver, the link layer template is passed in
1150  * the IRE.
1151  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1152  * or one is created, we defer making ire point to nce until the
1153  * ire is actually added at which point the nce_refcnt on the nce is
1154  * incremented.  This is done primarily to have symmetry between ire_add()
1155  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1156  */
1157 int
1158 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1159 {
1160 	nce_t		*nce;
1161 	int		err = 0;
1162 
1163 	ASSERT(ill != NULL);
1164 	ASSERT(ill->ill_isv6);
1165 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1166 		err = nce_set_multicast(ill, dst);
1167 		return (err);
1168 	}
1169 
1170 	err = ndp_lookup_then_add(ill,
1171 	    NULL,	/* hardware address */
1172 	    dst,
1173 	    &ipv6_all_ones,
1174 	    &ipv6_all_zeros,
1175 	    0,
1176 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1177 	    ND_REACHABLE,
1178 	    &nce,
1179 	    NULL, /* let ndp_add figure out fp_mp/dlureq_mp for v6 */
1180 	    NULL);
1181 
1182 	switch (err) {
1183 	case 0:
1184 		/*
1185 		 * Cache entry with a proper resolver cookie was
1186 		 * created.
1187 		 */
1188 		NCE_REFRELE(nce);
1189 		break;
1190 	case EEXIST:
1191 		err = 0;
1192 		NCE_REFRELE(nce);
1193 		break;
1194 	default:
1195 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1196 		break;
1197 	}
1198 	return (err);
1199 }
1200 
1201 /*
1202  * For each interface an entry is added for the unspecified multicast group.
1203  * Here that mapping is used to form the multicast cache entry for a particular
1204  * multicast destination.
1205  */
1206 static int
1207 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1208 {
1209 	nce_t		*mnce;	/* Multicast mapping entry */
1210 	nce_t		*nce;
1211 	uchar_t		*hw_addr = NULL;
1212 	int		err = 0;
1213 
1214 	ASSERT(ill != NULL);
1215 	ASSERT(ill->ill_isv6);
1216 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1217 
1218 	mutex_enter(&ndp6.ndp_g_lock);
1219 	nce = *((nce_t **)NCE_HASH_PTR_V6(*dst));
1220 	nce = nce_lookup_addr(ill, dst, nce);
1221 	if (nce != NULL) {
1222 		mutex_exit(&ndp6.ndp_g_lock);
1223 		NCE_REFRELE(nce);
1224 		return (0);
1225 	}
1226 	/* No entry, now lookup for a mapping this should never fail */
1227 	mnce = nce_lookup_mapping(ill, dst);
1228 	if (mnce == NULL) {
1229 		/* Something broken for the interface. */
1230 		mutex_exit(&ndp6.ndp_g_lock);
1231 		return (ESRCH);
1232 	}
1233 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1234 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1235 		/*
1236 		 * For IRE_IF_RESOLVER a hardware mapping can be
1237 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1238 		 * in the ill is copied in ndp_add().
1239 		 */
1240 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1241 		if (hw_addr == NULL) {
1242 			mutex_exit(&ndp6.ndp_g_lock);
1243 			NCE_REFRELE(mnce);
1244 			return (ENOMEM);
1245 		}
1246 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1247 	}
1248 	NCE_REFRELE(mnce);
1249 	/*
1250 	 * IRE_IF_NORESOLVER type simply copies the resolution
1251 	 * cookie passed in.  So no hw_addr is needed.
1252 	 */
1253 	err = ndp_add(ill,
1254 	    hw_addr,
1255 	    dst,
1256 	    &ipv6_all_ones,
1257 	    &ipv6_all_zeros,
1258 	    0,
1259 	    NCE_F_NONUD,
1260 	    ND_REACHABLE,
1261 	    &nce,
1262 	    NULL,
1263 	    NULL);
1264 	mutex_exit(&ndp6.ndp_g_lock);
1265 	if (hw_addr != NULL)
1266 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1267 	if (err != 0) {
1268 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1269 		return (err);
1270 	}
1271 	NCE_REFRELE(nce);
1272 	return (0);
1273 }
1274 
1275 /*
1276  * Return the link layer address, and any flags of a nce.
1277  */
1278 int
1279 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1280 {
1281 	nce_t		*nce;
1282 	in6_addr_t	*addr;
1283 	sin6_t		*sin6;
1284 	dl_unitdata_req_t	*dl;
1285 
1286 	ASSERT(ill != NULL && ill->ill_isv6);
1287 	sin6 = (sin6_t *)&lnr->lnr_addr;
1288 	addr =  &sin6->sin6_addr;
1289 
1290 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1291 	if (nce == NULL)
1292 		return (ESRCH);
1293 	/* If in INCOMPLETE state, no link layer address is available yet */
1294 	if (nce->nce_state == ND_INCOMPLETE)
1295 		goto done;
1296 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1297 	if (ill->ill_flags & ILLF_XRESOLV)
1298 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1299 	else
1300 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1301 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1302 	    sizeof (lnr->lnr_hdw_addr));
1303 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1304 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1305 	if (nce->nce_flags & NCE_F_ISROUTER)
1306 		lnr->lnr_flags = NDF_ISROUTER_ON;
1307 	if (nce->nce_flags & NCE_F_PROXY)
1308 		lnr->lnr_flags |= NDF_PROXY_ON;
1309 	if (nce->nce_flags & NCE_F_ANYCAST)
1310 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1311 done:
1312 	NCE_REFRELE(nce);
1313 	return (0);
1314 }
1315 
1316 /*
1317  * Send Enable/Disable multicast reqs to driver.
1318  */
1319 int
1320 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1321     uint32_t hw_addr_offset, mblk_t *mp)
1322 {
1323 	nce_t		*nce;
1324 	uchar_t		*hw_addr;
1325 
1326 	ASSERT(ill != NULL && ill->ill_isv6);
1327 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1328 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1329 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1330 		freemsg(mp);
1331 		return (EINVAL);
1332 	}
1333 	mutex_enter(&ndp6.ndp_g_lock);
1334 	nce = nce_lookup_mapping(ill, addr);
1335 	if (nce == NULL) {
1336 		mutex_exit(&ndp6.ndp_g_lock);
1337 		freemsg(mp);
1338 		return (ESRCH);
1339 	}
1340 	mutex_exit(&ndp6.ndp_g_lock);
1341 	/*
1342 	 * Update dl_addr_length and dl_addr_offset for primitives that
1343 	 * have physical addresses as opposed to full saps
1344 	 */
1345 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1346 	case DL_ENABMULTI_REQ:
1347 		/* Track the state if this is the first enabmulti */
1348 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1349 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1350 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1351 		break;
1352 	case DL_DISABMULTI_REQ:
1353 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1354 		break;
1355 	default:
1356 		NCE_REFRELE(nce);
1357 		ip1dbg(("ndp_mcastreq: default\n"));
1358 		return (EINVAL);
1359 	}
1360 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1361 	NCE_REFRELE(nce);
1362 	putnext(ill->ill_wq, mp);
1363 	return (0);
1364 }
1365 
/*
 * Send a neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt, and holds
 * ill_g_lock as reader (both are asserted below).
 *
 * If mp is non-NULL it is queued on the nce; if mp is NULL the packet already
 * queued in nce_qd_mp is examined instead.  Either way the packet is only
 * inspected to pick the source address for the solicitation.
 *
 * NOTE: This routine drops nce_lock and ill_g_lock (and later reacquires
 * them) when sending the packet.
 * NOTE: This routine does not consume mp.
 */
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
	ill_t		*ill;
	ill_t		*src_ill;
	ip6_t		*ip6h;
	in6_addr_t	src;
	in6_addr_t	dst;
	ipif_t		*ipif;
	ip6i_t		*ip6i;
	boolean_t	dropped = B_FALSE;

	ASSERT(RW_READ_HELD(&ill_g_lock));
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ill = nce->nce_ill;
	ASSERT(ill != NULL);

	/* Retransmit budget exhausted: tell the caller to abort. */
	if (nce->nce_rcnt == 0) {
		return (0);
	}

	if (mp == NULL) {
		ASSERT(nce->nce_qd_mp != NULL);
		mp = nce->nce_qd_mp;
	} else {
		nce_queue_mp(nce, mp);
	}

	/* Handle ip_newroute_v6 giving us IPSEC packets */
	if (mp->b_datap->db_type == M_CTL)
		mp = mp->b_cont;

	ip6h = (ip6_t *)mp->b_rptr;
	if (ip6h->ip6_nxt == IPPROTO_RAW) {
		/*
		 * This message should have been pulled up already in
		 * ip_wput_v6. We can't do pullups here because the message
		 * could be from the nce_qd_mp which could have b_next/b_prev
		 * non-NULL.
		 */
		ip6i = (ip6i_t *)ip6h;
		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
			    sizeof (ip6i_t) + IPV6_HDR_LEN);
		/* The real IPv6 header follows the ip6i_t. */
		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	}
	src = ip6h->ip6_src;
	/*
	 * If the src of outgoing packet is one of the assigned interface
	 * addresses use it, otherwise we will pick the source address below.
	 */
	src_ill = ill;
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* With an ill group, search every ill in the group. */
		if (ill->ill_group != NULL)
			src_ill = ill->ill_group->illgrp_ill;
		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
			for (ipif = src_ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				if (IN6_ARE_ADDR_EQUAL(&src,
				    &ipif->ipif_v6lcl_addr)) {
					break;
				}
			}
			/* Inner loop broke out on a match: stop searching. */
			if (ipif != NULL)
				break;
		}
		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and let nce_xmit choose a source.
		 * If an ipif can be found, but it's not yet done with DAD
		 * verification, then just postpone this transmission until
		 * later.  (src_ill != NULL here implies ipif != NULL, since
		 * the outer loop only exits early on a match.)
		 */
		if (src_ill == NULL)
			src = ipv6_all_zeros;
		else if (!ipif->ipif_addr_ready)
			return (ill->ill_reachable_retrans_time);
	}
	dst = nce->nce_addr;
	/*
	 * If source address is unspecified, nce_xmit will choose
	 * one for us and initialize the hardware address also
	 * appropriately.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&src))
		src_ill = NULL;
	/* Charge this attempt now; it is refunded below if not sent. */
	nce->nce_rcnt--;
	mutex_exit(&nce->nce_lock);
	rw_exit(&ill_g_lock);
	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
	    &dst, 0);
	/* Reacquire in the same order the caller took them. */
	rw_enter(&ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	if (dropped)
		nce->nce_rcnt++;
	return (ill->ill_reachable_retrans_time);
}
1471 
1472 /*
1473  * Attempt to recover an address on an interface that's been marked as a
1474  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1475  * no easy way to just probe the address and have the right thing happen if
1476  * it's no longer in use.  Instead, we just bring it up normally and allow the
1477  * regular interface start-up logic to probe for a remaining duplicate and take
1478  * us back down if necessary.
1479  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1480  * ip_ndp_excl.
1481  */
1482 /* ARGSUSED */
1483 static void
1484 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1485 {
1486 	ill_t	*ill = rq->q_ptr;
1487 	ipif_t	*ipif;
1488 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1489 
1490 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1491 		/*
1492 		 * We do not support recovery of proxy ARP'd interfaces,
1493 		 * because the system lacks a complete proxy ARP mechanism.
1494 		 */
1495 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1496 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1497 			continue;
1498 		}
1499 
1500 		/*
1501 		 * If we have already recovered or if the interface is going
1502 		 * away, then ignore.
1503 		 */
1504 		mutex_enter(&ill->ill_lock);
1505 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1506 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1507 			mutex_exit(&ill->ill_lock);
1508 			continue;
1509 		}
1510 
1511 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1512 		ill->ill_ipif_dup_count--;
1513 		mutex_exit(&ill->ill_lock);
1514 		ipif->ipif_was_dup = B_TRUE;
1515 
1516 		if (ipif_ndp_up(ipif, addr) != EINPROGRESS)
1517 			(void) ipif_up_done_v6(ipif);
1518 	}
1519 	freeb(mp);
1520 }
1521 
1522 /*
1523  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1524  * As long as someone else holds the address, the interface will stay down.
1525  * When that conflict goes away, the interface is brought back up.  This is
1526  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1527  * server will recover from a failure.
1528  *
1529  * For DHCP and temporary addresses, recovery is not done in the kernel.
1530  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1531  *
1532  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1533  */
1534 static void
1535 ipif6_dup_recovery(void *arg)
1536 {
1537 	ipif_t *ipif = arg;
1538 
1539 	ipif->ipif_recovery_id = 0;
1540 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1541 		return;
1542 
1543 	/*
1544 	 * No lock, because this is just an optimization.
1545 	 */
1546 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1547 		return;
1548 
1549 	/* If the link is down, we'll retry this later */
1550 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1551 		return;
1552 
1553 	ndp_do_recovery(ipif);
1554 }
1555 
1556 /*
1557  * Perform interface recovery by forcing the duplicate interfaces up and
1558  * allowing the system to determine which ones should stay up.
1559  *
1560  * Called both by recovery timer expiry and link-up notification.
1561  */
1562 void
1563 ndp_do_recovery(ipif_t *ipif)
1564 {
1565 	ill_t *ill = ipif->ipif_ill;
1566 	mblk_t *mp;
1567 
1568 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1569 	if (mp == NULL) {
1570 		mutex_enter(&ill->ill_lock);
1571 		if (ipif->ipif_recovery_id == 0 &&
1572 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1573 		    IPIF_CONDEMNED))) {
1574 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1575 			    ipif, MSEC_TO_TICK(ip_dup_recovery));
1576 		}
1577 		mutex_exit(&ill->ill_lock);
1578 	} else {
1579 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1580 		    sizeof (ipif->ipif_v6lcl_addr));
1581 		ill_refhold(ill);
1582 		(void) qwriter_ip(NULL, ill, ill->ill_rq, mp, ip_ndp_recover,
1583 		    CUR_OP, B_FALSE);
1584 	}
1585 }
1586 
1587 /*
1588  * Find the solicitation in the given message, and extract printable details
1589  * (MAC and IP addresses) from it.
1590  */
1591 static nd_neighbor_solicit_t *
1592 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1593     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1594 {
1595 	nd_neighbor_solicit_t *ns;
1596 	ip6_t *ip6h;
1597 	uchar_t *addr;
1598 	int alen;
1599 
1600 	alen = 0;
1601 	ip6h = (ip6_t *)mp->b_rptr;
1602 	if (dl_mp == NULL) {
1603 		nd_opt_hdr_t *opt;
1604 		int nslen;
1605 
1606 		/*
1607 		 * If it's from the fast-path, then it can't be a probe
1608 		 * message, and thus must include the source linkaddr option.
1609 		 * Extract that here.
1610 		 */
1611 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1612 		nslen = mp->b_wptr - (uchar_t *)ns;
1613 		if ((nslen -= sizeof (*ns)) > 0) {
1614 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1615 			    ND_OPT_SOURCE_LINKADDR);
1616 			if (opt != NULL &&
1617 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1618 			    ill->ill_nd_lla_len) {
1619 				addr = (uchar_t *)(opt + 1);
1620 				alen = ill->ill_nd_lla_len;
1621 			}
1622 		}
1623 		/*
1624 		 * We cheat a bit here for the sake of printing usable log
1625 		 * messages in the rare case where the reply we got was unicast
1626 		 * without a source linkaddr option, and the interface is in
1627 		 * fastpath mode.  (Sigh.)
1628 		 */
1629 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1630 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1631 			struct ether_header *pether;
1632 
1633 			pether = (struct ether_header *)((char *)ip6h -
1634 			    sizeof (*pether));
1635 			addr = pether->ether_shost.ether_addr_octet;
1636 			alen = ETHERADDRL;
1637 		}
1638 	} else {
1639 		dl_unitdata_ind_t *dlu;
1640 
1641 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1642 		alen = dlu->dl_src_addr_length;
1643 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1644 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1645 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1646 			if (ill->ill_sap_length < 0) {
1647 				alen += ill->ill_sap_length;
1648 			} else {
1649 				addr += ill->ill_sap_length;
1650 				alen -= ill->ill_sap_length;
1651 			}
1652 		}
1653 	}
1654 	if (alen > 0) {
1655 		*haddr = addr;
1656 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1657 	} else {
1658 		*haddr = NULL;
1659 		(void) strcpy(hbuf, "?");
1660 	}
1661 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1662 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1663 	return (ns);
1664 }
1665 
/*
 * This is for exclusive changes due to NDP duplicate address detection
 * failure.
 *
 * Invoked through qwriter_ip() (see ip_ndp_failure()).  mp is either the
 * copied packet itself (M_DATA), or a non-M_DATA block (treated as the DLPI
 * header) whose b_cont is the packet.  Every non-point-to-point ipif on the
 * ill whose local address equals the solicitation target, and that is not
 * already marked, is flagged IPIF_DUPLICATE and taken down; where allowed, a
 * recovery timer is started.  The message is freed before returning.
 */
/* ARGSUSED */
static void
ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	ipif_t	*ipif;
	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
	char hbuf[MAC_STR_LEN];
	char sbuf[INET6_ADDRSTRLEN];
	nd_neighbor_solicit_t *ns;
	mblk_t *dl_mp = NULL;
	uchar_t *haddr;

	/* Split off the leading non-data block, if there is one. */
	if (DB_TYPE(mp) != M_DATA) {
		dl_mp = mp;
		mp = mp->b_cont;
	}
	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
	    sizeof (sbuf), &haddr);
	if (haddr != NULL &&
	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
		/*
		 * Ignore conflicts generated by misbehaving switches that just
		 * reflect our own messages back to us.
		 */
		goto ignore_conflict;
	}
	(void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {

		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
		    &ns->nd_ns_target)) {
			continue;
		}

		/* If it's already marked, then don't do anything. */
		if (ipif->ipif_flags & IPIF_DUPLICATE)
			continue;

		/*
		 * If this is a failure during duplicate recovery, then don't
		 * complain.  It may take a long time to recover.
		 */
		if (!ipif->ipif_was_dup) {
			if (ipif->ipif_id != 0) {
				/*
				 * Append ":<id>" over the end of the ill
				 * name.  NOTE(review): the "- 1" presumes
				 * ill_name_length counts the terminating
				 * NUL -- confirm.
				 */
				(void) snprintf(ibuf + ill->ill_name_length - 1,
				    sizeof (ibuf) - ill->ill_name_length + 1,
				    ":%d", ipif->ipif_id);
			}
			cmn_err(CE_WARN, "%s has duplicate address %s (in "
			    "use by %s); disabled", ibuf, sbuf, hbuf);
		}
		mutex_enter(&ill->ill_lock);
		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
		ipif->ipif_flags |= IPIF_DUPLICATE;
		ill->ill_ipif_dup_count++;
		mutex_exit(&ill->ill_lock);
		(void) ipif_down(ipif, NULL, NULL);
		ipif_down_tail(ipif);
		/*
		 * Arm the kernel recovery timer only for addresses that are
		 * neither DHCP-managed nor temporary, on resolver-type
		 * interfaces, when the ipif is not moving or condemned and
		 * recovery is enabled (ip_dup_recovery > 0).
		 */
		mutex_enter(&ill->ill_lock);
		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
		    ill->ill_net_type == IRE_IF_RESOLVER &&
		    !(ipif->ipif_state_flags & (IPIF_MOVING |
		    IPIF_CONDEMNED)) &&
		    ip_dup_recovery > 0) {
			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
			    ipif, MSEC_TO_TICK(ip_dup_recovery));
		}
		mutex_exit(&ill->ill_lock);
	}
ignore_conflict:
	/*
	 * freeb() releases only the leading block, so the data blocks it
	 * pointed to are still valid for the freemsg() that follows.
	 */
	if (dl_mp != NULL)
		freeb(dl_mp);
	freemsg(mp);
}
1746 
1747 /*
1748  * Handle failure by tearing down the ipifs with the specified address.  Note
1749  * that tearing down the ipif also means deleting the nce through ipif_down, so
1750  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1751  * we start a timer on the ipif.
1752  */
1753 static void
1754 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1755 {
1756 	if ((mp = copymsg(mp)) != NULL) {
1757 		if (dl_mp == NULL)
1758 			dl_mp = mp;
1759 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1760 			dl_mp->b_cont = mp;
1761 		if (dl_mp == NULL) {
1762 			freemsg(mp);
1763 		} else {
1764 			ill_refhold(ill);
1765 			(void) qwriter_ip(NULL, ill, ill->ill_rq, dl_mp,
1766 			    ip_ndp_excl, CUR_OP, B_FALSE);
1767 		}
1768 	}
1769 	ndp_delete(nce);
1770 }
1771 
1772 /*
1773  * Handle a discovered conflict: some other system is advertising that it owns
1774  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1775  * interface.
1776  */
1777 static void
1778 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1779 {
1780 	ipif_t *ipif;
1781 	uint32_t now;
1782 	uint_t maxdefense;
1783 	uint_t defs;
1784 
1785 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1786 	    NULL, NULL);
1787 	if (ipif == NULL)
1788 		return;
1789 	/*
1790 	 * First, figure out if this address is disposable.
1791 	 */
1792 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1793 		maxdefense = ip_max_temp_defend;
1794 	else
1795 		maxdefense = ip_max_defend;
1796 
1797 	/*
1798 	 * Now figure out how many times we've defended ourselves.  Ignore
1799 	 * defenses that happened long in the past.
1800 	 */
1801 	now = gethrestime_sec();
1802 	mutex_enter(&nce->nce_lock);
1803 	if ((defs = nce->nce_defense_count) > 0 &&
1804 	    now - nce->nce_defense_time > ip_defend_interval) {
1805 		nce->nce_defense_count = defs = 0;
1806 	}
1807 	nce->nce_defense_count++;
1808 	nce->nce_defense_time = now;
1809 	mutex_exit(&nce->nce_lock);
1810 	ipif_refrele(ipif);
1811 
1812 	/*
1813 	 * If we've defended ourselves too many times already, then give up and
1814 	 * tear down the interface(s) using this address.  Otherwise, defend by
1815 	 * sending out an unsolicited Neighbor Advertisement.
1816 	 */
1817 	if (defs >= maxdefense) {
1818 		ip_ndp_failure(ill, mp, dl_mp, nce);
1819 	} else {
1820 		char hbuf[MAC_STR_LEN];
1821 		char sbuf[INET6_ADDRSTRLEN];
1822 		uchar_t *haddr;
1823 
1824 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1825 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1826 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1827 		    hbuf, sbuf, ill->ill_name);
1828 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1829 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1830 		    nce_advert_flags(nce));
1831 	}
1832 }
1833 
1834 static void
1835 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1836 {
1837 	nd_neighbor_solicit_t *ns;
1838 	uint32_t	hlen = ill->ill_nd_lla_len;
1839 	uchar_t		*haddr = NULL;
1840 	icmp6_t		*icmp_nd;
1841 	ip6_t		*ip6h;
1842 	nce_t		*our_nce = NULL;
1843 	in6_addr_t	target;
1844 	in6_addr_t	src;
1845 	int		len;
1846 	int		flag = 0;
1847 	nd_opt_hdr_t	*opt = NULL;
1848 	boolean_t	bad_solicit = B_FALSE;
1849 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1850 
1851 	ip6h = (ip6_t *)mp->b_rptr;
1852 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1853 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1854 	src = ip6h->ip6_src;
1855 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1856 	target = ns->nd_ns_target;
1857 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1858 		if (ip_debug > 2) {
1859 			/* ip1dbg */
1860 			pr_addr_dbg("ndp_input_solicit: Target is"
1861 			    " multicast! %s\n", AF_INET6, &target);
1862 		}
1863 		bad_solicit = B_TRUE;
1864 		goto done;
1865 	}
1866 	if (len > sizeof (nd_neighbor_solicit_t)) {
1867 		/* Options present */
1868 		opt = (nd_opt_hdr_t *)&ns[1];
1869 		len -= sizeof (nd_neighbor_solicit_t);
1870 		if (!ndp_verify_optlen(opt, len)) {
1871 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1872 			bad_solicit = B_TRUE;
1873 			goto done;
1874 		}
1875 	}
1876 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1877 		/* Check to see if this is a valid DAD solicitation */
1878 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1879 			if (ip_debug > 2) {
1880 				/* ip1dbg */
1881 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1882 				    "Destination is not solicited node "
1883 				    "multicast %s\n", AF_INET6,
1884 				    &ip6h->ip6_dst);
1885 			}
1886 			bad_solicit = B_TRUE;
1887 			goto done;
1888 		}
1889 	}
1890 
1891 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1892 	/*
1893 	 * If this is a valid Solicitation, a permanent
1894 	 * entry should exist in the cache
1895 	 */
1896 	if (our_nce == NULL ||
1897 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1898 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1899 		    "ifname=%s ", ill->ill_name));
1900 		if (ip_debug > 2) {
1901 			/* ip1dbg */
1902 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1903 		}
1904 		bad_solicit = B_TRUE;
1905 		goto done;
1906 	}
1907 
1908 	/* At this point we should have a verified NS per spec */
1909 	if (opt != NULL) {
1910 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1911 		if (opt != NULL) {
1912 			haddr = (uchar_t *)&opt[1];
1913 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1914 			    hlen == 0) {
1915 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1916 				bad_solicit = B_TRUE;
1917 				goto done;
1918 			}
1919 		}
1920 	}
1921 
1922 	/* If sending directly to peer, set the unicast flag */
1923 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1924 		flag |= NDP_UNICAST;
1925 
1926 	/*
1927 	 * Create/update the entry for the soliciting node.
1928 	 * or respond to outstanding queries, don't if
1929 	 * the source is unspecified address.
1930 	 */
1931 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1932 		int	err;
1933 		nce_t	*nnce;
1934 
1935 		ASSERT(ill->ill_isv6);
1936 		/*
1937 		 * Regular solicitations *must* include the Source Link-Layer
1938 		 * Address option.  Ignore messages that do not.
1939 		 */
1940 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1941 			ip1dbg(("ndp_input_solicit: source link-layer address "
1942 			    "option missing with a specified source.\n"));
1943 			bad_solicit = B_TRUE;
1944 			goto done;
1945 		}
1946 
1947 		/*
1948 		 * This is a regular solicitation.  If we're still in the
1949 		 * process of verifying the address, then don't respond at all
1950 		 * and don't keep track of the sender.
1951 		 */
1952 		if (our_nce->nce_state == ND_PROBE)
1953 			goto done;
1954 
1955 		/*
1956 		 * If the solicitation doesn't have sender hardware address
1957 		 * (legal for unicast solicitation), then process without
1958 		 * installing the return NCE.  Either we already know it, or
1959 		 * we'll be forced to look it up when (and if) we reply to the
1960 		 * packet.
1961 		 */
1962 		if (haddr == NULL)
1963 			goto no_source;
1964 
1965 		err = ndp_lookup_then_add(ill,
1966 		    haddr,
1967 		    &src,	/* Soliciting nodes address */
1968 		    &ipv6_all_ones,
1969 		    &ipv6_all_zeros,
1970 		    0,
1971 		    0,
1972 		    ND_STALE,
1973 		    &nnce,
1974 		    NULL,
1975 		    NULL);
1976 		switch (err) {
1977 		case 0:
1978 			/* done with this entry */
1979 			NCE_REFRELE(nnce);
1980 			break;
1981 		case EEXIST:
1982 			/*
1983 			 * B_FALSE indicates this is not an
1984 			 * an advertisement.
1985 			 */
1986 			ndp_process(nnce, haddr, 0, B_FALSE);
1987 			NCE_REFRELE(nnce);
1988 			break;
1989 		default:
1990 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1991 			    err));
1992 			goto done;
1993 		}
1994 no_source:
1995 		flag |= NDP_SOLICITED;
1996 	} else {
1997 		/*
1998 		 * No source link layer address option should be present in a
1999 		 * valid DAD request.
2000 		 */
2001 		if (haddr != NULL) {
2002 			ip1dbg(("ndp_input_solicit: source link-layer address "
2003 			    "option present with an unspecified source.\n"));
2004 			bad_solicit = B_TRUE;
2005 			goto done;
2006 		}
2007 		if (our_nce->nce_state == ND_PROBE) {
2008 			/*
2009 			 * Internally looped-back probes won't have DLPI
2010 			 * attached to them.  External ones (which are sent by
2011 			 * multicast) always will.  Just ignore our own
2012 			 * transmissions.
2013 			 */
2014 			if (dl_mp != NULL) {
2015 				/*
2016 				 * If someone else is probing our address, then
2017 				 * we've crossed wires.  Declare failure.
2018 				 */
2019 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
2020 			}
2021 			goto done;
2022 		}
2023 		/*
2024 		 * This is a DAD probe.  Multicast the advertisement to the
2025 		 * all-nodes address.
2026 		 */
2027 		src = ipv6_all_hosts_mcast;
2028 	}
2029 	flag |= nce_advert_flags(our_nce);
2030 	/* Response to a solicitation */
2031 	(void) nce_xmit(ill,
2032 	    ND_NEIGHBOR_ADVERT,
2033 	    ill,	/* ill to be used for extracting ill_nd_lla */
2034 	    B_TRUE,	/* use ill_nd_lla */
2035 	    &target,	/* Source and target of the advertisement pkt */
2036 	    &src,	/* IP Destination (source of original pkt) */
2037 	    flag);
2038 done:
2039 	if (bad_solicit)
2040 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2041 	if (our_nce != NULL)
2042 		NCE_REFRELE(our_nce);
2043 }
2044 
2045 void
2046 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2047 {
2048 	nd_neighbor_advert_t *na;
2049 	uint32_t	hlen = ill->ill_nd_lla_len;
2050 	uchar_t		*haddr = NULL;
2051 	icmp6_t		*icmp_nd;
2052 	ip6_t		*ip6h;
2053 	nce_t		*dst_nce = NULL;
2054 	in6_addr_t	target;
2055 	nd_opt_hdr_t	*opt = NULL;
2056 	int		len;
2057 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2058 
2059 	ip6h = (ip6_t *)mp->b_rptr;
2060 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2061 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2062 	na = (nd_neighbor_advert_t *)icmp_nd;
2063 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2064 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2065 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2066 		    "solicited flag is not zero\n"));
2067 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2068 		return;
2069 	}
2070 	target = na->nd_na_target;
2071 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2072 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2073 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2074 		return;
2075 	}
2076 	if (len > sizeof (nd_neighbor_advert_t)) {
2077 		opt = (nd_opt_hdr_t *)&na[1];
2078 		if (!ndp_verify_optlen(opt,
2079 		    len - sizeof (nd_neighbor_advert_t))) {
2080 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2081 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2082 			return;
2083 		}
2084 		/* At this point we have a verified NA per spec */
2085 		len -= sizeof (nd_neighbor_advert_t);
2086 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2087 		if (opt != NULL) {
2088 			haddr = (uchar_t *)&opt[1];
2089 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2090 			    hlen == 0) {
2091 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2092 				BUMP_MIB(mib,
2093 				    ipv6IfIcmpInBadNeighborAdvertisements);
2094 				return;
2095 			}
2096 		}
2097 	}
2098 
2099 	/*
2100 	 * If this interface is part of the group look at all the
2101 	 * ills in the group.
2102 	 */
2103 	rw_enter(&ill_g_lock, RW_READER);
2104 	if (ill->ill_group != NULL)
2105 		ill = ill->ill_group->illgrp_ill;
2106 
2107 	for (; ill != NULL; ill = ill->ill_group_next) {
2108 		mutex_enter(&ill->ill_lock);
2109 		if (!ILL_CAN_LOOKUP(ill)) {
2110 			mutex_exit(&ill->ill_lock);
2111 			continue;
2112 		}
2113 		ill_refhold_locked(ill);
2114 		mutex_exit(&ill->ill_lock);
2115 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2116 		/* We have to drop the lock since ndp_process calls put* */
2117 		rw_exit(&ill_g_lock);
2118 		if (dst_nce != NULL) {
2119 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2120 			    dst_nce->nce_state == ND_PROBE) {
2121 				/*
2122 				 * Someone else sent an advertisement for an
2123 				 * address that we're trying to configure.
2124 				 * Tear it down.  Note that dl_mp might be NULL
2125 				 * if we're getting a unicast reply.  This
2126 				 * isn't typically done (multicast is the norm
2127 				 * in response to a probe), but ip_ndp_failure
2128 				 * will handle the dl_mp == NULL case as well.
2129 				 */
2130 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2131 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2132 				/*
2133 				 * Someone just announced one of our local
2134 				 * addresses.  If it wasn't us, then this is a
2135 				 * conflict.  Defend the address or shut it
2136 				 * down.
2137 				 */
2138 				if (dl_mp != NULL &&
2139 				    (haddr == NULL ||
2140 				    nce_cmp_ll_addr(dst_nce, haddr,
2141 				    ill->ill_nd_lla_len))) {
2142 					ip_ndp_conflict(ill, mp, dl_mp,
2143 					    dst_nce);
2144 				}
2145 			} else {
2146 				if (na->nd_na_flags_reserved &
2147 				    ND_NA_FLAG_ROUTER) {
2148 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2149 				}
2150 				/* B_TRUE indicates this an advertisement */
2151 				ndp_process(dst_nce, haddr,
2152 				    na->nd_na_flags_reserved, B_TRUE);
2153 			}
2154 			NCE_REFRELE(dst_nce);
2155 		}
2156 		rw_enter(&ill_g_lock, RW_READER);
2157 		ill_refrele(ill);
2158 	}
2159 	rw_exit(&ill_g_lock);
2160 }
2161 
2162 /*
2163  * Process NDP neighbor solicitation/advertisement messages.
2164  * The checksum has already checked o.k before reaching here.
2165  */
2166 void
2167 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2168 {
2169 	icmp6_t		*icmp_nd;
2170 	ip6_t		*ip6h;
2171 	int		len;
2172 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2173 
2174 
2175 	if (!pullupmsg(mp, -1)) {
2176 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2177 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2178 		goto done;
2179 	}
2180 	ip6h = (ip6_t *)mp->b_rptr;
2181 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2182 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2183 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2184 		goto done;
2185 	}
2186 	/*
2187 	 * NDP does not accept any extension headers between the
2188 	 * IP header and the ICMP header since e.g. a routing
2189 	 * header could be dangerous.
2190 	 * This assumes that any AH or ESP headers are removed
2191 	 * by ip prior to passing the packet to ndp_input.
2192 	 */
2193 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2194 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2195 		    ip6h->ip6_nxt));
2196 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2197 		goto done;
2198 	}
2199 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2200 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2201 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2202 	if (icmp_nd->icmp6_code != 0) {
2203 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2204 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2205 		goto done;
2206 	}
2207 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2208 	/*
2209 	 * Make sure packet length is large enough for either
2210 	 * a NS or a NA icmp packet.
2211 	 */
2212 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2213 		ip1dbg(("ndp_input: packet too short\n"));
2214 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2215 		goto done;
2216 	}
2217 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2218 		ndp_input_solicit(ill, mp, dl_mp);
2219 	} else {
2220 		ndp_input_advert(ill, mp, dl_mp);
2221 	}
2222 done:
2223 	freemsg(mp);
2224 }
2225 
2226 /*
2227  * nce_xmit is called to form and transmit a ND solicitation or
2228  * advertisement ICMP packet.
2229  *
2230  * If the source address is unspecified and this isn't a probe (used for
2231  * duplicate address detection), an appropriate source address and link layer
2232  * address will be chosen here.  The link layer address option is included if
2233  * the source is specified (i.e., all non-probe packets), and omitted (per the
2234  * specification) otherwise.
2235  *
2236  * It returns B_FALSE only if it does a successful put() to the
2237  * corresponding ill's ill_wq otherwise returns B_TRUE.
2238  */
2239 static boolean_t
2240 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2241     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2242     int flag)
2243 {
2244 	uint32_t	len;
2245 	icmp6_t 	*icmp6;
2246 	mblk_t		*mp;
2247 	ip6_t		*ip6h;
2248 	nd_opt_hdr_t	*opt;
2249 	uint_t		plen;
2250 	ip6i_t		*ip6i;
2251 	ipif_t		*src_ipif = NULL;
2252 	uint8_t		*hw_addr;
2253 
2254 	/*
2255 	 * If we have a unspecified source(sender) address, select a
2256 	 * proper source address for the solicitation here itself so
2257 	 * that we can initialize the h/w address correctly. This is
2258 	 * needed for interface groups as source address can come from
2259 	 * the whole group and the h/w address initialized from ill will
2260 	 * be wrong if the source address comes from a different ill.
2261 	 *
2262 	 * Note that the NA never comes here with the unspecified source
2263 	 * address. The following asserts that whenever the source
2264 	 * address is specified, the haddr also should be specified.
2265 	 */
2266 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2267 
2268 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2269 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2270 		/*
2271 		 * Pick a source address for this solicitation, but
2272 		 * restrict the selection to addresses assigned to the
2273 		 * output interface (or interface group).  We do this
2274 		 * because the destination will create a neighbor cache
2275 		 * entry for the source address of this packet, so the
2276 		 * source address had better be a valid neighbor.
2277 		 */
2278 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2279 		    IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
2280 		if (src_ipif == NULL) {
2281 			char buf[INET6_ADDRSTRLEN];
2282 
2283 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2284 			    inet_ntop(AF_INET6, (char *)target, buf,
2285 			    sizeof (buf))));
2286 			return (B_TRUE);
2287 		}
2288 		sender = &src_ipif->ipif_v6src_addr;
2289 		hwaddr_ill = src_ipif->ipif_ill;
2290 	}
2291 
2292 	/*
2293 	 * Always make sure that the NS/NA packets don't get load
2294 	 * spread. This is needed so that the probe packets sent
2295 	 * by the in.mpathd daemon can really go out on the desired
2296 	 * interface. Probe packets are made to go out on a desired
2297 	 * interface by including a ip6i with ATTACH_IF flag. As these
2298 	 * packets indirectly end up sending/receiving NS/NA packets
2299 	 * (neighbor doing NUD), we have to make sure that NA
2300 	 * also go out on the same interface.
2301 	 */
2302 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2303 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2304 	    plen * 8;
2305 	mp = allocb(len,  BPRI_LO);
2306 	if (mp == NULL) {
2307 		if (src_ipif != NULL)
2308 			ipif_refrele(src_ipif);
2309 		return (B_TRUE);
2310 	}
2311 	bzero((char *)mp->b_rptr, len);
2312 	mp->b_wptr = mp->b_rptr + len;
2313 
2314 	ip6i = (ip6i_t *)mp->b_rptr;
2315 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2316 	ip6i->ip6i_nxt = IPPROTO_RAW;
2317 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2318 	if (flag & NDP_PROBE)
2319 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2320 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2321 
2322 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2323 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2324 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2325 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2326 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2327 	ip6h->ip6_dst = *target;
2328 	icmp6 = (icmp6_t *)&ip6h[1];
2329 
2330 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2331 	    sizeof (nd_neighbor_advert_t));
2332 
2333 	if (operation == ND_NEIGHBOR_SOLICIT) {
2334 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2335 
2336 		if (!(flag & NDP_PROBE))
2337 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2338 		ip6h->ip6_src = *sender;
2339 		ns->nd_ns_target = *target;
2340 		if (!(flag & NDP_UNICAST)) {
2341 			/* Form multicast address of the target */
2342 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2343 			ip6h->ip6_dst.s6_addr32[3] |=
2344 			    ns->nd_ns_target.s6_addr32[3];
2345 		}
2346 	} else {
2347 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2348 
2349 		ASSERT(!(flag & NDP_PROBE));
2350 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2351 		ip6h->ip6_src = *sender;
2352 		na->nd_na_target = *sender;
2353 		if (flag & NDP_ISROUTER)
2354 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2355 		if (flag & NDP_SOLICITED)
2356 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2357 		if (flag & NDP_ORIDE)
2358 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2359 	}
2360 
2361 	hw_addr = NULL;
2362 	if (!(flag & NDP_PROBE)) {
2363 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2364 		    hwaddr_ill->ill_phys_addr;
2365 		if (hw_addr != NULL) {
2366 			/* Fill in link layer address and option len */
2367 			opt->nd_opt_len = (uint8_t)plen;
2368 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2369 		}
2370 	}
2371 	if (hw_addr == NULL) {
2372 		/* If there's no link layer address option, then strip it. */
2373 		len -= plen * 8;
2374 		mp->b_wptr = mp->b_rptr + len;
2375 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2376 	}
2377 
2378 	icmp6->icmp6_type = (uint8_t)operation;
2379 	icmp6->icmp6_code = 0;
2380 	/*
2381 	 * Prepare for checksum by putting icmp length in the icmp
2382 	 * checksum field. The checksum is calculated in ip_wput_v6.
2383 	 */
2384 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2385 
2386 	if (src_ipif != NULL)
2387 		ipif_refrele(src_ipif);
2388 	if (canput(ill->ill_wq)) {
2389 		put(ill->ill_wq, mp);
2390 		return (B_FALSE);
2391 	}
2392 	freemsg(mp);
2393 	return (B_TRUE);
2394 }
2395 
2396 /*
2397  * Make a link layer address (does not include the SAP) from an nce.
2398  * To form the link layer address, use the last four bytes of ipv6
2399  * address passed in and the fixed offset stored in nce.
2400  */
2401 static void
2402 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2403 {
2404 	uchar_t *mask, *to;
2405 	ill_t	*ill = nce->nce_ill;
2406 	int 	len;
2407 
2408 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2409 		return;
2410 	ASSERT(nce->nce_res_mp != NULL);
2411 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2412 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2413 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2414 	ASSERT(addr != NULL);
2415 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2416 	    addrpos, ill->ill_nd_lla_len);
2417 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2418 	    IPV6_ADDR_LEN);
2419 	mask = (uchar_t *)&nce->nce_extract_mask;
2420 	mask += (IPV6_ADDR_LEN - len);
2421 	addr += (IPV6_ADDR_LEN - len);
2422 	to = addrpos + nce->nce_ll_extract_start;
2423 	while (len-- > 0)
2424 		*to++ |= *mask++ & *addr++;
2425 }
2426 
2427 /*
2428  * Pass a cache report back out via NDD.
2429  */
2430 /* ARGSUSED */
2431 int
2432 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
2433 {
2434 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
2435 			"     proto addr/mask");
2436 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp);
2437 	return (0);
2438 }
2439 
2440 /*
2441  * Add a single line to the NDP Cache Entry Report.
2442  */
2443 static void
2444 nce_report1(nce_t *nce, uchar_t *mp_arg)
2445 {
2446 	ill_t		*ill = nce->nce_ill;
2447 	char		local_buf[INET6_ADDRSTRLEN];
2448 	uchar_t		flags_buf[10];
2449 	uint32_t	flags = nce->nce_flags;
2450 	mblk_t		*mp = (mblk_t *)mp_arg;
2451 	uchar_t		*h;
2452 	uchar_t		*m = flags_buf;
2453 	in6_addr_t	v6addr;
2454 
2455 	/*
2456 	 * Lock the nce to protect nce_res_mp from being changed
2457 	 * if an external resolver address resolution completes
2458 	 * while nce_res_mp is being accessed here.
2459 	 *
2460 	 * Deal with all address formats, not just Ethernet-specific
2461 	 * In addition, make sure that the mblk has enough space
2462 	 * before writing to it. If is doesn't, allocate a new one.
2463 	 */
2464 	if (nce->nce_ipversion == IPV4_VERSION)
2465 		/* Don't include v4 nce_ts in NDP cache entry report */
2466 		return;
2467 
2468 	ASSERT(ill != NULL);
2469 	v6addr = nce->nce_mask;
2470 	if (flags & NCE_F_PERMANENT)
2471 		*m++ = 'P';
2472 	if (flags & NCE_F_ISROUTER)
2473 		*m++ = 'R';
2474 	if (flags & NCE_F_MAPPING)
2475 		*m++ = 'M';
2476 	*m = '\0';
2477 
2478 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
2479 		size_t		addrlen;
2480 		char		*addr_buf;
2481 		dl_unitdata_req_t	*dl;
2482 
2483 		mutex_enter(&nce->nce_lock);
2484 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2485 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
2486 		if (ill->ill_flags & ILLF_XRESOLV)
2487 			addrlen = (3 * (dl->dl_dest_addr_length));
2488 		else
2489 			addrlen = (3 * (ill->ill_nd_lla_len));
2490 		if (addrlen <= 0) {
2491 			mutex_exit(&nce->nce_lock);
2492 			(void) mi_mpprintf(mp,
2493 			    "%8s %9s %5s %s/%d",
2494 			    ill->ill_name,
2495 			    "None",
2496 			    (uchar_t *)&flags_buf,
2497 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2498 				(char *)local_buf, sizeof (local_buf)),
2499 				ip_mask_to_plen_v6(&v6addr));
2500 		} else {
2501 			/*
2502 			 * Convert the hardware/lla address to ascii
2503 			 */
2504 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
2505 			if (addr_buf == NULL) {
2506 				mutex_exit(&nce->nce_lock);
2507 				return;
2508 			}
2509 			(void) mac_colon_addr((uint8_t *)h,
2510 			    (ill->ill_flags & ILLF_XRESOLV) ?
2511 			    dl->dl_dest_addr_length : ill->ill_nd_lla_len,
2512 			    addr_buf, addrlen);
2513 			mutex_exit(&nce->nce_lock);
2514 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
2515 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
2516 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2517 				(char *)local_buf, sizeof (local_buf)),
2518 				ip_mask_to_plen_v6(&v6addr));
2519 			kmem_free(addr_buf, addrlen);
2520 		}
2521 	} else {
2522 		(void) mi_mpprintf(mp,
2523 		    "%8s %9s %5s %s/%d",
2524 		    ill->ill_name,
2525 		    "None",
2526 		    (uchar_t *)&flags_buf,
2527 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2528 			(char *)local_buf, sizeof (local_buf)),
2529 			ip_mask_to_plen_v6(&v6addr));
2530 	}
2531 }
2532 
2533 mblk_t *
2534 nce_udreq_alloc(ill_t *ill)
2535 {
2536 	mblk_t	*template_mp = NULL;
2537 	dl_unitdata_req_t *dlur;
2538 	int	sap_length;
2539 
2540 	ASSERT(ill->ill_isv6);
2541 
2542 	sap_length = ill->ill_sap_length;
2543 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2544 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2545 	if (template_mp == NULL)
2546 		return (NULL);
2547 
2548 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2549 	dlur->dl_priority.dl_min = 0;
2550 	dlur->dl_priority.dl_max = 0;
2551 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2552 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2553 
2554 	/* Copy in the SAP value. */
2555 	NCE_LL_SAP_COPY(ill, template_mp);
2556 
2557 	return (template_mp);
2558 }
2559 
/*
 * NDP retransmit timer.
 * This timer goes off when:
 * a. It is time to retransmit NS for resolver.
 * b. It is time to send reachability probes.
 *
 * arg is the nce_t whose timer fired.  The function takes ill_g_lock (as
 * reader) and nce_lock, takes a reference on the nce, clears
 * nce_timeout_id and then acts on nce_state.  Every path through the
 * switch below drops both locks and the reference before returning.
 */
void
ndp_timer(void *arg)
{
	nce_t		*nce = arg;
	ill_t		*ill = nce->nce_ill;
	uint32_t	ms;
	char		addrbuf[INET6_ADDRSTRLEN];
	mblk_t		*mp;
	boolean_t	dropped = B_FALSE;

	/*
	 * The timer has to be cancelled by ndp_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the nce
	 */
	/*
	 * NOTE(review): nce has already been dereferenced by the initializer
	 * of `ill' above, so this ASSERT cannot catch a NULL argument.
	 */
	ASSERT(nce != NULL);

	/*
	 * Grab the ill_g_lock now itself to avoid lock order problems.
	 * nce_solicit needs ill_g_lock to be able to traverse ills
	 */
	rw_enter(&ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	NCE_REFHOLD_LOCKED(nce);
	nce->nce_timeout_id = 0;

	/*
	 * Check the reachability state first.
	 */
	switch (nce->nce_state) {
	case ND_DELAY:
		/*
		 * Move on to PROBE, send a unicast solicitation and rearm
		 * the timer for the next retransmission.
		 */
		rw_exit(&ill_g_lock);
		nce->nce_state = ND_PROBE;
		mutex_exit(&nce->nce_lock);
		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("ndp_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &nce->nce_addr);
		}
		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
		NCE_REFRELE(nce);
		return;
	case ND_PROBE:
		/* must be retransmit timer */
		rw_exit(&ill_g_lock);
		nce->nce_pcnt--;
		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
		    nce->nce_pcnt >= -1);
		if (nce->nce_pcnt > 0) {
			/*
			 * As per RFC2461, the nce gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
			    addrbuf, sizeof (addrbuf))));
			mutex_exit(&nce->nce_lock);
			/*
			 * Entries for our own (permanent) addresses are sent
			 * as probes; all others as unicast solicitations.
			 */
			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
			    NDP_UNICAST);
			/*
			 * nce_xmit returns B_TRUE when the packet was not
			 * handed to the driver; don't count that attempt.
			 */
			if (dropped) {
				mutex_enter(&nce->nce_lock);
				nce->nce_pcnt++;
				mutex_exit(&nce->nce_lock);
			}
			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
		} else if (nce->nce_pcnt < 0) {
			/* No hope, delete the nce */
			nce->nce_state = ND_UNREACHABLE;
			mutex_exit(&nce->nce_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("ndp_timer: Delete IRE for"
				    " dst %s\n", AF_INET6, &nce->nce_addr);
			}
			ndp_delete(nce);
		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
			/* Wait RetransTimer, before deleting the entry */
			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
			    nce->nce_pcnt, inet_ntop(AF_INET6,
			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
			mutex_exit(&nce->nce_lock);
			/* Wait one interval before killing */
			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable.  Let IP know that it's ok to
			 * use.
			 */
			nce->nce_state = ND_REACHABLE;
			mutex_exit(&nce->nce_lock);
			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
			    ALL_ZONES, NULL, NULL, NULL, NULL);
			if (ipif != NULL) {
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ + 10];
					char sbuf[INET6_ADDRSTRLEN];

					ipif->ipif_was_dup = B_FALSE;
					(void) strlcpy(ibuf, ill->ill_name,
					    sizeof (ibuf));
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					/*
					 * For a non-zero ipif_id, append
					 * ":<id>" to the interface name.
					 */
					if (ipif->ipif_id != 0) {
						(void) snprintf(ibuf +
						    ill->ill_name_length - 1,
						    sizeof (ibuf) -
						    ill->ill_name_length + 1,
						    ":%d", ipif->ipif_id);
					}
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				/*
				 * Send the routing socket and SCTP
				 * notifications only the first time the
				 * address becomes ready on an up ipif.
				 */
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready) {
					ip_rts_ifmsg(ipif);
					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
					sctp_update_ipif(ipif, SCTP_IPIF_UP);
				}
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			/* Begin defending our new address */
			nce->nce_unsolicit_count = 0;
			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
			    nce_advert_flags(nce));
			/*
			 * If the advertisement could not be sent, schedule
			 * one more after the unsolicited interval; otherwise
			 * start the periodic defense timer, if enabled.
			 */
			if (dropped) {
				nce->nce_unsolicit_count = 1;
				NDP_RESTART_TIMER(nce,
				    ip_ndp_unsolicit_interval);
			} else if (ip_ndp_defense_interval != 0) {
				NDP_RESTART_TIMER(nce, ip_ndp_defense_interval);
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down.  Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			nce->nce_state = ND_REACHABLE;
			mutex_exit(&nce->nce_lock);
		}
		NCE_REFRELE(nce);
		return;
	case ND_INCOMPLETE:
		/*
		 * Must be resolvers retransmit timer.
		 */
		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
			ip6i_t	*ip6i;
			ip6_t	*ip6h;
			mblk_t *data_mp;

			/*
			 * Walk the list of packets queued, and see if there
			 * are any multipathing probe packets. Such packets
			 * are always queued at the head. Since this is a
			 * retransmit timer firing, mark such packets as
			 * delayed in ND resolution. This info will be used
			 * in ip_wput_v6(). Multipathing probe packets will
			 * always have an ip6i_t. Once we hit a packet without
			 * it, we can break out of this loop.
			 */
			if (mp->b_datap->db_type == M_CTL)
				data_mp = mp->b_cont;
			else
				data_mp = mp;

			ip6h = (ip6_t *)data_mp->b_rptr;
			if (ip6h->ip6_nxt != IPPROTO_RAW)
				break;

			/*
			 * This message should have been pulled up already in
			 * ip_wput_v6. We can't do pullups here because the
			 * b_next/b_prev is non-NULL.
			 */
			ip6i = (ip6i_t *)ip6h;
			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
			    sizeof (ip6i_t) + IPV6_HDR_LEN);

			/* Mark this packet as delayed due to ND resolution */
			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
		}
		if (nce->nce_qd_mp != NULL) {
			/*
			 * Packets are still waiting on resolution.  A zero
			 * return from nce_solicit with the entry not yet
			 * reachable is treated as resolution failure;
			 * otherwise the returned value is used as the next
			 * timer interval.
			 */
			ms = nce_solicit(nce, NULL);
			rw_exit(&ill_g_lock);
			if (ms == 0) {
				if (nce->nce_state != ND_REACHABLE) {
					mutex_exit(&nce->nce_lock);
					nce_resolv_failed(nce);
					ndp_delete(nce);
				} else {
					mutex_exit(&nce->nce_lock);
				}
			} else {
				mutex_exit(&nce->nce_lock);
				NDP_RESTART_TIMER(nce, (clock_t)ms);
			}
			NCE_REFRELE(nce);
			return;
		}
		/* Nothing is queued, so there is nothing to resolve for. */
		mutex_exit(&nce->nce_lock);
		rw_exit(&ill_g_lock);
		NCE_REFRELE(nce);
		break;
	case ND_REACHABLE :
		rw_exit(&ill_g_lock);
		/*
		 * Send another advertisement if unsolicited advertisements
		 * are still owed for this entry, or if this is one of our
		 * own addresses and periodic defense is enabled.
		 */
		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
		    nce->nce_unsolicit_count != 0) ||
		    ((nce->nce_flags & NCE_F_PERMANENT) &&
		    ip_ndp_defense_interval != 0)) {
			if (nce->nce_unsolicit_count > 0)
				nce->nce_unsolicit_count--;
			mutex_exit(&nce->nce_lock);
			dropped = nce_xmit(ill,
			    ND_NEIGHBOR_ADVERT,
			    ill,	/* ill to be used for hw addr */
			    B_FALSE,	/* use ill_phys_addr */
			    &nce->nce_addr,
			    &ipv6_all_hosts_mcast,
			    nce_advert_flags(nce));
			/* A dropped advertisement is not counted as sent. */
			if (dropped) {
				mutex_enter(&nce->nce_lock);
				nce->nce_unsolicit_count++;
				mutex_exit(&nce->nce_lock);
			}
			if (nce->nce_unsolicit_count != 0) {
				NDP_RESTART_TIMER(nce,
				    ip_ndp_unsolicit_interval);
			} else {
				NDP_RESTART_TIMER(nce,
				    ip_ndp_defense_interval);
			}
		} else {
			mutex_exit(&nce->nce_lock);
		}
		NCE_REFRELE(nce);
		break;
	default:
		/* No timer work for any other state. */
		rw_exit(&ill_g_lock);
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	}
}
2825 
2826 /*
2827  * Set a link layer address from the ll_addr passed in.
2828  * Copy SAP from ill.
2829  */
2830 static void
2831 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2832 {
2833 	ill_t	*ill = nce->nce_ill;
2834 	uchar_t	*woffset;
2835 
2836 	ASSERT(ll_addr != NULL);
2837 	/* Always called before fast_path_probe */
2838 	ASSERT(nce->nce_fp_mp == NULL);
2839 	if (ill->ill_sap_length != 0) {
2840 		/*
2841 		 * Copy the SAP type specified in the
2842 		 * request into the xmit template.
2843 		 */
2844 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2845 	}
2846 	if (ill->ill_phys_addr_length > 0) {
2847 		/*
2848 		 * The bcopy() below used to be called for the physical address
2849 		 * length rather than the link layer address length. For
2850 		 * ethernet and many other media, the phys_addr and lla are
2851 		 * identical.
2852 		 * However, with xresolv interfaces being introduced, the
2853 		 * phys_addr and lla are no longer the same, and the physical
2854 		 * address may not have any useful meaning, so we use the lla
2855 		 * for IPv6 address resolution and destination addressing.
2856 		 *
2857 		 * For PPP or other interfaces with a zero length
2858 		 * physical address, don't do anything here.
2859 		 * The bcopy() with a zero phys_addr length was previously
2860 		 * a no-op for interfaces with a zero-length physical address.
2861 		 * Using the lla for them would change the way they operate.
2862 		 * Doing nothing in such cases preserves expected behavior.
2863 		 */
2864 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2865 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2866 	}
2867 }
2868 
2869 static boolean_t
2870 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2871 {
2872 	ill_t	*ill = nce->nce_ill;
2873 	uchar_t	*ll_offset;
2874 
2875 	ASSERT(nce->nce_res_mp != NULL);
2876 	if (ll_addr == NULL)
2877 		return (B_FALSE);
2878 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2879 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2880 		return (B_TRUE);
2881 	return (B_FALSE);
2882 }
2883 
2884 /*
2885  * Updates the link layer address or the reachability state of
2886  * a cache entry.  Reset probe counter if needed.
2887  */
2888 static void
2889 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2890 {
2891 	ill_t	*ill = nce->nce_ill;
2892 	boolean_t need_stop_timer = B_FALSE;
2893 	boolean_t need_fastpath_update = B_FALSE;
2894 
2895 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2896 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2897 	/*
2898 	 * If this interface does not do NUD, there is no point
2899 	 * in allowing an update to the cache entry.  Although
2900 	 * we will respond to NS.
2901 	 * The only time we accept an update for a resolver when
2902 	 * NUD is turned off is when it has just been created.
2903 	 * Non-Resolvers will always be created as REACHABLE.
2904 	 */
2905 	if (new_state != ND_UNCHANGED) {
2906 		if ((nce->nce_flags & NCE_F_NONUD) &&
2907 		    (nce->nce_state != ND_INCOMPLETE))
2908 			return;
2909 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2910 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2911 		need_stop_timer = B_TRUE;
2912 		if (new_state == ND_REACHABLE)
2913 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2914 		else {
2915 			/* We force NUD in this case */
2916 			nce->nce_last = 0;
2917 		}
2918 		nce->nce_state = new_state;
2919 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2920 	}
2921 	/*
2922 	 * In case of fast path we need to free the the fastpath
2923 	 * M_DATA and do another probe.  Otherwise we can just
2924 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2925 	 * whatever packets that happens to be transmitting at the time.
2926 	 */
2927 	if (new_ll_addr != NULL) {
2928 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2929 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2930 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2931 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2932 		if (nce->nce_fp_mp != NULL) {
2933 			freemsg(nce->nce_fp_mp);
2934 			nce->nce_fp_mp = NULL;
2935 		}
2936 		need_fastpath_update = B_TRUE;
2937 	}
2938 	mutex_exit(&nce->nce_lock);
2939 	if (need_stop_timer) {
2940 		(void) untimeout(nce->nce_timeout_id);
2941 		nce->nce_timeout_id = 0;
2942 	}
2943 	if (need_fastpath_update)
2944 		nce_fastpath(nce);
2945 	mutex_enter(&nce->nce_lock);
2946 }
2947 
2948 void
2949 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2950 {
2951 	uint_t	count = 0;
2952 	mblk_t  **mpp;
2953 
2954 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2955 
2956 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2957 	    mpp = &(*mpp)->b_next) {
2958 		if (++count >
2959 		    nce->nce_ill->ill_max_buf) {
2960 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2961 
2962 			nce->nce_qd_mp->b_next = NULL;
2963 			nce->nce_qd_mp->b_prev = NULL;
2964 			freemsg(nce->nce_qd_mp);
2965 			nce->nce_qd_mp = tmp;
2966 		}
2967 	}
2968 	/* put this on the list */
2969 	if (head_insert) {
2970 		mp->b_next = nce->nce_qd_mp;
2971 		nce->nce_qd_mp = mp;
2972 	} else {
2973 		*mpp = mp;
2974 	}
2975 }
2976 
2977 static void
2978 nce_queue_mp(nce_t *nce, mblk_t *mp)
2979 {
2980 	boolean_t head_insert = B_FALSE;
2981 	ip6_t	*ip6h;
2982 	ip6i_t	*ip6i;
2983 	mblk_t *data_mp;
2984 
2985 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2986 
2987 	if (mp->b_datap->db_type == M_CTL)
2988 		data_mp = mp->b_cont;
2989 	else
2990 		data_mp = mp;
2991 	ip6h = (ip6_t *)data_mp->b_rptr;
2992 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2993 		/*
2994 		 * This message should have been pulled up already in
2995 		 * ip_wput_v6. We can't do pullups here because the message
2996 		 * could be from the nce_qd_mp which could have b_next/b_prev
2997 		 * non-NULL.
2998 		 */
2999 		ip6i = (ip6i_t *)ip6h;
3000 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
3001 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3002 		/*
3003 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
3004 		 * This has 2 aspects mentioned below.
3005 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
3006 		 * This ensures that next retransmit of ND solicitation
3007 		 * will use the interface specified by the probe packet,
3008 		 * for both NS and NA. This corresponds to the src address
3009 		 * in the IPv6 packet. If we insert at tail, we will be
3010 		 * depending on the packet at the head for successful
3011 		 * ND resolution. This is not reliable, because the interface
3012 		 * on which the NA arrives could be different from the interface
3013 		 * on which the NS was sent, and if the receiving interface is
3014 		 * failed, it will appear that the sending interface is also
3015 		 * failed, causing in.mpathd to misdiagnose this as link
3016 		 * failure.
3017 		 * 2. Drop the original packet, if the ND resolution did not
3018 		 * succeed in the first attempt. However we will create the
3019 		 * nce and the ire, as soon as the ND resolution succeeds.
3020 		 * We don't gain anything by queueing multiple probe packets
3021 		 * and sending them back-to-back once resolution succeeds.
3022 		 * It is sufficient to send just 1 packet after ND resolution
3023 		 * succeeds. Since mpathd is sending down probe packets at a
3024 		 * constant rate, we don't need to send the queued packet. We
3025 		 * need to queue it only for NDP resolution. The benefit of
3026 		 * dropping the probe packets that were delayed in ND
3027 		 * resolution, is that in.mpathd will not see inflated
3028 		 * RTT. If the ND resolution does not succeed within
3029 		 * in.mpathd's failure detection time, mpathd may detect
3030 		 * a failure, and it does not matter whether the packet
3031 		 * was queued or dropped.
3032 		 */
3033 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
3034 			head_insert = B_TRUE;
3035 	}
3036 
3037 	nce_queue_mp_common(nce, mp, head_insert);
3038 }
3039 
3040 /*
3041  * Called when address resolution failed due to a timeout.
3042  * Send an ICMP unreachable in response to all queued packets.
3043  */
3044 void
3045 nce_resolv_failed(nce_t *nce)
3046 {
3047 	mblk_t	*mp, *nxt_mp, *first_mp;
3048 	char	buf[INET6_ADDRSTRLEN];
3049 	ip6_t *ip6h;
3050 	zoneid_t zoneid = GLOBAL_ZONEID;
3051 
3052 	ip1dbg(("nce_resolv_failed: dst %s\n",
3053 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3054 	mutex_enter(&nce->nce_lock);
3055 	mp = nce->nce_qd_mp;
3056 	nce->nce_qd_mp = NULL;
3057 	mutex_exit(&nce->nce_lock);
3058 	while (mp != NULL) {
3059 		nxt_mp = mp->b_next;
3060 		mp->b_next = NULL;
3061 		mp->b_prev = NULL;
3062 
3063 		first_mp = mp;
3064 		if (mp->b_datap->db_type == M_CTL) {
3065 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3066 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3067 			zoneid = io->ipsec_out_zoneid;
3068 			ASSERT(zoneid != ALL_ZONES);
3069 			mp = mp->b_cont;
3070 		}
3071 
3072 		ip6h = (ip6_t *)mp->b_rptr;
3073 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3074 			ip6i_t *ip6i;
3075 			/*
3076 			 * This message should have been pulled up already
3077 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3078 			 * the header is pulled up.
3079 			 */
3080 			ip6i = (ip6i_t *)ip6h;
3081 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3082 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3083 			mp->b_rptr += sizeof (ip6i_t);
3084 		}
3085 		/*
3086 		 * Ignore failure since icmp_unreachable_v6 will silently
3087 		 * drop packets with an unspecified source address.
3088 		 */
3089 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
3090 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3091 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid);
3092 		mp = nxt_mp;
3093 	}
3094 }
3095 
3096 /*
3097  * Called by SIOCSNDP* ioctl to add/change an nce entry
3098  * and the corresponding attributes.
3099  * Disallow states other than ND_REACHABLE or ND_STALE.
3100  */
3101 int
3102 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3103 {
3104 	sin6_t		*sin6;
3105 	in6_addr_t	*addr;
3106 	nce_t		*nce;
3107 	int		err;
3108 	uint16_t	new_flags = 0;
3109 	uint16_t	old_flags = 0;
3110 	int		inflags = lnr->lnr_flags;
3111 
3112 	ASSERT(ill->ill_isv6);
3113 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3114 	    (lnr->lnr_state_create != ND_STALE))
3115 		return (EINVAL);
3116 
3117 	sin6 = (sin6_t *)&lnr->lnr_addr;
3118 	addr = &sin6->sin6_addr;
3119 
3120 	mutex_enter(&ndp6.ndp_g_lock);
3121 	/* We know it can not be mapping so just look in the hash table */
3122 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr));
3123 	nce = nce_lookup_addr(ill, addr, nce);
3124 	if (nce != NULL)
3125 		new_flags = nce->nce_flags;
3126 
3127 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3128 	case NDF_ISROUTER_ON:
3129 		new_flags |= NCE_F_ISROUTER;
3130 		break;
3131 	case NDF_ISROUTER_OFF:
3132 		new_flags &= ~NCE_F_ISROUTER;
3133 		break;
3134 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3135 		mutex_exit(&ndp6.ndp_g_lock);
3136 		if (nce != NULL)
3137 			NCE_REFRELE(nce);
3138 		return (EINVAL);
3139 	}
3140 
3141 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3142 	case NDF_ANYCAST_ON:
3143 		new_flags |= NCE_F_ANYCAST;
3144 		break;
3145 	case NDF_ANYCAST_OFF:
3146 		new_flags &= ~NCE_F_ANYCAST;
3147 		break;
3148 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3149 		mutex_exit(&ndp6.ndp_g_lock);
3150 		if (nce != NULL)
3151 			NCE_REFRELE(nce);
3152 		return (EINVAL);
3153 	}
3154 
3155 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
3156 	case NDF_PROXY_ON:
3157 		new_flags |= NCE_F_PROXY;
3158 		break;
3159 	case NDF_PROXY_OFF:
3160 		new_flags &= ~NCE_F_PROXY;
3161 		break;
3162 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
3163 		mutex_exit(&ndp6.ndp_g_lock);
3164 		if (nce != NULL)
3165 			NCE_REFRELE(nce);
3166 		return (EINVAL);
3167 	}
3168 
3169 	if (nce == NULL) {
3170 		err = ndp_add(ill,
3171 		    (uchar_t *)lnr->lnr_hdw_addr,
3172 		    addr,
3173 		    &ipv6_all_ones,
3174 		    &ipv6_all_zeros,
3175 		    0,
3176 		    new_flags,
3177 		    lnr->lnr_state_create,
3178 		    &nce,
3179 		    NULL,
3180 		    NULL);
3181 		if (err != 0) {
3182 			mutex_exit(&ndp6.ndp_g_lock);
3183 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3184 			return (err);
3185 		}
3186 	}
3187 	old_flags = nce->nce_flags;
3188 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3189 		/*
3190 		 * Router turned to host, delete all ires.
3191 		 * XXX Just delete the entry, but we need to add too.
3192 		 */
3193 		nce->nce_flags &= ~NCE_F_ISROUTER;
3194 		mutex_exit(&ndp6.ndp_g_lock);
3195 		ndp_delete(nce);
3196 		NCE_REFRELE(nce);
3197 		return (0);
3198 	}
3199 	mutex_exit(&ndp6.ndp_g_lock);
3200 
3201 	mutex_enter(&nce->nce_lock);
3202 	nce->nce_flags = new_flags;
3203 	mutex_exit(&nce->nce_lock);
3204 	/*
3205 	 * Note that we ignore the state at this point, which
3206 	 * should be either STALE or REACHABLE.  Instead we let
3207 	 * the link layer address passed in to determine the state
3208 	 * much like incoming packets.
3209 	 */
3210 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3211 	NCE_REFRELE(nce);
3212 	return (0);
3213 }
3214 
3215 /*
3216  * If the device driver supports it, we make nce_fp_mp to have
3217  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3218  * The caller insures there is hold on nce for this function.
3219  * Note that since ill_fastpath_probe() copies the mblk there is
3220  * no need for the hold beyond this function.
3221  */
3222 void
3223 nce_fastpath(nce_t *nce)
3224 {
3225 	ill_t	*ill = nce->nce_ill;
3226 	int res;
3227 
3228 	ASSERT(ill != NULL);
3229 	if (nce->nce_fp_mp != NULL) {
3230 		/* Already contains fastpath info */
3231 		return;
3232 	}
3233 	if (nce->nce_res_mp != NULL) {
3234 		nce_fastpath_list_add(nce);
3235 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3236 		/*
3237 		 * EAGAIN is an indication of a transient error
3238 		 * i.e. allocation failure etc. leave the nce in the list it
3239 		 * will be updated when another probe happens for another ire
3240 		 * if not it will be taken out of the list when the ire is
3241 		 * deleted.
3242 		 */
3243 
3244 		if (res != 0 && res != EAGAIN)
3245 			nce_fastpath_list_delete(nce);
3246 	}
3247 }
3248 
3249 /*
3250  * Drain the list of nce's waiting for fastpath response.
3251  */
3252 void
3253 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3254     void *arg)
3255 {
3256 
3257 	nce_t *next_nce;
3258 	nce_t *current_nce;
3259 	nce_t *first_nce;
3260 	nce_t *prev_nce = NULL;
3261 
3262 	mutex_enter(&ill->ill_lock);
3263 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3264 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3265 		next_nce = current_nce->nce_fastpath;
3266 		/*
3267 		 * Take it off the list if we're flushing, or if the callback
3268 		 * routine tells us to do so.  Otherwise, leave the nce in the
3269 		 * fastpath list to handle any pending response from the lower
3270 		 * layer.  We can't drain the list when the callback routine
3271 		 * comparison failed, because the response is asynchronous in
3272 		 * nature, and may not arrive in the same order as the list
3273 		 * insertion.
3274 		 */
3275 		if (func == NULL || func(current_nce, arg)) {
3276 			current_nce->nce_fastpath = NULL;
3277 			if (current_nce == first_nce)
3278 				ill->ill_fastpath_list = first_nce = next_nce;
3279 			else
3280 				prev_nce->nce_fastpath = next_nce;
3281 		} else {
3282 			/* previous element that is still in the list */
3283 			prev_nce = current_nce;
3284 		}
3285 		current_nce = next_nce;
3286 	}
3287 	mutex_exit(&ill->ill_lock);
3288 }
3289 
3290 /*
3291  * Add nce to the nce fastpath list.
3292  */
3293 void
3294 nce_fastpath_list_add(nce_t *nce)
3295 {
3296 	ill_t *ill;
3297 
3298 	ill = nce->nce_ill;
3299 
3300 	mutex_enter(&ill->ill_lock);
3301 	mutex_enter(&nce->nce_lock);
3302 
3303 	/*
3304 	 * if nce has not been deleted and
3305 	 * is not already in the list add it.
3306 	 */
3307 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3308 	    (nce->nce_fastpath == NULL)) {
3309 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3310 		ill->ill_fastpath_list = nce;
3311 	}
3312 
3313 	mutex_exit(&nce->nce_lock);
3314 	mutex_exit(&ill->ill_lock);
3315 }
3316 
3317 /*
3318  * remove nce from the nce fastpath list.
3319  */
3320 void
3321 nce_fastpath_list_delete(nce_t *nce)
3322 {
3323 	nce_t *nce_ptr;
3324 
3325 	ill_t *ill;
3326 
3327 	ill = nce->nce_ill;
3328 	ASSERT(ill != NULL);
3329 
3330 	mutex_enter(&ill->ill_lock);
3331 	if (nce->nce_fastpath == NULL)
3332 		goto done;
3333 
3334 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3335 
3336 	if (ill->ill_fastpath_list == nce) {
3337 		ill->ill_fastpath_list = nce->nce_fastpath;
3338 	} else {
3339 		nce_ptr = ill->ill_fastpath_list;
3340 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3341 			if (nce_ptr->nce_fastpath == nce) {
3342 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3343 				break;
3344 			}
3345 			nce_ptr = nce_ptr->nce_fastpath;
3346 		}
3347 	}
3348 
3349 	nce->nce_fastpath = NULL;
3350 done:
3351 	mutex_exit(&ill->ill_lock);
3352 }
3353 
3354 /*
3355  * Update all NCE's that are not in fastpath mode and
3356  * have an nce_fp_mp that matches mp. mp->b_cont contains
3357  * the fastpath header.
3358  *
3359  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3360  */
3361 boolean_t
3362 ndp_fastpath_update(nce_t *nce, void *arg)
3363 {
3364 	mblk_t 	*mp, *fp_mp;
3365 	uchar_t	*mp_rptr, *ud_mp_rptr;
3366 	mblk_t	*ud_mp = nce->nce_res_mp;
3367 	ptrdiff_t	cmplen;
3368 
3369 	if (nce->nce_flags & NCE_F_MAPPING)
3370 		return (B_TRUE);
3371 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3372 		return (B_TRUE);
3373 
3374 	ip2dbg(("ndp_fastpath_update: trying\n"));
3375 	mp = (mblk_t *)arg;
3376 	mp_rptr = mp->b_rptr;
3377 	cmplen = mp->b_wptr - mp_rptr;
3378 	ASSERT(cmplen >= 0);
3379 	ud_mp_rptr = ud_mp->b_rptr;
3380 	/*
3381 	 * The nce is locked here to prevent any other threads
3382 	 * from accessing and changing nce_res_mp when the IPv6 address
3383 	 * becomes resolved to an lla while we're in the middle
3384 	 * of looking at and comparing the hardware address (lla).
3385 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3386 	 * from examining nce_res_mp atthe same time.
3387 	 */
3388 	mutex_enter(&nce->nce_lock);
3389 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3390 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3391 		mutex_exit(&nce->nce_lock);
3392 		/*
3393 		 * Don't take the ire off the fastpath list yet,
3394 		 * since the response may come later.
3395 		 */
3396 		return (B_FALSE);
3397 	}
3398 	/* Matched - install mp as the fastpath mp */
3399 	ip1dbg(("ndp_fastpath_update: match\n"));
3400 	fp_mp = dupb(mp->b_cont);
3401 	if (fp_mp != NULL) {
3402 		nce->nce_fp_mp = fp_mp;
3403 	}
3404 	mutex_exit(&nce->nce_lock);
3405 	return (B_TRUE);
3406 }
3407 
3408 /*
3409  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3410  * driver.  Note that it assumes IP is exclusive...
3411  */
3412 /* ARGSUSED */
3413 void
3414 ndp_fastpath_flush(nce_t *nce, char *arg)
3415 {
3416 	if (nce->nce_flags & NCE_F_MAPPING)
3417 		return;
3418 	/* No fastpath info? */
3419 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3420 		return;
3421 
3422 	if (nce->nce_ipversion == IPV4_VERSION &&
3423 	    nce->nce_flags & NCE_F_BCAST) {
3424 		/*
3425 		 * IPv4 BROADCAST entries:
3426 		 * We can't delete the nce since it is difficult to
3427 		 * recreate these without going through the
3428 		 * ipif down/up dance.
3429 		 *
3430 		 * All access to nce->nce_fp_mp in the case of these
3431 		 * is protected by nce_lock.
3432 		 */
3433 		mutex_enter(&nce->nce_lock);
3434 		if (nce->nce_fp_mp != NULL) {
3435 			freeb(nce->nce_fp_mp);
3436 			nce->nce_fp_mp = NULL;
3437 			mutex_exit(&nce->nce_lock);
3438 			nce_fastpath(nce);
3439 		} else {
3440 			mutex_exit(&nce->nce_lock);
3441 		}
3442 	} else {
3443 		/* Just delete the NCE... */
3444 		ndp_delete(nce);
3445 	}
3446 }
3447 
3448 /*
3449  * Return a pointer to a given option in the packet.
3450  * Assumes that option part of the packet have already been validated.
3451  */
3452 nd_opt_hdr_t *
3453 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3454 {
3455 	while (optlen > 0) {
3456 		if (opt->nd_opt_type == opt_type)
3457 			return (opt);
3458 		optlen -= 8 * opt->nd_opt_len;
3459 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3460 	}
3461 	return (NULL);
3462 }
3463 
3464 /*
3465  * Verify all option lengths present are > 0, also check to see
3466  * if the option lengths and packet length are consistent.
3467  */
3468 boolean_t
3469 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3470 {
3471 	ASSERT(opt != NULL);
3472 	while (optlen > 0) {
3473 		if (opt->nd_opt_len == 0)
3474 			return (B_FALSE);
3475 		optlen -= 8 * opt->nd_opt_len;
3476 		if (optlen < 0)
3477 			return (B_FALSE);
3478 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3479 	}
3480 	return (B_TRUE);
3481 }
3482 
3483 /*
3484  * ndp_walk function.
3485  * Free a fraction of the NCE cache entries.
3486  * A fraction of zero means to not free any in that category.
3487  */
3488 void
3489 ndp_cache_reclaim(nce_t *nce, char *arg)
3490 {
3491 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3492 	uint_t	rand;
3493 
3494 	if (nce->nce_flags & NCE_F_PERMANENT)
3495 		return;
3496 
3497 	rand = (uint_t)lbolt +
3498 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3499 	if (ncr->ncr_host != 0 &&
3500 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3501 		ndp_delete(nce);
3502 		return;
3503 	}
3504 }
3505 
3506 /*
3507  * ndp_walk function.
3508  * Count the number of NCEs that can be deleted.
3509  * These would be hosts but not routers.
3510  */
3511 void
3512 ndp_cache_count(nce_t *nce, char *arg)
3513 {
3514 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3515 
3516 	if (nce->nce_flags & NCE_F_PERMANENT)
3517 		return;
3518 
3519 	ncc->ncc_total++;
3520 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3521 		ncc->ncc_host++;
3522 }
3523 
3524 #ifdef NCE_DEBUG
3525 th_trace_t *
3526 th_trace_nce_lookup(nce_t *nce)
3527 {
3528 	int bucket_id;
3529 	th_trace_t *th_trace;
3530 
3531 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3532 
3533 	bucket_id = IP_TR_HASH(curthread);
3534 	ASSERT(bucket_id < IP_TR_HASH_MAX);
3535 
3536 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
3537 	    th_trace = th_trace->th_next) {
3538 		if (th_trace->th_id == curthread)
3539 			return (th_trace);
3540 	}
3541 	return (NULL);
3542 }
3543 
3544 void
3545 nce_trace_ref(nce_t *nce)
3546 {
3547 	int bucket_id;
3548 	th_trace_t *th_trace;
3549 
3550 	/*
3551 	 * Attempt to locate the trace buffer for the curthread.
3552 	 * If it does not exist, then allocate a new trace buffer
3553 	 * and link it in list of trace bufs for this ipif, at the head
3554 	 */
3555 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3556 
3557 	if (nce->nce_trace_disable == B_TRUE)
3558 		return;
3559 
3560 	th_trace = th_trace_nce_lookup(nce);
3561 	if (th_trace == NULL) {
3562 		bucket_id = IP_TR_HASH(curthread);
3563 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
3564 		    KM_NOSLEEP);
3565 		if (th_trace == NULL) {
3566 			nce->nce_trace_disable = B_TRUE;
3567 			nce_trace_inactive(nce);
3568 			return;
3569 		}
3570 		th_trace->th_id = curthread;
3571 		th_trace->th_next = nce->nce_trace[bucket_id];
3572 		th_trace->th_prev = &nce->nce_trace[bucket_id];
3573 		if (th_trace->th_next != NULL)
3574 			th_trace->th_next->th_prev = &th_trace->th_next;
3575 		nce->nce_trace[bucket_id] = th_trace;
3576 	}
3577 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
3578 	th_trace->th_refcnt++;
3579 	th_trace_rrecord(th_trace);
3580 }
3581 
3582 void
3583 nce_untrace_ref(nce_t *nce)
3584 {
3585 	th_trace_t *th_trace;
3586 
3587 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3588 
3589 	if (nce->nce_trace_disable == B_TRUE)
3590 		return;
3591 
3592 	th_trace = th_trace_nce_lookup(nce);
3593 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
3594 
3595 	th_trace_rrecord(th_trace);
3596 	th_trace->th_refcnt--;
3597 }
3598 
3599 void
3600 nce_trace_inactive(nce_t *nce)
3601 {
3602 	th_trace_t *th_trace;
3603 	int i;
3604 
3605 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3606 
3607 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
3608 		while (nce->nce_trace[i] != NULL) {
3609 			th_trace = nce->nce_trace[i];
3610 
3611 			/* unlink th_trace and free it */
3612 			nce->nce_trace[i] = th_trace->th_next;
3613 			if (th_trace->th_next != NULL)
3614 				th_trace->th_next->th_prev =
3615 				    &nce->nce_trace[i];
3616 
3617 			th_trace->th_next = NULL;
3618 			th_trace->th_prev = NULL;
3619 			kmem_free(th_trace, sizeof (th_trace_t));
3620 		}
3621 	}
3622 
3623 }
3624 
3625 /* ARGSUSED */
3626 int
3627 nce_thread_exit(nce_t *nce, caddr_t arg)
3628 {
3629 	th_trace_t	*th_trace;
3630 
3631 	mutex_enter(&nce->nce_lock);
3632 	th_trace = th_trace_nce_lookup(nce);
3633 
3634 	if (th_trace == NULL) {
3635 		mutex_exit(&nce->nce_lock);
3636 		return (0);
3637 	}
3638 
3639 	ASSERT(th_trace->th_refcnt == 0);
3640 
3641 	/* unlink th_trace and free it */
3642 	*th_trace->th_prev = th_trace->th_next;
3643 	if (th_trace->th_next != NULL)
3644 		th_trace->th_next->th_prev = th_trace->th_prev;
3645 	th_trace->th_next = NULL;
3646 	th_trace->th_prev = NULL;
3647 	kmem_free(th_trace, sizeof (th_trace_t));
3648 	mutex_exit(&nce->nce_lock);
3649 	return (0);
3650 }
3651 #endif
3652 
3653 /*
3654  * Called when address resolution fails due to a timeout.
3655  * Send an ICMP unreachable in response to all queued packets.
3656  */
3657 void
3658 arp_resolv_failed(nce_t *nce)
3659 {
3660 	mblk_t	*mp, *nxt_mp, *first_mp;
3661 	char	buf[INET6_ADDRSTRLEN];
3662 	zoneid_t zoneid = GLOBAL_ZONEID;
3663 	struct in_addr ipv4addr;
3664 
3665 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3666 	ip3dbg(("arp_resolv_failed: dst %s\n",
3667 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3668 	mutex_enter(&nce->nce_lock);
3669 	mp = nce->nce_qd_mp;
3670 	nce->nce_qd_mp = NULL;
3671 	mutex_exit(&nce->nce_lock);
3672 
3673 	while (mp != NULL) {
3674 		nxt_mp = mp->b_next;
3675 		mp->b_next = NULL;
3676 		mp->b_prev = NULL;
3677 
3678 		first_mp = mp;
3679 		/*
3680 		 * Send icmp unreachable messages
3681 		 * to the hosts.
3682 		 */
3683 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid);
3684 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3685 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3686 		    ICMP_HOST_UNREACHABLE, zoneid);
3687 		mp = nxt_mp;
3688 	}
3689 }
3690 
3691 static int
3692 ndp_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3693     const in_addr_t *mask, const in_addr_t *extract_mask,
3694     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3695     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3696 {
3697 	int	err = 0;
3698 	nce_t	*nce;
3699 	in6_addr_t addr6;
3700 
3701 	mutex_enter(&ndp4.ndp_g_lock);
3702 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr));
3703 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3704 	nce = nce_lookup_addr(ill, &addr6, nce);
3705 	if (nce == NULL) {
3706 		err = ndp_add_v4(ill,
3707 		    hw_addr,
3708 		    addr,
3709 		    mask,
3710 		    extract_mask,
3711 		    hw_extract_start,
3712 		    flags,
3713 		    state,
3714 		    newnce,
3715 		    fp_mp,
3716 		    res_mp);
3717 	} else {
3718 		*newnce = nce;
3719 		err = EEXIST;
3720 	}
3721 	mutex_exit(&ndp4.ndp_g_lock);
3722 	return (err);
3723 }
3724 
3725 /*
3726  * NDP Cache Entry creation routine for IPv4.
3727  * Mapped entries are handled in arp.
3728  * This routine must always be called with ndp4.ndp_g_lock held.
3729  * Prior to return, nce_refcnt is incremented.
3730  */
3731 static int
3732 ndp_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3733     const in_addr_t *mask, const in_addr_t *extract_mask,
3734     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3735     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3736 {
3737 	static	nce_t		nce_nil;
3738 	nce_t		*nce;
3739 	mblk_t		*mp;
3740 	mblk_t		*template;
3741 	nce_t		**ncep;
3742 
3743 	ASSERT(MUTEX_HELD(&ndp4.ndp_g_lock));
3744 	ASSERT(ill != NULL);
3745 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
3746 		return (EINVAL);
3747 	}
3748 	ASSERT((flags & NCE_F_MAPPING) == 0);
3749 	ASSERT(extract_mask == NULL);
3750 	/*
3751 	 * Allocate the mblk to hold the nce.
3752 	 */
3753 	mp = allocb(sizeof (nce_t), BPRI_MED);
3754 	if (mp == NULL)
3755 		return (ENOMEM);
3756 
3757 	nce = (nce_t *)mp->b_rptr;
3758 	mp->b_wptr = (uchar_t *)&nce[1];
3759 	*nce = nce_nil;
3760 
3761 	/*
3762 	 * This one holds link layer address; if res_mp has been provided
3763 	 * by the caller, accept it without any further checks. Otherwise,
3764 	 * for V4, we fill it up with ill_resolver_mp here, then in
3765 	 * in ire_arpresolve(), we fill it up with the ARP query
3766 	 * once its formulated.
3767 	 */
3768 	if (res_mp != NULL) {
3769 		template = res_mp;
3770 	} else  {
3771 		if (ill->ill_resolver_mp == NULL) {
3772 			freeb(mp);
3773 			return (EINVAL);
3774 		}
3775 		template = copyb(ill->ill_resolver_mp);
3776 	}
3777 	if (template == NULL) {
3778 		freeb(mp);
3779 		return (ENOMEM);
3780 	}
3781 	nce->nce_ill = ill;
3782 	nce->nce_ipversion = IPV4_VERSION;
3783 	nce->nce_flags = flags;
3784 	nce->nce_state = state;
3785 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3786 	nce->nce_rcnt = ill->ill_xmit_count;
3787 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3788 	if (*mask == IP_HOST_MASK) {
3789 		nce->nce_mask = ipv6_all_ones;
3790 	} else  {
3791 		IN6_IPADDR_TO_V4MAPPED(*mask, &nce->nce_mask);
3792 	}
3793 	nce->nce_extract_mask = ipv6_all_zeros;
3794 	nce->nce_ll_extract_start = hw_extract_start;
3795 	nce->nce_fp_mp = (fp_mp? fp_mp : NULL);
3796 	nce->nce_res_mp = template;
3797 	if (state == ND_REACHABLE)
3798 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3799 	else
3800 		nce->nce_last = 0;
3801 	nce->nce_qd_mp = NULL;
3802 	nce->nce_mp = mp;
3803 	if (hw_addr != NULL)
3804 		nce_set_ll(nce, hw_addr);
3805 	/* This one is for nce getting created */
3806 	nce->nce_refcnt = 1;
3807 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3808 	ncep = ((nce_t **)NCE_HASH_PTR_V4(*addr));
3809 
3810 #ifdef NCE_DEBUG
3811 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
3812 #endif
3813 	/*
3814 	 * Atomically ensure that the ill is not CONDEMNED, before
3815 	 * adding the NCE.
3816 	 */
3817 	mutex_enter(&ill->ill_lock);
3818 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3819 		mutex_exit(&ill->ill_lock);
3820 		freeb(mp);
3821 		if (res_mp == NULL) {
3822 			/*
3823 			 * template was locally allocated. need to free it.
3824 			 */
3825 			freeb(template);
3826 		}
3827 		return (EINVAL);
3828 	}
3829 	if ((nce->nce_next = *ncep) != NULL)
3830 		nce->nce_next->nce_ptpn = &nce->nce_next;
3831 	*ncep = nce;
3832 	nce->nce_ptpn = ncep;
3833 	*newnce = nce;
3834 	/* This one is for nce being used by an active thread */
3835 	NCE_REFHOLD(*newnce);
3836 
3837 	/* Bump up the number of nce's referencing this ill */
3838 	ill->ill_nce_cnt++;
3839 	mutex_exit(&ill->ill_lock);
3840 	return (0);
3841 }
3842 
3843 void
3844 ndp_flush_qd_mp(nce_t *nce)
3845 {
3846 	mblk_t *qd_mp, *qd_next;
3847 
3848 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3849 	qd_mp = nce->nce_qd_mp;
3850 	nce->nce_qd_mp = NULL;
3851 	while (qd_mp != NULL) {
3852 		qd_next = qd_mp->b_next;
3853 		qd_mp->b_next = NULL;
3854 		qd_mp->b_prev = NULL;
3855 		freemsg(qd_mp);
3856 		qd_mp = qd_next;
3857 	}
3858 }
3859 
3860 nce_t *
3861 nce_reinit(nce_t *nce)
3862 {
3863 	nce_t *newnce = NULL;
3864 	in_addr_t nce_addr, nce_mask;
3865 
3866 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3867 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_mask, nce_mask);
3868 	/*
3869 	 * delete the old one. this will get rid of any ire's pointing
3870 	 * at this nce.
3871 	 */
3872 	ndp_delete(nce);
3873 	/*
3874 	 * create a new nce with the same addr and mask.
3875 	 */
3876 	mutex_enter(&ndp4.ndp_g_lock);
3877 	(void) ndp_add_v4(nce->nce_ill, NULL, &nce_addr, &nce_mask, NULL, 0, 0,
3878 	    ND_INITIAL, &newnce, NULL, NULL);
3879 	mutex_exit(&ndp4.ndp_g_lock);
3880 	/*
3881 	 * refrele the old nce.
3882 	 */
3883 	NCE_REFRELE(nce);
3884 	return (newnce);
3885 }
3886 
3887 /*
3888  * ndp_walk routine to delete all entries that have a given destination or
3889  * gateway address and cached link layer (MAC) address.  This is used when ARP
3890  * informs us that a network-to-link-layer mapping may have changed.
3891  */
3892 void
3893 nce_delete_hw_changed(nce_t *nce, void *arg)
3894 {
3895 	nce_hw_map_t *hwm = arg;
3896 	mblk_t *mp;
3897 	dl_unitdata_req_t *dlu;
3898 	uchar_t *macaddr;
3899 	ill_t *ill;
3900 	int saplen;
3901 	ipaddr_t nce_addr;
3902 
3903 	if (nce->nce_state != ND_REACHABLE)
3904 		return;
3905 
3906 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3907 	if (nce_addr != hwm->hwm_addr)
3908 		return;
3909 
3910 	mutex_enter(&nce->nce_lock);
3911 	if ((mp = nce->nce_res_mp) == NULL) {
3912 		mutex_exit(&nce->nce_lock);
3913 		return;
3914 	}
3915 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3916 	macaddr = (uchar_t *)(dlu + 1);
3917 	ill = nce->nce_ill;
3918 	if ((saplen = ill->ill_sap_length) > 0)
3919 		macaddr += saplen;
3920 	else
3921 		saplen = -saplen;
3922 
3923 	/*
3924 	 * If the hardware address is unchanged, then leave this one alone.
3925 	 * Note that saplen == abs(saplen) now.
3926 	 */
3927 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3928 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3929 		mutex_exit(&nce->nce_lock);
3930 		return;
3931 	}
3932 	mutex_exit(&nce->nce_lock);
3933 
3934 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3935 	ndp_delete(nce);
3936 }
3937 
3938 /*
3939  * This function verifies whether a given IPv4 address is potentially known to
3940  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3941  * so that it can continue to look for hardware changes on that address.
3942  */
3943 boolean_t
3944 ndp_lookup_ipaddr(in_addr_t addr)
3945 {
3946 	nce_t		*nce;
3947 	struct in_addr	nceaddr;
3948 
3949 	if (addr == INADDR_ANY)
3950 		return (B_FALSE);
3951 
3952 	mutex_enter(&ndp4.ndp_g_lock);
3953 	nce = *(nce_t **)NCE_HASH_PTR_V4(addr);
3954 	for (; nce != NULL; nce = nce->nce_next) {
3955 		/* Note that only v4 mapped entries are in the table. */
3956 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3957 		if (addr == nceaddr.s_addr &&
3958 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3959 			/* Single flag check; no lock needed */
3960 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3961 				break;
3962 		}
3963 	}
3964 	mutex_exit(&ndp4.ndp_g_lock);
3965 	return (nce != NULL);
3966 }
3967