xref: /titanic_50/usr/src/uts/common/inet/ip/ip_ndp.c (revision 9acbbeaf2a1ffe5c14b244867d427714fab43c5c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ipsec_info.h>
66 #include <inet/sctp_ip.h>
67 
68 /*
69  * Function names with nce_ prefix are static while function
70  * names with ndp_ prefix are used by rest of the IP.
71  *
72  * Lock ordering:
73  *
74  *	ndp_g_lock -> ill_lock -> nce_lock
75  *
76  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
77  * nce_next.  Nce_lock protects the contents of the NCE (particularly
78  * nce_refcnt).
79  */
80 
81 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
82     uint32_t ll_addr_len);
83 static	void	nce_fastpath(nce_t *nce);
84 static	void	nce_ire_delete(nce_t *nce);
85 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
86 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
87 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
88 static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
89 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
90     uchar_t *addr);
91 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
92 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
93 static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
94 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
95 static	void	nce_update(nce_t *nce, uint16_t new_state,
96     uchar_t *new_ll_addr);
97 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
98 static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
99     ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
100     const in6_addr_t *target, int flag);
101 extern void	th_trace_rrecord(th_trace_t *);
102 static	int	ndp_lookup_then_add_v6(ill_t *, uchar_t *,
103     const in6_addr_t *, const in6_addr_t *, const in6_addr_t *,
104     uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
105 static	int	ndp_lookup_then_add_v4(ill_t *, uchar_t *,
106     const in_addr_t *, const in_addr_t *, const in_addr_t *,
107     uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
108 static	int	ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
109     const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
110     nce_t **);
111 static	int	ndp_add_v4(ill_t *, uchar_t *, const in_addr_t *,
112     const in_addr_t *, const in_addr_t *, uint32_t, uint16_t, uint16_t,
113     nce_t **, mblk_t *, mblk_t *);
114 
115 
116 #ifdef NCE_DEBUG
117 void	nce_trace_inactive(nce_t *);
118 #endif
119 
120 ndp_g_t ndp4, ndp6;
121 
122 #define	NCE_HASH_PTR_V4(addr) \
123 	(&(ndp4.nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
124 
125 #define	NCE_HASH_PTR_V6(addr) \
126 	(&(ndp6.nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
127 
128 /*
129  * Compute default flags to use for an advertisement of this nce's address.
130  */
131 static int
132 nce_advert_flags(const nce_t *nce)
133 {
134 	int flag = 0;
135 
136 	if (nce->nce_flags & NCE_F_ISROUTER)
137 		flag |= NDP_ISROUTER;
138 	if (!(nce->nce_flags & NCE_F_PROXY))
139 		flag |= NDP_ORIDE;
140 	return (flag);
141 }
142 
143 int
144 ndp_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
145     const void *mask, const void *extract_mask,
146     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
147     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
148 {
149 	int status;
150 
151 	if (ill->ill_isv6)
152 		status = ndp_add_v6(ill, hw_addr, (in6_addr_t *)addr,
153 		    (in6_addr_t *)mask, (in6_addr_t *)extract_mask,
154 		    hw_extract_start, flags, state, newnce);
155 	else
156 		status = ndp_add_v4(ill, hw_addr, (in_addr_t *)addr,
157 		    (in_addr_t *)mask, (in_addr_t *)extract_mask,
158 		    hw_extract_start, flags, state, newnce, fp_mp, res_mp);
159 	return (status);
160 }
161 
/*
 * Probe interval, chosen from the link's capabilities rather than from a
 * tunable: an ill with ill_note_link set gets the shorter value.  Callers
 * in this file pass the result straight to NDP_RESTART_TIMER().
 */
#define	ILL_PROBE_INTERVAL(ill)	(((ill)->ill_note_link != 0) ? 150 : 1500)
164 
165 /*
166  * NDP Cache Entry creation routine.
167  * Mapped entries will never do NUD .
168  * This routine must always be called with ndp6.ndp_g_lock held.
169  * Prior to return, nce_refcnt is incremented.
170  */
171 static int
172 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
173     const in6_addr_t *mask, const in6_addr_t *extract_mask,
174     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
175     nce_t **newnce)
176 {
177 	static	nce_t		nce_nil;
178 	nce_t		*nce;
179 	mblk_t		*mp;
180 	mblk_t		*template;
181 	nce_t		**ncep;
182 	int		err;
183 	boolean_t	dropped = B_FALSE;
184 
185 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
186 	ASSERT(ill != NULL && ill->ill_isv6);
187 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
188 		ip0dbg(("ndp_add: no addr\n"));
189 		return (EINVAL);
190 	}
191 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
192 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
193 		return (EINVAL);
194 	}
195 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
196 	    (flags & NCE_F_MAPPING)) {
197 		ip0dbg(("ndp_add: extract mask zero for mapping"));
198 		return (EINVAL);
199 	}
200 	/*
201 	 * Allocate the mblk to hold the nce.
202 	 *
203 	 * XXX This can come out of a separate cache - nce_cache.
204 	 * We don't need the mp anymore as there are no more
205 	 * "qwriter"s
206 	 */
207 	mp = allocb(sizeof (nce_t), BPRI_MED);
208 	if (mp == NULL)
209 		return (ENOMEM);
210 
211 	nce = (nce_t *)mp->b_rptr;
212 	mp->b_wptr = (uchar_t *)&nce[1];
213 	*nce = nce_nil;
214 
215 	/*
216 	 * This one holds link layer address
217 	 */
218 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
219 		template = nce_udreq_alloc(ill);
220 	} else {
221 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
222 		ASSERT((ill->ill_resolver_mp != NULL));
223 		template = copyb(ill->ill_resolver_mp);
224 	}
225 	if (template == NULL) {
226 		freeb(mp);
227 		return (ENOMEM);
228 	}
229 	nce->nce_ill = ill;
230 	nce->nce_ipversion = IPV6_VERSION;
231 	nce->nce_flags = flags;
232 	nce->nce_state = state;
233 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
234 	nce->nce_rcnt = ill->ill_xmit_count;
235 	nce->nce_addr = *addr;
236 	nce->nce_mask = *mask;
237 	nce->nce_extract_mask = *extract_mask;
238 	nce->nce_ll_extract_start = hw_extract_start;
239 	nce->nce_fp_mp = NULL;
240 	nce->nce_res_mp = template;
241 	if (state == ND_REACHABLE)
242 		nce->nce_last = TICK_TO_MSEC(lbolt64);
243 	else
244 		nce->nce_last = 0;
245 	nce->nce_qd_mp = NULL;
246 	nce->nce_mp = mp;
247 	if (hw_addr != NULL)
248 		nce_set_ll(nce, hw_addr);
249 	/* This one is for nce getting created */
250 	nce->nce_refcnt = 1;
251 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
252 	if (nce->nce_flags & NCE_F_MAPPING) {
253 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
254 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
255 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
256 		ncep = &ndp6.nce_mask_entries;
257 	} else {
258 		ncep = ((nce_t **)NCE_HASH_PTR_V6(*addr));
259 	}
260 
261 #ifdef NCE_DEBUG
262 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
263 #endif
264 	/*
265 	 * Atomically ensure that the ill is not CONDEMNED, before
266 	 * adding the NCE.
267 	 */
268 	mutex_enter(&ill->ill_lock);
269 	if (ill->ill_state_flags & ILL_CONDEMNED) {
270 		mutex_exit(&ill->ill_lock);
271 		freeb(mp);
272 		freeb(template);
273 		return (EINVAL);
274 	}
275 	if ((nce->nce_next = *ncep) != NULL)
276 		nce->nce_next->nce_ptpn = &nce->nce_next;
277 	*ncep = nce;
278 	nce->nce_ptpn = ncep;
279 	*newnce = nce;
280 	/* This one is for nce being used by an active thread */
281 	NCE_REFHOLD(*newnce);
282 
283 	/* Bump up the number of nce's referencing this ill */
284 	ill->ill_nce_cnt++;
285 	mutex_exit(&ill->ill_lock);
286 
287 	err = 0;
288 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
289 		mutex_enter(&nce->nce_lock);
290 		mutex_exit(&ndp6.ndp_g_lock);
291 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
292 		mutex_exit(&nce->nce_lock);
293 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
294 		    &ipv6_all_zeros, addr, NDP_PROBE);
295 		if (dropped) {
296 			mutex_enter(&nce->nce_lock);
297 			nce->nce_pcnt++;
298 			mutex_exit(&nce->nce_lock);
299 		}
300 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
301 		mutex_enter(&ndp6.ndp_g_lock);
302 		err = EINPROGRESS;
303 	} else if (flags & NCE_F_UNSOL_ADV) {
304 		/*
305 		 * We account for the transmit below by assigning one
306 		 * less than the ndd variable. Subsequent decrements
307 		 * are done in ndp_timer.
308 		 */
309 		mutex_enter(&nce->nce_lock);
310 		mutex_exit(&ndp6.ndp_g_lock);
311 		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
312 		mutex_exit(&nce->nce_lock);
313 		dropped = nce_xmit(ill,
314 		    ND_NEIGHBOR_ADVERT,
315 		    ill,	/* ill to be used for extracting ill_nd_lla */
316 		    B_TRUE,	/* use ill_nd_lla */
317 		    addr,	/* Source and target of the advertisement pkt */
318 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
319 		    nce_advert_flags(nce));
320 		mutex_enter(&nce->nce_lock);
321 		if (dropped)
322 			nce->nce_unsolicit_count++;
323 		if (nce->nce_unsolicit_count != 0) {
324 			nce->nce_timeout_id = timeout(ndp_timer, nce,
325 			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
326 		}
327 		mutex_exit(&nce->nce_lock);
328 		mutex_enter(&ndp6.ndp_g_lock);
329 	}
330 	/*
331 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
332 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
333 	 * We call nce_fastpath from nce_update if the link layer address of
334 	 * the peer changes from nce_update
335 	 */
336 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
337 		nce_fastpath(nce);
338 	return (err);
339 }
340 
341 int
342 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
343     const void *mask, const void *extract_mask,
344     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
345     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
346 {
347 	int status;
348 
349 	if (ill->ill_isv6) {
350 		status = ndp_lookup_then_add_v6(ill, hw_addr,
351 		    (in6_addr_t *)addr, (in6_addr_t *)mask,
352 		    (in6_addr_t *)extract_mask, hw_extract_start, flags,
353 		    state, newnce, fp_mp, res_mp);
354 	} else  {
355 		status = ndp_lookup_then_add_v4(ill, hw_addr,
356 		    (in_addr_t *)addr, (in_addr_t *)mask,
357 		    (in_addr_t *)extract_mask, hw_extract_start, flags,
358 		    state, newnce, fp_mp, res_mp);
359 	}
360 
361 	return (status);
362 }
363 
364 static int
365 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
366     const in6_addr_t *mask, const in6_addr_t *extract_mask,
367     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
368     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
369 {
370 	int	err = 0;
371 	nce_t	*nce;
372 
373 	ASSERT(ill != NULL && ill->ill_isv6);
374 	mutex_enter(&ndp6.ndp_g_lock);
375 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
376 	nce = nce_lookup_addr(ill, addr, nce);
377 	if (nce == NULL) {
378 		err = ndp_add(ill,
379 		    hw_addr,
380 		    addr,
381 		    mask,
382 		    extract_mask,
383 		    hw_extract_start,
384 		    flags,
385 		    state,
386 		    newnce,
387 		    fp_mp,
388 		    res_mp);
389 	} else {
390 		*newnce = nce;
391 		err = EEXIST;
392 	}
393 	mutex_exit(&ndp6.ndp_g_lock);
394 	return (err);
395 }
396 
397 /*
398  * Remove all the CONDEMNED nces from the appropriate hash table.
399  * We create a private list of NCEs, these may have ires pointing
400  * to them, so the list will be passed through to clean up dependent
401  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
402  */
403 static void
404 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
405 {
406 	nce_t *nce1;
407 	nce_t **ptpn;
408 
409 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
410 	ASSERT(ndp->ndp_g_walker == 0);
411 	for (; nce; nce = nce1) {
412 		nce1 = nce->nce_next;
413 		mutex_enter(&nce->nce_lock);
414 		if (nce->nce_flags & NCE_F_CONDEMNED) {
415 			ptpn = nce->nce_ptpn;
416 			nce1 = nce->nce_next;
417 			if (nce1 != NULL)
418 				nce1->nce_ptpn = ptpn;
419 			*ptpn = nce1;
420 			nce->nce_ptpn = NULL;
421 			nce->nce_next = NULL;
422 			nce->nce_next = *free_nce_list;
423 			*free_nce_list = nce;
424 		}
425 		mutex_exit(&nce->nce_lock);
426 	}
427 }
428 
429 /*
430  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
431  *    will return this NCE. Also no new IREs will be created that
432  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
433  *    be started (See NDP_RESTART_TIMER).
434  * 2. Cancel any currently running timeouts.
435  * 3. If there is an ndp walker, return. The walker will do the cleanup.
436  *    This ensures that walkers see a consistent list of NCEs while walking.
437  * 4. Otherwise remove the NCE from the list of NCEs
438  * 5. Delete all IREs pointing to this NCE.
439  */
440 void
441 ndp_delete(nce_t *nce)
442 {
443 	nce_t	**ptpn;
444 	nce_t	*nce1;
445 	int	ipversion = nce->nce_ipversion;
446 	ndp_g_t *ndp = (ipversion == IPV4_VERSION ? &ndp4 : &ndp6);
447 
448 	/* Serialize deletes */
449 	mutex_enter(&nce->nce_lock);
450 	if (nce->nce_flags & NCE_F_CONDEMNED) {
451 		/* Some other thread is doing the delete */
452 		mutex_exit(&nce->nce_lock);
453 		return;
454 	}
455 	/*
456 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
457 	 * refcnt has to be >= 2
458 	 */
459 	ASSERT(nce->nce_refcnt >= 2);
460 	nce->nce_flags |= NCE_F_CONDEMNED;
461 	mutex_exit(&nce->nce_lock);
462 
463 	nce_fastpath_list_delete(nce);
464 
465 	/*
466 	 * Cancel any running timer. Timeout can't be restarted
467 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
468 	 * Passing invalid timeout id is fine.
469 	 */
470 	if (nce->nce_timeout_id != 0) {
471 		(void) untimeout(nce->nce_timeout_id);
472 		nce->nce_timeout_id = 0;
473 	}
474 
475 	mutex_enter(&ndp->ndp_g_lock);
476 	if (nce->nce_ptpn == NULL) {
477 		/*
478 		 * The last ndp walker has already removed this nce from
479 		 * the list after we marked the nce CONDEMNED and before
480 		 * we grabbed the global lock.
481 		 */
482 		mutex_exit(&ndp->ndp_g_lock);
483 		return;
484 	}
485 	if (ndp->ndp_g_walker > 0) {
486 		/*
487 		 * Can't unlink. The walker will clean up
488 		 */
489 		ndp->ndp_g_walker_cleanup = B_TRUE;
490 		mutex_exit(&ndp->ndp_g_lock);
491 		return;
492 	}
493 
494 	/*
495 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
496 	 * the timer since it is marked CONDEMNED.
497 	 */
498 	ptpn = nce->nce_ptpn;
499 	nce1 = nce->nce_next;
500 	if (nce1 != NULL)
501 		nce1->nce_ptpn = ptpn;
502 	*ptpn = nce1;
503 	nce->nce_ptpn = NULL;
504 	nce->nce_next = NULL;
505 	mutex_exit(&ndp->ndp_g_lock);
506 
507 	nce_ire_delete(nce);
508 }
509 
510 void
511 ndp_inactive(nce_t *nce)
512 {
513 	mblk_t		**mpp;
514 	ill_t		*ill;
515 
516 	ASSERT(nce->nce_refcnt == 0);
517 	ASSERT(MUTEX_HELD(&nce->nce_lock));
518 	ASSERT(nce->nce_fastpath == NULL);
519 
520 	/* Free all nce allocated messages */
521 	mpp = &nce->nce_first_mp_to_free;
522 	do {
523 		while (*mpp != NULL) {
524 			mblk_t  *mp;
525 
526 			mp = *mpp;
527 			*mpp = mp->b_next;
528 			mp->b_next = NULL;
529 			mp->b_prev = NULL;
530 			freemsg(mp);
531 		}
532 	} while (mpp++ != &nce->nce_last_mp_to_free);
533 
534 #ifdef NCE_DEBUG
535 	nce_trace_inactive(nce);
536 #endif
537 
538 	ill = nce->nce_ill;
539 	mutex_enter(&ill->ill_lock);
540 	ill->ill_nce_cnt--;
541 	/*
542 	 * If the number of nce's associated with this ill have dropped
543 	 * to zero, check whether we need to restart any operation that
544 	 * is waiting for this to happen.
545 	 */
546 	if (ill->ill_nce_cnt == 0) {
547 		/* ipif_ill_refrele_tail drops the ill_lock */
548 		ipif_ill_refrele_tail(ill);
549 	} else {
550 		mutex_exit(&ill->ill_lock);
551 	}
552 	mutex_destroy(&nce->nce_lock);
553 	freeb(nce->nce_mp);
554 }
555 
556 /*
557  * ndp_walk routine.  Delete the nce if it is associated with the ill
558  * that is going away.  Always called as a writer.
559  */
560 void
561 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
562 {
563 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
564 		ndp_delete(nce);
565 	}
566 }
567 
568 /*
569  * Walk a list of to be inactive NCEs and blow away all the ires.
570  */
571 static void
572 nce_ire_delete_list(nce_t *nce)
573 {
574 	nce_t *nce_next;
575 
576 	ASSERT(nce != NULL);
577 	while (nce != NULL) {
578 		nce_next = nce->nce_next;
579 		nce->nce_next = NULL;
580 
581 		/*
582 		 * It is possible for the last ndp walker (this thread)
583 		 * to come here after ndp_delete has marked the nce CONDEMNED
584 		 * and before it has removed the nce from the fastpath list
585 		 * or called untimeout. So we need to do it here. It is safe
586 		 * for both ndp_delete and this thread to do it twice or
587 		 * even simultaneously since each of the threads has a
588 		 * reference on the nce.
589 		 */
590 		nce_fastpath_list_delete(nce);
591 		/*
592 		 * Cancel any running timer. Timeout can't be restarted
593 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
594 		 * Passing invalid timeout id is fine.
595 		 */
596 		if (nce->nce_timeout_id != 0) {
597 			(void) untimeout(nce->nce_timeout_id);
598 			nce->nce_timeout_id = 0;
599 		}
600 		/*
601 		 * We might hit this func thus in the v4 case:
602 		 * ipif_down->ipif_ndp_down->ndp_walk
603 		 */
604 
605 		if (nce->nce_ipversion == IPV4_VERSION) {
606 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
607 			    IRE_CACHE, nce_ire_delete1,
608 			    (char *)nce, nce->nce_ill);
609 		} else {
610 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
611 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
612 			    IRE_CACHE, nce_ire_delete1,
613 			    (char *)nce, nce->nce_ill);
614 		}
615 		NCE_REFRELE_NOTR(nce);
616 		nce = nce_next;
617 	}
618 }
619 
620 /*
621  * Delete an ire when the nce goes away.
622  */
623 /* ARGSUSED */
624 static void
625 nce_ire_delete(nce_t *nce)
626 {
627 	if (nce->nce_ipversion == IPV6_VERSION) {
628 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
629 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
630 		NCE_REFRELE_NOTR(nce);
631 	} else {
632 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
633 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
634 		NCE_REFRELE_NOTR(nce);
635 	}
636 }
637 
638 /*
639  * ire_walk routine used to delete every IRE that shares this nce
640  */
641 static void
642 nce_ire_delete1(ire_t *ire, char *nce_arg)
643 {
644 	nce_t	*nce = (nce_t *)nce_arg;
645 
646 	ASSERT(ire->ire_type == IRE_CACHE);
647 
648 	if (ire->ire_nce == nce) {
649 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
650 		ire_delete(ire);
651 	}
652 }
653 
654 /*
655  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
656  */
657 boolean_t
658 ndp_restart_dad(nce_t *nce)
659 {
660 	boolean_t started;
661 	boolean_t dropped;
662 
663 	if (nce == NULL)
664 		return (B_FALSE);
665 	mutex_enter(&nce->nce_lock);
666 	if (nce->nce_state == ND_PROBE) {
667 		mutex_exit(&nce->nce_lock);
668 		started = B_TRUE;
669 	} else if (nce->nce_state == ND_REACHABLE) {
670 		nce->nce_state = ND_PROBE;
671 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
672 		mutex_exit(&nce->nce_lock);
673 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
674 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
675 		if (dropped) {
676 			mutex_enter(&nce->nce_lock);
677 			nce->nce_pcnt++;
678 			mutex_exit(&nce->nce_lock);
679 		}
680 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
681 		started = B_TRUE;
682 	} else {
683 		mutex_exit(&nce->nce_lock);
684 		started = B_FALSE;
685 	}
686 	return (started);
687 }
688 
689 /*
690  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
691  * If one is found, the refcnt on the nce will be incremented.
692  */
693 nce_t *
694 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
695 {
696 	nce_t	*nce;
697 
698 	ASSERT(ill != NULL && ill->ill_isv6);
699 	if (!caller_holds_lock) {
700 		mutex_enter(&ndp6.ndp_g_lock);
701 	}
702 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
703 	nce = nce_lookup_addr(ill, addr, nce);
704 	if (nce == NULL)
705 		nce = nce_lookup_mapping(ill, addr);
706 	if (!caller_holds_lock)
707 		mutex_exit(&ndp6.ndp_g_lock);
708 	return (nce);
709 }
710 /*
711  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
712  * If one is found, the refcnt on the nce will be incremented.
713  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
714  * so we skip the nce_lookup_mapping call.
715  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
716  */
717 nce_t *
718 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
719 {
720 	nce_t	*nce;
721 	in6_addr_t addr6;
722 
723 	if (!caller_holds_lock) {
724 		mutex_enter(&ndp4.ndp_g_lock);
725 	}
726 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr)); /* head of v6 hash table */
727 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
728 	nce = nce_lookup_addr(ill, &addr6, nce);
729 	if (!caller_holds_lock)
730 		mutex_exit(&ndp4.ndp_g_lock);
731 	return (nce);
732 }
733 
734 /*
735  * Cache entry lookup.  Try to find an nce matching the parameters passed.
736  * Look only for exact entries (no mappings).  If an nce is found, increment
737  * the hold count on that nce. The caller passes in the start of the
738  * appropriate hash table, and must be holding the appropriate global
739  * lock (ndp_g_lock).
740  */
741 static nce_t *
742 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
743 {
744 	ndp_g_t *ndp = (ill->ill_isv6 ? &ndp6 : &ndp4);
745 
746 	ASSERT(ill != NULL);
747 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
748 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
749 		return (NULL);
750 	for (; nce != NULL; nce = nce->nce_next) {
751 		if (nce->nce_ill == ill) {
752 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
753 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
754 			    &ipv6_all_ones)) {
755 				mutex_enter(&nce->nce_lock);
756 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
757 					NCE_REFHOLD_LOCKED(nce);
758 					mutex_exit(&nce->nce_lock);
759 					break;
760 				}
761 				mutex_exit(&nce->nce_lock);
762 			}
763 		}
764 	}
765 	return (nce);
766 }
767 
768 /*
769  * Cache entry lookup.  Try to find an nce matching the parameters passed.
770  * Look only for mappings.
771  */
772 static nce_t *
773 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
774 {
775 	nce_t	*nce;
776 
777 	ASSERT(ill != NULL && ill->ill_isv6);
778 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
779 	if (!IN6_IS_ADDR_MULTICAST(addr))
780 		return (NULL);
781 	nce = ndp6.nce_mask_entries;
782 	for (; nce != NULL; nce = nce->nce_next)
783 		if (nce->nce_ill == ill &&
784 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
785 			mutex_enter(&nce->nce_lock);
786 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
787 				NCE_REFHOLD_LOCKED(nce);
788 				mutex_exit(&nce->nce_lock);
789 				break;
790 			}
791 			mutex_exit(&nce->nce_lock);
792 		}
793 	return (nce);
794 }
795 
796 /*
797  * Process passed in parameters either from an incoming packet or via
798  * user ioctl.
799  */
800 void
801 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
802 {
803 	ill_t	*ill = nce->nce_ill;
804 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
805 	mblk_t	*mp;
806 	boolean_t ll_updated = B_FALSE;
807 	boolean_t ll_changed;
808 
809 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
810 	/*
811 	 * No updates of link layer address or the neighbor state is
812 	 * allowed, when the cache is in NONUD state.  This still
813 	 * allows for responding to reachability solicitation.
814 	 */
815 	mutex_enter(&nce->nce_lock);
816 	if (nce->nce_state == ND_INCOMPLETE) {
817 		if (hw_addr == NULL) {
818 			mutex_exit(&nce->nce_lock);
819 			return;
820 		}
821 		nce_set_ll(nce, hw_addr);
822 		/*
823 		 * Update nce state and send the queued packets
824 		 * back to ip this time ire will be added.
825 		 */
826 		if (flag & ND_NA_FLAG_SOLICITED) {
827 			nce_update(nce, ND_REACHABLE, NULL);
828 		} else {
829 			nce_update(nce, ND_STALE, NULL);
830 		}
831 		mutex_exit(&nce->nce_lock);
832 		nce_fastpath(nce);
833 		mutex_enter(&nce->nce_lock);
834 		mp = nce->nce_qd_mp;
835 		nce->nce_qd_mp = NULL;
836 		mutex_exit(&nce->nce_lock);
837 		while (mp != NULL) {
838 			mblk_t *nxt_mp;
839 
840 			nxt_mp = mp->b_next;
841 			mp->b_next = NULL;
842 			if (mp->b_prev != NULL) {
843 				ill_t   *inbound_ill;
844 				queue_t *fwdq = NULL;
845 				uint_t ifindex;
846 
847 				ifindex = (uint_t)(uintptr_t)mp->b_prev;
848 				inbound_ill = ill_lookup_on_ifindex(ifindex,
849 				    B_TRUE, NULL, NULL, NULL, NULL);
850 				if (inbound_ill == NULL) {
851 					mp->b_prev = NULL;
852 					freemsg(mp);
853 					return;
854 				} else {
855 					fwdq = inbound_ill->ill_rq;
856 				}
857 				mp->b_prev = NULL;
858 				/*
859 				 * Send a forwarded packet back into ip_rput_v6
860 				 * just as in ire_send_v6().
861 				 * Extract the queue from b_prev (set in
862 				 * ip_rput_data_v6).
863 				 */
864 				if (fwdq != NULL) {
865 					/*
866 					 * Forwarded packets hop count will
867 					 * get decremented in ip_rput_data_v6
868 					 */
869 					put(fwdq, mp);
870 				} else {
871 					/*
872 					 * Send locally originated packets back
873 					 * into * ip_wput_v6.
874 					 */
875 					put(ill->ill_wq, mp);
876 				}
877 				ill_refrele(inbound_ill);
878 			} else {
879 				put(ill->ill_wq, mp);
880 			}
881 			mp = nxt_mp;
882 		}
883 		return;
884 	}
885 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
886 	if (!is_adv) {
887 		/* If this is a SOLICITATION request only */
888 		if (ll_changed)
889 			nce_update(nce, ND_STALE, hw_addr);
890 		mutex_exit(&nce->nce_lock);
891 		return;
892 	}
893 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
894 		/* If in any other state than REACHABLE, ignore */
895 		if (nce->nce_state == ND_REACHABLE) {
896 			nce_update(nce, ND_STALE, NULL);
897 		}
898 		mutex_exit(&nce->nce_lock);
899 		return;
900 	} else {
901 		if (ll_changed) {
902 			nce_update(nce, ND_UNCHANGED, hw_addr);
903 			ll_updated = B_TRUE;
904 		}
905 		if (flag & ND_NA_FLAG_SOLICITED) {
906 			nce_update(nce, ND_REACHABLE, NULL);
907 		} else {
908 			if (ll_updated) {
909 				nce_update(nce, ND_STALE, NULL);
910 			}
911 		}
912 		mutex_exit(&nce->nce_lock);
913 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
914 		    NCE_F_ISROUTER)) {
915 			ire_t *ire;
916 
917 			/*
918 			 * Router turned to host.  We need to remove the
919 			 * entry as well as any default route that may be
920 			 * using this as a next hop.  This is required by
921 			 * section 7.2.5 of RFC 2461.
922 			 */
923 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
924 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
925 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
926 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
927 			    MATCH_IRE_DEFAULT);
928 			if (ire != NULL) {
929 				ip_rts_rtmsg(RTM_DELETE, ire, 0);
930 				ire_delete(ire);
931 				ire_refrele(ire);
932 			}
933 			ndp_delete(nce);
934 		}
935 	}
936 }
937 
938 /*
939  * Pass arg1 to the pfi supplied, along with each nce in existence.
940  * ndp_walk() places a REFHOLD on the nce and drops the lock when
941  * walking the hash list.
942  */
943 void
944 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
945     boolean_t trace)
946 {
947 
948 	nce_t	*nce;
949 	nce_t	*nce1;
950 	nce_t	**ncep;
951 	nce_t	*free_nce_list = NULL;
952 
953 	mutex_enter(&ndp->ndp_g_lock);
954 	/* Prevent ndp_delete from unlink and free of NCE */
955 	ndp->ndp_g_walker++;
956 	mutex_exit(&ndp->ndp_g_lock);
957 	for (ncep = ndp->nce_hash_tbl;
958 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
959 		for (nce = *ncep; nce != NULL; nce = nce1) {
960 			nce1 = nce->nce_next;
961 			if (ill == NULL || nce->nce_ill == ill) {
962 				if (trace) {
963 					NCE_REFHOLD(nce);
964 					(*pfi)(nce, arg1);
965 					NCE_REFRELE(nce);
966 				} else {
967 					NCE_REFHOLD_NOTR(nce);
968 					(*pfi)(nce, arg1);
969 					NCE_REFRELE_NOTR(nce);
970 				}
971 			}
972 		}
973 	}
974 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
975 		nce1 = nce->nce_next;
976 		if (ill == NULL || nce->nce_ill == ill) {
977 			if (trace) {
978 				NCE_REFHOLD(nce);
979 				(*pfi)(nce, arg1);
980 				NCE_REFRELE(nce);
981 			} else {
982 				NCE_REFHOLD_NOTR(nce);
983 				(*pfi)(nce, arg1);
984 				NCE_REFRELE_NOTR(nce);
985 			}
986 		}
987 	}
988 	mutex_enter(&ndp->ndp_g_lock);
989 	ndp->ndp_g_walker--;
990 	/*
991 	 * While NCE's are removed from global list they are placed
992 	 * in a private list, to be passed to nce_ire_delete_list().
993 	 * The reason is, there may be ires pointing to this nce
994 	 * which needs to cleaned up.
995 	 */
996 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
997 		/* Time to delete condemned entries */
998 		for (ncep = ndp->nce_hash_tbl;
999 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1000 			nce = *ncep;
1001 			if (nce != NULL) {
1002 				nce_remove(ndp, nce, &free_nce_list);
1003 			}
1004 		}
1005 		nce = ndp->nce_mask_entries;
1006 		if (nce != NULL) {
1007 			nce_remove(ndp, nce, &free_nce_list);
1008 		}
1009 		ndp->ndp_g_walker_cleanup = B_FALSE;
1010 	}
1011 	mutex_exit(&ndp->ndp_g_lock);
1012 
1013 	if (free_nce_list != NULL) {
1014 		nce_ire_delete_list(free_nce_list);
1015 	}
1016 }
1017 
1018 void
1019 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1)
1020 {
1021 	ndp_walk_common(&ndp4, ill, pfi, arg1, B_TRUE);
1022 	ndp_walk_common(&ndp6, ill, pfi, arg1, B_TRUE);
1023 }
1024 
1025 /*
1026  * Prepend the zoneid using an ipsec_out_t for later use by functions like
1027  * ip_rput_v6() after neighbor discovery has taken place.  If the message
1028  * block already has a M_CTL at the front of it, then simply set the zoneid
1029  * appropriately.
1030  */
1031 static mblk_t *
1032 ndp_prepend_zone(mblk_t *mp, zoneid_t zoneid)
1033 {
1034 	mblk_t		*first_mp;
1035 	ipsec_out_t	*io;
1036 
1037 	ASSERT(zoneid != ALL_ZONES);
1038 	if (mp->b_datap->db_type == M_CTL) {
1039 		io = (ipsec_out_t *)mp->b_rptr;
1040 		ASSERT(io->ipsec_out_type == IPSEC_OUT);
1041 		io->ipsec_out_zoneid = zoneid;
1042 		return (mp);
1043 	}
1044 
1045 	first_mp = ipsec_alloc_ipsec_out();
1046 	if (first_mp == NULL)
1047 		return (NULL);
1048 	io = (ipsec_out_t *)first_mp->b_rptr;
1049 	/* This is not a secure packet */
1050 	io->ipsec_out_secure = B_FALSE;
1051 	io->ipsec_out_zoneid = zoneid;
1052 	first_mp->b_cont = mp;
1053 	return (first_mp);
1054 }
1055 
1056 /*
1057  * Process resolve requests.  Handles both mapped entries
1058  * as well as cases that needs to be send out on the wire.
1059  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1060  * or one is created, we defer making ire point to nce until the
1061  * ire is actually added at which point the nce_refcnt on the nce is
1062  * incremented.  This is done primarily to have symmetry between ire_add()
1063  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1064  */
1065 int
1066 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1067 {
1068 	nce_t		*nce;
1069 	int		err = 0;
1070 	uint32_t	ms;
1071 	mblk_t		*mp_nce = NULL;
1072 
1073 	ASSERT(ill != NULL);
1074 	ASSERT(ill->ill_isv6);
1075 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1076 		err = nce_set_multicast(ill, dst);
1077 		return (err);
1078 	}
1079 	err = ndp_lookup_then_add(ill,
1080 	    NULL,	/* No hardware address */
1081 	    dst,
1082 	    &ipv6_all_ones,
1083 	    &ipv6_all_zeros,
1084 	    0,
1085 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1086 	    ND_INCOMPLETE,
1087 	    &nce,
1088 	    NULL, /* let ndp_add figure out fastpath mp and dlureq_mp for v6 */
1089 	    NULL);
1090 
1091 	switch (err) {
1092 	case 0:
1093 		/*
1094 		 * New cache entry was created. Make sure that the state
1095 		 * is not ND_INCOMPLETE. It can be in some other state
1096 		 * even before we send out the solicitation as we could
1097 		 * get un-solicited advertisements.
1098 		 *
1099 		 * If this is an XRESOLV interface, simply return 0,
1100 		 * since we don't want to solicit just yet.
1101 		 */
1102 		if (ill->ill_flags & ILLF_XRESOLV) {
1103 			NCE_REFRELE(nce);
1104 			return (0);
1105 		}
1106 		rw_enter(&ill_g_lock, RW_READER);
1107 		mutex_enter(&nce->nce_lock);
1108 		if (nce->nce_state != ND_INCOMPLETE) {
1109 			mutex_exit(&nce->nce_lock);
1110 			rw_exit(&ill_g_lock);
1111 			NCE_REFRELE(nce);
1112 			return (0);
1113 		}
1114 		mp_nce = ndp_prepend_zone(mp, zoneid);
1115 		if (mp_nce == NULL) {
1116 			/* The caller will free mp */
1117 			mutex_exit(&nce->nce_lock);
1118 			rw_exit(&ill_g_lock);
1119 			ndp_delete(nce);
1120 			NCE_REFRELE(nce);
1121 			return (ENOMEM);
1122 		}
1123 		ms = nce_solicit(nce, mp_nce);
1124 		rw_exit(&ill_g_lock);
1125 		if (ms == 0) {
1126 			/* The caller will free mp */
1127 			if (mp_nce != mp)
1128 				freeb(mp_nce);
1129 			mutex_exit(&nce->nce_lock);
1130 			ndp_delete(nce);
1131 			NCE_REFRELE(nce);
1132 			return (EBUSY);
1133 		}
1134 		mutex_exit(&nce->nce_lock);
1135 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1136 		NCE_REFRELE(nce);
1137 		return (EINPROGRESS);
1138 	case EEXIST:
1139 		/* Resolution in progress just queue the packet */
1140 		mutex_enter(&nce->nce_lock);
1141 		if (nce->nce_state == ND_INCOMPLETE) {
1142 			mp_nce = ndp_prepend_zone(mp, zoneid);
1143 			if (mp_nce == NULL) {
1144 				err = ENOMEM;
1145 			} else {
1146 				nce_queue_mp(nce, mp_nce);
1147 				err = EINPROGRESS;
1148 			}
1149 		} else {
1150 			/*
1151 			 * Any other state implies we have
1152 			 * a nce but IRE needs to be added ...
1153 			 * ire_add_v6() will take care of the
1154 			 * the case when the nce becomes CONDEMNED
1155 			 * before the ire is added to the table.
1156 			 */
1157 			err = 0;
1158 		}
1159 		mutex_exit(&nce->nce_lock);
1160 		NCE_REFRELE(nce);
1161 		break;
1162 	default:
1163 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1164 		break;
1165 	}
1166 	return (err);
1167 }
1168 
1169 /*
1170  * When there is no resolver, the link layer template is passed in
1171  * the IRE.
1172  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1173  * or one is created, we defer making ire point to nce until the
1174  * ire is actually added at which point the nce_refcnt on the nce is
1175  * incremented.  This is done primarily to have symmetry between ire_add()
1176  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1177  */
1178 int
1179 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1180 {
1181 	nce_t		*nce;
1182 	int		err = 0;
1183 
1184 	ASSERT(ill != NULL);
1185 	ASSERT(ill->ill_isv6);
1186 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1187 		err = nce_set_multicast(ill, dst);
1188 		return (err);
1189 	}
1190 
1191 	err = ndp_lookup_then_add(ill,
1192 	    NULL,	/* hardware address */
1193 	    dst,
1194 	    &ipv6_all_ones,
1195 	    &ipv6_all_zeros,
1196 	    0,
1197 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1198 	    ND_REACHABLE,
1199 	    &nce,
1200 	    NULL, /* let ndp_add figure out fp_mp/dlureq_mp for v6 */
1201 	    NULL);
1202 
1203 	switch (err) {
1204 	case 0:
1205 		/*
1206 		 * Cache entry with a proper resolver cookie was
1207 		 * created.
1208 		 */
1209 		NCE_REFRELE(nce);
1210 		break;
1211 	case EEXIST:
1212 		err = 0;
1213 		NCE_REFRELE(nce);
1214 		break;
1215 	default:
1216 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1217 		break;
1218 	}
1219 	return (err);
1220 }
1221 
1222 /*
1223  * For each interface an entry is added for the unspecified multicast group.
1224  * Here that mapping is used to form the multicast cache entry for a particular
1225  * multicast destination.
1226  */
1227 static int
1228 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1229 {
1230 	nce_t		*mnce;	/* Multicast mapping entry */
1231 	nce_t		*nce;
1232 	uchar_t		*hw_addr = NULL;
1233 	int		err = 0;
1234 
1235 	ASSERT(ill != NULL);
1236 	ASSERT(ill->ill_isv6);
1237 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1238 
1239 	mutex_enter(&ndp6.ndp_g_lock);
1240 	nce = *((nce_t **)NCE_HASH_PTR_V6(*dst));
1241 	nce = nce_lookup_addr(ill, dst, nce);
1242 	if (nce != NULL) {
1243 		mutex_exit(&ndp6.ndp_g_lock);
1244 		NCE_REFRELE(nce);
1245 		return (0);
1246 	}
1247 	/* No entry, now lookup for a mapping this should never fail */
1248 	mnce = nce_lookup_mapping(ill, dst);
1249 	if (mnce == NULL) {
1250 		/* Something broken for the interface. */
1251 		mutex_exit(&ndp6.ndp_g_lock);
1252 		return (ESRCH);
1253 	}
1254 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1255 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1256 		/*
1257 		 * For IRE_IF_RESOLVER a hardware mapping can be
1258 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1259 		 * in the ill is copied in ndp_add().
1260 		 */
1261 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1262 		if (hw_addr == NULL) {
1263 			mutex_exit(&ndp6.ndp_g_lock);
1264 			NCE_REFRELE(mnce);
1265 			return (ENOMEM);
1266 		}
1267 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1268 	}
1269 	NCE_REFRELE(mnce);
1270 	/*
1271 	 * IRE_IF_NORESOLVER type simply copies the resolution
1272 	 * cookie passed in.  So no hw_addr is needed.
1273 	 */
1274 	err = ndp_add(ill,
1275 	    hw_addr,
1276 	    dst,
1277 	    &ipv6_all_ones,
1278 	    &ipv6_all_zeros,
1279 	    0,
1280 	    NCE_F_NONUD,
1281 	    ND_REACHABLE,
1282 	    &nce,
1283 	    NULL,
1284 	    NULL);
1285 	mutex_exit(&ndp6.ndp_g_lock);
1286 	if (hw_addr != NULL)
1287 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1288 	if (err != 0) {
1289 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1290 		return (err);
1291 	}
1292 	NCE_REFRELE(nce);
1293 	return (0);
1294 }
1295 
1296 /*
1297  * Return the link layer address, and any flags of a nce.
1298  */
1299 int
1300 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1301 {
1302 	nce_t		*nce;
1303 	in6_addr_t	*addr;
1304 	sin6_t		*sin6;
1305 	dl_unitdata_req_t	*dl;
1306 
1307 	ASSERT(ill != NULL && ill->ill_isv6);
1308 	sin6 = (sin6_t *)&lnr->lnr_addr;
1309 	addr =  &sin6->sin6_addr;
1310 
1311 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1312 	if (nce == NULL)
1313 		return (ESRCH);
1314 	/* If in INCOMPLETE state, no link layer address is available yet */
1315 	if (nce->nce_state == ND_INCOMPLETE)
1316 		goto done;
1317 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1318 	if (ill->ill_flags & ILLF_XRESOLV)
1319 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1320 	else
1321 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1322 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1323 	    sizeof (lnr->lnr_hdw_addr));
1324 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1325 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1326 	if (nce->nce_flags & NCE_F_ISROUTER)
1327 		lnr->lnr_flags = NDF_ISROUTER_ON;
1328 	if (nce->nce_flags & NCE_F_PROXY)
1329 		lnr->lnr_flags |= NDF_PROXY_ON;
1330 	if (nce->nce_flags & NCE_F_ANYCAST)
1331 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1332 done:
1333 	NCE_REFRELE(nce);
1334 	return (0);
1335 }
1336 
1337 /*
1338  * Send Enable/Disable multicast reqs to driver.
1339  */
1340 int
1341 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1342     uint32_t hw_addr_offset, mblk_t *mp)
1343 {
1344 	nce_t		*nce;
1345 	uchar_t		*hw_addr;
1346 
1347 	ASSERT(ill != NULL && ill->ill_isv6);
1348 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1349 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1350 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1351 		freemsg(mp);
1352 		return (EINVAL);
1353 	}
1354 	mutex_enter(&ndp6.ndp_g_lock);
1355 	nce = nce_lookup_mapping(ill, addr);
1356 	if (nce == NULL) {
1357 		mutex_exit(&ndp6.ndp_g_lock);
1358 		freemsg(mp);
1359 		return (ESRCH);
1360 	}
1361 	mutex_exit(&ndp6.ndp_g_lock);
1362 	/*
1363 	 * Update dl_addr_length and dl_addr_offset for primitives that
1364 	 * have physical addresses as opposed to full saps
1365 	 */
1366 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1367 	case DL_ENABMULTI_REQ:
1368 		/* Track the state if this is the first enabmulti */
1369 		if (ill->ill_dlpi_multicast_state == IDMS_UNKNOWN)
1370 			ill->ill_dlpi_multicast_state = IDMS_INPROGRESS;
1371 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1372 		break;
1373 	case DL_DISABMULTI_REQ:
1374 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1375 		break;
1376 	default:
1377 		NCE_REFRELE(nce);
1378 		ip1dbg(("ndp_mcastreq: default\n"));
1379 		return (EINVAL);
1380 	}
1381 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1382 	NCE_REFRELE(nce);
1383 	putnext(ill->ill_wq, mp);
1384 	return (0);
1385 }
1386 
1387 /*
1388  * Send a neighbor solicitation.
1389  * Returns number of milliseconds after which we should either rexmit or abort.
1390  * Return of zero means we should abort.
1391  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1392  *
1393  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1394  * the packet.
1395  * NOTE: This routine does not consume mp.
1396  */
1397 uint32_t
1398 nce_solicit(nce_t *nce, mblk_t *mp)
1399 {
1400 	ill_t		*ill;
1401 	ill_t		*src_ill;
1402 	ip6_t		*ip6h;
1403 	in6_addr_t	src;
1404 	in6_addr_t	dst;
1405 	ipif_t		*ipif;
1406 	ip6i_t		*ip6i;
1407 	boolean_t	dropped = B_FALSE;
1408 
1409 	ASSERT(RW_READ_HELD(&ill_g_lock));
1410 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1411 	ill = nce->nce_ill;
1412 	ASSERT(ill != NULL);
1413 
1414 	if (nce->nce_rcnt == 0) {
1415 		return (0);
1416 	}
1417 
1418 	if (mp == NULL) {
1419 		ASSERT(nce->nce_qd_mp != NULL);
1420 		mp = nce->nce_qd_mp;
1421 	} else {
1422 		nce_queue_mp(nce, mp);
1423 	}
1424 
1425 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1426 	if (mp->b_datap->db_type == M_CTL)
1427 		mp = mp->b_cont;
1428 
1429 	ip6h = (ip6_t *)mp->b_rptr;
1430 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1431 		/*
1432 		 * This message should have been pulled up already in
1433 		 * ip_wput_v6. We can't do pullups here because the message
1434 		 * could be from the nce_qd_mp which could have b_next/b_prev
1435 		 * non-NULL.
1436 		 */
1437 		ip6i = (ip6i_t *)ip6h;
1438 		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
1439 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
1440 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1441 	}
1442 	src = ip6h->ip6_src;
1443 	/*
1444 	 * If the src of outgoing packet is one of the assigned interface
1445 	 * addresses use it, otherwise we will pick the source address below.
1446 	 */
1447 	src_ill = ill;
1448 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1449 		if (ill->ill_group != NULL)
1450 			src_ill = ill->ill_group->illgrp_ill;
1451 		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
1452 			for (ipif = src_ill->ill_ipif; ipif != NULL;
1453 			    ipif = ipif->ipif_next) {
1454 				if (IN6_ARE_ADDR_EQUAL(&src,
1455 				    &ipif->ipif_v6lcl_addr)) {
1456 					break;
1457 				}
1458 			}
1459 			if (ipif != NULL)
1460 				break;
1461 		}
1462 		/*
1463 		 * If no relevant ipif can be found, then it's not one of our
1464 		 * addresses.  Reset to :: and let nce_xmit.  If an ipif can be
1465 		 * found, but it's not yet done with DAD verification, then
1466 		 * just postpone this transmission until later.
1467 		 */
1468 		if (src_ill == NULL)
1469 			src = ipv6_all_zeros;
1470 		else if (!ipif->ipif_addr_ready)
1471 			return (ill->ill_reachable_retrans_time);
1472 	}
1473 	dst = nce->nce_addr;
1474 	/*
1475 	 * If source address is unspecified, nce_xmit will choose
1476 	 * one for us and initialize the hardware address also
1477 	 * appropriately.
1478 	 */
1479 	if (IN6_IS_ADDR_UNSPECIFIED(&src))
1480 		src_ill = NULL;
1481 	nce->nce_rcnt--;
1482 	mutex_exit(&nce->nce_lock);
1483 	rw_exit(&ill_g_lock);
1484 	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
1485 	    &dst, 0);
1486 	rw_enter(&ill_g_lock, RW_READER);
1487 	mutex_enter(&nce->nce_lock);
1488 	if (dropped)
1489 		nce->nce_rcnt++;
1490 	return (ill->ill_reachable_retrans_time);
1491 }
1492 
1493 /*
1494  * Attempt to recover an address on an interface that's been marked as a
1495  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1496  * no easy way to just probe the address and have the right thing happen if
1497  * it's no longer in use.  Instead, we just bring it up normally and allow the
1498  * regular interface start-up logic to probe for a remaining duplicate and take
1499  * us back down if necessary.
1500  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1501  * ip_ndp_excl.
1502  */
1503 /* ARGSUSED */
1504 static void
1505 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1506 {
1507 	ill_t	*ill = rq->q_ptr;
1508 	ipif_t	*ipif;
1509 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1510 
1511 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1512 		/*
1513 		 * We do not support recovery of proxy ARP'd interfaces,
1514 		 * because the system lacks a complete proxy ARP mechanism.
1515 		 */
1516 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1517 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1518 			continue;
1519 		}
1520 
1521 		/*
1522 		 * If we have already recovered, then ignore.
1523 		 */
1524 		mutex_enter(&ill->ill_lock);
1525 		if (!(ipif->ipif_flags & IPIF_DUPLICATE)) {
1526 			mutex_exit(&ill->ill_lock);
1527 			continue;
1528 		}
1529 
1530 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1531 		ill->ill_ipif_dup_count--;
1532 		mutex_exit(&ill->ill_lock);
1533 		ipif->ipif_was_dup = B_TRUE;
1534 
1535 		if (ipif_ndp_up(ipif, addr, B_FALSE) != EINPROGRESS)
1536 			(void) ipif_up_done_v6(ipif);
1537 	}
1538 	freeb(mp);
1539 }
1540 
1541 /*
1542  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1543  * As long as someone else holds the address, the interface will stay down.
1544  * When that conflict goes away, the interface is brought back up.  This is
1545  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1546  * server will recover from a failure.
1547  *
1548  * For DHCP and temporary addresses, recovery is not done in the kernel.
1549  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1550  *
1551  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1552  */
1553 static void
1554 ipif6_dup_recovery(void *arg)
1555 {
1556 	ipif_t *ipif = arg;
1557 
1558 	ipif->ipif_recovery_id = 0;
1559 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1560 		return;
1561 
1562 	/* If the link is down, we'll retry this later */
1563 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1564 		return;
1565 
1566 	ndp_do_recovery(ipif);
1567 }
1568 
1569 /*
1570  * Perform interface recovery by forcing the duplicate interfaces up and
1571  * allowing the system to determine which ones should stay up.
1572  *
1573  * Called both by recovery timer expiry and link-up notification.
1574  */
1575 void
1576 ndp_do_recovery(ipif_t *ipif)
1577 {
1578 	ill_t *ill = ipif->ipif_ill;
1579 	mblk_t *mp;
1580 
1581 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1582 	if (mp == NULL) {
1583 		ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1584 		    ipif, MSEC_TO_TICK(ip_dup_recovery));
1585 	} else {
1586 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1587 		    sizeof (ipif->ipif_v6lcl_addr));
1588 		ill_refhold(ill);
1589 		(void) qwriter_ip(NULL, ill, ill->ill_rq, mp, ip_ndp_recover,
1590 		    CUR_OP, B_FALSE);
1591 	}
1592 }
1593 
1594 /*
1595  * Find the solicitation in the given message, and extract printable details
1596  * (MAC and IP addresses) from it.
1597  */
1598 static nd_neighbor_solicit_t *
1599 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1600     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1601 {
1602 	nd_neighbor_solicit_t *ns;
1603 	ip6_t *ip6h;
1604 	uchar_t *addr;
1605 	int alen;
1606 
1607 	alen = 0;
1608 	ip6h = (ip6_t *)mp->b_rptr;
1609 	if (dl_mp == NULL) {
1610 		nd_opt_hdr_t *opt;
1611 		int nslen;
1612 
1613 		/*
1614 		 * If it's from the fast-path, then it can't be a probe
1615 		 * message, and thus must include the source linkaddr option.
1616 		 * Extract that here.
1617 		 */
1618 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1619 		nslen = mp->b_wptr - (uchar_t *)ns;
1620 		if ((nslen -= sizeof (*ns)) > 0) {
1621 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1622 			    ND_OPT_SOURCE_LINKADDR);
1623 			if (opt != NULL &&
1624 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1625 			    ill->ill_nd_lla_len) {
1626 				addr = (uchar_t *)(opt + 1);
1627 				alen = ill->ill_nd_lla_len;
1628 			}
1629 		}
1630 		/*
1631 		 * We cheat a bit here for the sake of printing usable log
1632 		 * messages in the rare case where the reply we got was unicast
1633 		 * without a source linkaddr option, and the interface is in
1634 		 * fastpath mode.  (Sigh.)
1635 		 */
1636 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1637 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1638 			struct ether_header *pether;
1639 
1640 			pether = (struct ether_header *)((char *)ip6h -
1641 			    sizeof (*pether));
1642 			addr = pether->ether_shost.ether_addr_octet;
1643 			alen = ETHERADDRL;
1644 		}
1645 	} else {
1646 		dl_unitdata_ind_t *dlu;
1647 
1648 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1649 		alen = dlu->dl_src_addr_length;
1650 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1651 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1652 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1653 			if (ill->ill_sap_length < 0) {
1654 				alen += ill->ill_sap_length;
1655 			} else {
1656 				addr += ill->ill_sap_length;
1657 				alen -= ill->ill_sap_length;
1658 			}
1659 		}
1660 	}
1661 	if (alen > 0) {
1662 		*haddr = addr;
1663 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1664 	} else {
1665 		*haddr = NULL;
1666 		(void) strcpy(hbuf, "?");
1667 	}
1668 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1669 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1670 	return (ns);
1671 }
1672 
1673 /*
1674  * This is for exclusive changes due to NDP duplicate address detection
1675  * failure.
1676  */
1677 /* ARGSUSED */
1678 static void
1679 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1680 {
1681 	ill_t	*ill = rq->q_ptr;
1682 	ipif_t	*ipif;
1683 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1684 	char hbuf[MAC_STR_LEN];
1685 	char sbuf[INET6_ADDRSTRLEN];
1686 	nd_neighbor_solicit_t *ns;
1687 	mblk_t *dl_mp = NULL;
1688 	uchar_t *haddr;
1689 
1690 	if (DB_TYPE(mp) != M_DATA) {
1691 		dl_mp = mp;
1692 		mp = mp->b_cont;
1693 	}
1694 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1695 	    sizeof (sbuf), &haddr);
1696 	if (haddr != NULL &&
1697 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1698 		/*
1699 		 * Ignore conflicts generated by misbehaving switches that just
1700 		 * reflect our own messages back to us.
1701 		 */
1702 		goto ignore_conflict;
1703 	}
1704 	(void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf));
1705 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1706 
1707 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1708 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1709 		    &ns->nd_ns_target)) {
1710 			continue;
1711 		}
1712 
1713 		/* If it's already marked, then don't do anything. */
1714 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1715 			continue;
1716 
1717 		/*
1718 		 * If this is a failure during duplicate recovery, then don't
1719 		 * complain.  It may take a long time to recover.
1720 		 */
1721 		if (!ipif->ipif_was_dup) {
1722 			if (ipif->ipif_id != 0) {
1723 				(void) snprintf(ibuf + ill->ill_name_length - 1,
1724 				    sizeof (ibuf) - ill->ill_name_length + 1,
1725 				    ":%d", ipif->ipif_id);
1726 			}
1727 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1728 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1729 		}
1730 		mutex_enter(&ill->ill_lock);
1731 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1732 		ipif->ipif_flags |= IPIF_DUPLICATE;
1733 		ill->ill_ipif_dup_count++;
1734 		mutex_exit(&ill->ill_lock);
1735 		(void) ipif_down(ipif, NULL, NULL);
1736 		ipif_down_tail(ipif);
1737 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1738 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1739 		    ip_dup_recovery > 0)
1740 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1741 			    ipif, MSEC_TO_TICK(ip_dup_recovery));
1742 	}
1743 ignore_conflict:
1744 	if (dl_mp != NULL)
1745 		freeb(dl_mp);
1746 	freemsg(mp);
1747 }
1748 
1749 /*
1750  * Handle failure by tearing down the ipifs with the specified address.  Note
1751  * that tearing down the ipif also means deleting the nce through ipif_down, so
1752  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1753  * we start a timer on the ipif.
1754  */
1755 static void
1756 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1757 {
1758 	if ((mp = copymsg(mp)) != NULL) {
1759 		if (dl_mp == NULL)
1760 			dl_mp = mp;
1761 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1762 			dl_mp->b_cont = mp;
1763 		if (dl_mp == NULL) {
1764 			freemsg(mp);
1765 		} else {
1766 			ill_refhold(ill);
1767 			(void) qwriter_ip(NULL, ill, ill->ill_rq, dl_mp,
1768 			    ip_ndp_excl, CUR_OP, B_FALSE);
1769 		}
1770 	}
1771 	ndp_delete(nce);
1772 }
1773 
1774 /*
1775  * Handle a discovered conflict: some other system is advertising that it owns
1776  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1777  * interface.
1778  */
1779 static void
1780 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1781 {
1782 	ipif_t *ipif;
1783 	uint32_t now;
1784 	uint_t maxdefense;
1785 	uint_t defs;
1786 
1787 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1788 	    NULL, NULL);
1789 	if (ipif == NULL)
1790 		return;
1791 	/*
1792 	 * First, figure out if this address is disposable.
1793 	 */
1794 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1795 		maxdefense = ip_max_temp_defend;
1796 	else
1797 		maxdefense = ip_max_defend;
1798 
1799 	/*
1800 	 * Now figure out how many times we've defended ourselves.  Ignore
1801 	 * defenses that happened long in the past.
1802 	 */
1803 	now = gethrestime_sec();
1804 	mutex_enter(&nce->nce_lock);
1805 	if ((defs = nce->nce_defense_count) > 0 &&
1806 	    now - nce->nce_defense_time > ip_defend_interval) {
1807 		nce->nce_defense_count = defs = 0;
1808 	}
1809 	nce->nce_defense_count++;
1810 	nce->nce_defense_time = now;
1811 	mutex_exit(&nce->nce_lock);
1812 	ipif_refrele(ipif);
1813 
1814 	/*
1815 	 * If we've defended ourselves too many times already, then give up and
1816 	 * tear down the interface(s) using this address.  Otherwise, defend by
1817 	 * sending out an unsolicited Neighbor Advertisement.
1818 	 */
1819 	if (defs >= maxdefense) {
1820 		ip_ndp_failure(ill, mp, dl_mp, nce);
1821 	} else {
1822 		char hbuf[MAC_STR_LEN];
1823 		char sbuf[INET6_ADDRSTRLEN];
1824 		uchar_t *haddr;
1825 
1826 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1827 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1828 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1829 		    hbuf, sbuf, ill->ill_name);
1830 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1831 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1832 		    nce_advert_flags(nce));
1833 	}
1834 }
1835 
1836 static void
1837 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1838 {
1839 	nd_neighbor_solicit_t *ns;
1840 	uint32_t	hlen = ill->ill_nd_lla_len;
1841 	uchar_t		*haddr = NULL;
1842 	icmp6_t		*icmp_nd;
1843 	ip6_t		*ip6h;
1844 	nce_t		*our_nce = NULL;
1845 	in6_addr_t	target;
1846 	in6_addr_t	src;
1847 	int		len;
1848 	int		flag = 0;
1849 	nd_opt_hdr_t	*opt = NULL;
1850 	boolean_t	bad_solicit = B_FALSE;
1851 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1852 
1853 	ip6h = (ip6_t *)mp->b_rptr;
1854 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1855 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1856 	src = ip6h->ip6_src;
1857 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1858 	target = ns->nd_ns_target;
1859 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1860 		if (ip_debug > 2) {
1861 			/* ip1dbg */
1862 			pr_addr_dbg("ndp_input_solicit: Target is"
1863 			    " multicast! %s\n", AF_INET6, &target);
1864 		}
1865 		bad_solicit = B_TRUE;
1866 		goto done;
1867 	}
1868 	if (len > sizeof (nd_neighbor_solicit_t)) {
1869 		/* Options present */
1870 		opt = (nd_opt_hdr_t *)&ns[1];
1871 		len -= sizeof (nd_neighbor_solicit_t);
1872 		if (!ndp_verify_optlen(opt, len)) {
1873 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1874 			bad_solicit = B_TRUE;
1875 			goto done;
1876 		}
1877 	}
1878 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1879 		/* Check to see if this is a valid DAD solicitation */
1880 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1881 			if (ip_debug > 2) {
1882 				/* ip1dbg */
1883 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1884 				    "Destination is not solicited node "
1885 				    "multicast %s\n", AF_INET6,
1886 				    &ip6h->ip6_dst);
1887 			}
1888 			bad_solicit = B_TRUE;
1889 			goto done;
1890 		}
1891 	}
1892 
1893 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1894 	/*
1895 	 * If this is a valid Solicitation, a permanent
1896 	 * entry should exist in the cache
1897 	 */
1898 	if (our_nce == NULL ||
1899 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1900 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1901 		    "ifname=%s ", ill->ill_name));
1902 		if (ip_debug > 2) {
1903 			/* ip1dbg */
1904 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1905 		}
1906 		bad_solicit = B_TRUE;
1907 		goto done;
1908 	}
1909 
1910 	/* At this point we should have a verified NS per spec */
1911 	if (opt != NULL) {
1912 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1913 		if (opt != NULL) {
1914 			haddr = (uchar_t *)&opt[1];
1915 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1916 			    hlen == 0) {
1917 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1918 				bad_solicit = B_TRUE;
1919 				goto done;
1920 			}
1921 		}
1922 	}
1923 
1924 	/* If sending directly to peer, set the unicast flag */
1925 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1926 		flag |= NDP_UNICAST;
1927 
1928 	/*
1929 	 * Create/update the entry for the soliciting node.
1930 	 * or respond to outstanding queries, don't if
1931 	 * the source is unspecified address.
1932 	 */
1933 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1934 		int	err;
1935 		nce_t	*nnce;
1936 
1937 		ASSERT(ill->ill_isv6);
1938 		/*
1939 		 * Regular solicitations *must* include the Source Link-Layer
1940 		 * Address option.  Ignore messages that do not.
1941 		 */
1942 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1943 			ip1dbg(("ndp_input_solicit: source link-layer address "
1944 			    "option missing with a specified source.\n"));
1945 			bad_solicit = B_TRUE;
1946 			goto done;
1947 		}
1948 
1949 		/*
1950 		 * This is a regular solicitation.  If we're still in the
1951 		 * process of verifying the address, then don't respond at all
1952 		 * and don't keep track of the sender.
1953 		 */
1954 		if (our_nce->nce_state == ND_PROBE)
1955 			goto done;
1956 
1957 		/*
1958 		 * If the solicitation doesn't have sender hardware address
1959 		 * (legal for unicast solicitation), then process without
1960 		 * installing the return NCE.  Either we already know it, or
1961 		 * we'll be forced to look it up when (and if) we reply to the
1962 		 * packet.
1963 		 */
1964 		if (haddr == NULL)
1965 			goto no_source;
1966 
1967 		err = ndp_lookup_then_add(ill,
1968 		    haddr,
1969 		    &src,	/* Soliciting nodes address */
1970 		    &ipv6_all_ones,
1971 		    &ipv6_all_zeros,
1972 		    0,
1973 		    0,
1974 		    ND_STALE,
1975 		    &nnce,
1976 		    NULL,
1977 		    NULL);
1978 		switch (err) {
1979 		case 0:
1980 			/* done with this entry */
1981 			NCE_REFRELE(nnce);
1982 			break;
1983 		case EEXIST:
1984 			/*
1985 			 * B_FALSE indicates this is not an
1986 			 * an advertisement.
1987 			 */
1988 			ndp_process(nnce, haddr, 0, B_FALSE);
1989 			NCE_REFRELE(nnce);
1990 			break;
1991 		default:
1992 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1993 			    err));
1994 			goto done;
1995 		}
1996 no_source:
1997 		flag |= NDP_SOLICITED;
1998 	} else {
1999 		/*
2000 		 * No source link layer address option should be present in a
2001 		 * valid DAD request.
2002 		 */
2003 		if (haddr != NULL) {
2004 			ip1dbg(("ndp_input_solicit: source link-layer address "
2005 			    "option present with an unspecified source.\n"));
2006 			bad_solicit = B_TRUE;
2007 			goto done;
2008 		}
2009 		if (our_nce->nce_state == ND_PROBE) {
2010 			/*
2011 			 * Internally looped-back probes won't have DLPI
2012 			 * attached to them.  External ones (which are sent by
2013 			 * multicast) always will.  Just ignore our own
2014 			 * transmissions.
2015 			 */
2016 			if (dl_mp != NULL) {
2017 				/*
2018 				 * If someone else is probing our address, then
2019 				 * we've crossed wires.  Declare failure.
2020 				 */
2021 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
2022 			}
2023 			goto done;
2024 		}
2025 		/*
2026 		 * This is a DAD probe.  Multicast the advertisement to the
2027 		 * all-nodes address.
2028 		 */
2029 		src = ipv6_all_hosts_mcast;
2030 	}
2031 	flag |= nce_advert_flags(our_nce);
2032 	/* Response to a solicitation */
2033 	(void) nce_xmit(ill,
2034 	    ND_NEIGHBOR_ADVERT,
2035 	    ill,	/* ill to be used for extracting ill_nd_lla */
2036 	    B_TRUE,	/* use ill_nd_lla */
2037 	    &target,	/* Source and target of the advertisement pkt */
2038 	    &src,	/* IP Destination (source of original pkt) */
2039 	    flag);
2040 done:
2041 	if (bad_solicit)
2042 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2043 	if (our_nce != NULL)
2044 		NCE_REFRELE(our_nce);
2045 }
2046 
2047 void
2048 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2049 {
2050 	nd_neighbor_advert_t *na;
2051 	uint32_t	hlen = ill->ill_nd_lla_len;
2052 	uchar_t		*haddr = NULL;
2053 	icmp6_t		*icmp_nd;
2054 	ip6_t		*ip6h;
2055 	nce_t		*dst_nce = NULL;
2056 	in6_addr_t	target;
2057 	nd_opt_hdr_t	*opt = NULL;
2058 	int		len;
2059 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2060 
2061 	ip6h = (ip6_t *)mp->b_rptr;
2062 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2063 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2064 	na = (nd_neighbor_advert_t *)icmp_nd;
2065 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2066 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2067 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2068 		    "solicited flag is not zero\n"));
2069 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2070 		return;
2071 	}
2072 	target = na->nd_na_target;
2073 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2074 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2075 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2076 		return;
2077 	}
2078 	if (len > sizeof (nd_neighbor_advert_t)) {
2079 		opt = (nd_opt_hdr_t *)&na[1];
2080 		if (!ndp_verify_optlen(opt,
2081 		    len - sizeof (nd_neighbor_advert_t))) {
2082 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2083 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2084 			return;
2085 		}
2086 		/* At this point we have a verified NA per spec */
2087 		len -= sizeof (nd_neighbor_advert_t);
2088 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2089 		if (opt != NULL) {
2090 			haddr = (uchar_t *)&opt[1];
2091 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2092 			    hlen == 0) {
2093 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2094 				BUMP_MIB(mib,
2095 				    ipv6IfIcmpInBadNeighborAdvertisements);
2096 				return;
2097 			}
2098 		}
2099 	}
2100 
2101 	/*
2102 	 * If this interface is part of the group look at all the
2103 	 * ills in the group.
2104 	 */
2105 	rw_enter(&ill_g_lock, RW_READER);
2106 	if (ill->ill_group != NULL)
2107 		ill = ill->ill_group->illgrp_ill;
2108 
2109 	for (; ill != NULL; ill = ill->ill_group_next) {
2110 		mutex_enter(&ill->ill_lock);
2111 		if (!ILL_CAN_LOOKUP(ill)) {
2112 			mutex_exit(&ill->ill_lock);
2113 			continue;
2114 		}
2115 		ill_refhold_locked(ill);
2116 		mutex_exit(&ill->ill_lock);
2117 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2118 		/* We have to drop the lock since ndp_process calls put* */
2119 		rw_exit(&ill_g_lock);
2120 		if (dst_nce != NULL) {
2121 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2122 			    dst_nce->nce_state == ND_PROBE) {
2123 				/*
2124 				 * Someone else sent an advertisement for an
2125 				 * address that we're trying to configure.
2126 				 * Tear it down.  Note that dl_mp might be NULL
2127 				 * if we're getting a unicast reply.  This
2128 				 * isn't typically done (multicast is the norm
2129 				 * in response to a probe), but ip_ndp_failure
2130 				 * will handle the dl_mp == NULL case as well.
2131 				 */
2132 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2133 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2134 				/*
2135 				 * Someone just announced one of our local
2136 				 * addresses.  If it wasn't us, then this is a
2137 				 * conflict.  Defend the address or shut it
2138 				 * down.
2139 				 */
2140 				if (dl_mp != NULL &&
2141 				    (haddr == NULL ||
2142 				    nce_cmp_ll_addr(dst_nce, haddr,
2143 				    ill->ill_nd_lla_len))) {
2144 					ip_ndp_conflict(ill, mp, dl_mp,
2145 					    dst_nce);
2146 				}
2147 			} else {
2148 				if (na->nd_na_flags_reserved &
2149 				    ND_NA_FLAG_ROUTER) {
2150 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2151 				}
2152 				/* B_TRUE indicates this an advertisement */
2153 				ndp_process(dst_nce, haddr,
2154 				    na->nd_na_flags_reserved, B_TRUE);
2155 			}
2156 			NCE_REFRELE(dst_nce);
2157 		}
2158 		rw_enter(&ill_g_lock, RW_READER);
2159 		ill_refrele(ill);
2160 	}
2161 	rw_exit(&ill_g_lock);
2162 }
2163 
2164 /*
2165  * Process NDP neighbor solicitation/advertisement messages.
2166  * The checksum has already checked o.k before reaching here.
2167  */
2168 void
2169 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2170 {
2171 	icmp6_t		*icmp_nd;
2172 	ip6_t		*ip6h;
2173 	int		len;
2174 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2175 
2176 
2177 	if (!pullupmsg(mp, -1)) {
2178 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2179 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
2180 		goto done;
2181 	}
2182 	ip6h = (ip6_t *)mp->b_rptr;
2183 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2184 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2185 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2186 		goto done;
2187 	}
2188 	/*
2189 	 * NDP does not accept any extension headers between the
2190 	 * IP header and the ICMP header since e.g. a routing
2191 	 * header could be dangerous.
2192 	 * This assumes that any AH or ESP headers are removed
2193 	 * by ip prior to passing the packet to ndp_input.
2194 	 */
2195 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2196 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2197 		    ip6h->ip6_nxt));
2198 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2199 		goto done;
2200 	}
2201 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2202 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2203 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2204 	if (icmp_nd->icmp6_code != 0) {
2205 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2206 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2207 		goto done;
2208 	}
2209 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2210 	/*
2211 	 * Make sure packet length is large enough for either
2212 	 * a NS or a NA icmp packet.
2213 	 */
2214 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2215 		ip1dbg(("ndp_input: packet too short\n"));
2216 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2217 		goto done;
2218 	}
2219 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2220 		ndp_input_solicit(ill, mp, dl_mp);
2221 	} else {
2222 		ndp_input_advert(ill, mp, dl_mp);
2223 	}
2224 done:
2225 	freemsg(mp);
2226 }
2227 
2228 /*
2229  * nce_xmit is called to form and transmit a ND solicitation or
2230  * advertisement ICMP packet.
2231  *
2232  * If the source address is unspecified and this isn't a probe (used for
2233  * duplicate address detection), an appropriate source address and link layer
2234  * address will be chosen here.  The link layer address option is included if
2235  * the source is specified (i.e., all non-probe packets), and omitted (per the
2236  * specification) otherwise.
2237  *
2238  * It returns B_FALSE only if it does a successful put() to the
2239  * corresponding ill's ill_wq otherwise returns B_TRUE.
2240  */
2241 static boolean_t
2242 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2243     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2244     int flag)
2245 {
2246 	uint32_t	len;
2247 	icmp6_t 	*icmp6;
2248 	mblk_t		*mp;
2249 	ip6_t		*ip6h;
2250 	nd_opt_hdr_t	*opt;
2251 	uint_t		plen;
2252 	ip6i_t		*ip6i;
2253 	ipif_t		*src_ipif = NULL;
2254 	uint8_t		*hw_addr;
2255 
2256 	/*
2257 	 * If we have a unspecified source(sender) address, select a
2258 	 * proper source address for the solicitation here itself so
2259 	 * that we can initialize the h/w address correctly. This is
2260 	 * needed for interface groups as source address can come from
2261 	 * the whole group and the h/w address initialized from ill will
2262 	 * be wrong if the source address comes from a different ill.
2263 	 *
2264 	 * Note that the NA never comes here with the unspecified source
2265 	 * address. The following asserts that whenever the source
2266 	 * address is specified, the haddr also should be specified.
2267 	 */
2268 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2269 
2270 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2271 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2272 		/*
2273 		 * Pick a source address for this solicitation, but
2274 		 * restrict the selection to addresses assigned to the
2275 		 * output interface (or interface group).  We do this
2276 		 * because the destination will create a neighbor cache
2277 		 * entry for the source address of this packet, so the
2278 		 * source address had better be a valid neighbor.
2279 		 */
2280 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2281 		    IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
2282 		if (src_ipif == NULL) {
2283 			char buf[INET6_ADDRSTRLEN];
2284 
2285 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2286 			    inet_ntop(AF_INET6, (char *)target, buf,
2287 			    sizeof (buf))));
2288 			return (B_TRUE);
2289 		}
2290 		sender = &src_ipif->ipif_v6src_addr;
2291 		hwaddr_ill = src_ipif->ipif_ill;
2292 	}
2293 
2294 	/*
2295 	 * Always make sure that the NS/NA packets don't get load
2296 	 * spread. This is needed so that the probe packets sent
2297 	 * by the in.mpathd daemon can really go out on the desired
2298 	 * interface. Probe packets are made to go out on a desired
2299 	 * interface by including a ip6i with ATTACH_IF flag. As these
2300 	 * packets indirectly end up sending/receiving NS/NA packets
2301 	 * (neighbor doing NUD), we have to make sure that NA
2302 	 * also go out on the same interface.
2303 	 */
2304 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2305 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2306 	    plen * 8;
2307 	mp = allocb(len,  BPRI_LO);
2308 	if (mp == NULL) {
2309 		if (src_ipif != NULL)
2310 			ipif_refrele(src_ipif);
2311 		return (B_TRUE);
2312 	}
2313 	bzero((char *)mp->b_rptr, len);
2314 	mp->b_wptr = mp->b_rptr + len;
2315 
2316 	ip6i = (ip6i_t *)mp->b_rptr;
2317 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2318 	ip6i->ip6i_nxt = IPPROTO_RAW;
2319 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2320 	if (flag & NDP_PROBE)
2321 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2322 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2323 
2324 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2325 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2326 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2327 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2328 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2329 	ip6h->ip6_dst = *target;
2330 	icmp6 = (icmp6_t *)&ip6h[1];
2331 
2332 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2333 	    sizeof (nd_neighbor_advert_t));
2334 
2335 	if (operation == ND_NEIGHBOR_SOLICIT) {
2336 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2337 
2338 		if (!(flag & NDP_PROBE))
2339 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2340 		ip6h->ip6_src = *sender;
2341 		ns->nd_ns_target = *target;
2342 		if (!(flag & NDP_UNICAST)) {
2343 			/* Form multicast address of the target */
2344 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2345 			ip6h->ip6_dst.s6_addr32[3] |=
2346 			    ns->nd_ns_target.s6_addr32[3];
2347 		}
2348 	} else {
2349 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2350 
2351 		ASSERT(!(flag & NDP_PROBE));
2352 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2353 		ip6h->ip6_src = *sender;
2354 		na->nd_na_target = *sender;
2355 		if (flag & NDP_ISROUTER)
2356 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2357 		if (flag & NDP_SOLICITED)
2358 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2359 		if (flag & NDP_ORIDE)
2360 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2361 	}
2362 
2363 	hw_addr = NULL;
2364 	if (!(flag & NDP_PROBE)) {
2365 		mutex_enter(&hwaddr_ill->ill_lock);
2366 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2367 		    hwaddr_ill->ill_phys_addr;
2368 		if (hw_addr != NULL) {
2369 			/* Fill in link layer address and option len */
2370 			opt->nd_opt_len = (uint8_t)plen;
2371 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2372 		}
2373 		mutex_exit(&hwaddr_ill->ill_lock);
2374 	}
2375 	if (hw_addr == NULL) {
2376 		/* If there's no link layer address option, then strip it. */
2377 		len -= plen * 8;
2378 		mp->b_wptr = mp->b_rptr + len;
2379 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2380 	}
2381 
2382 	icmp6->icmp6_type = (uint8_t)operation;
2383 	icmp6->icmp6_code = 0;
2384 	/*
2385 	 * Prepare for checksum by putting icmp length in the icmp
2386 	 * checksum field. The checksum is calculated in ip_wput_v6.
2387 	 */
2388 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2389 
2390 	if (src_ipif != NULL)
2391 		ipif_refrele(src_ipif);
2392 	if (canput(ill->ill_wq)) {
2393 		put(ill->ill_wq, mp);
2394 		return (B_FALSE);
2395 	}
2396 	freemsg(mp);
2397 	return (B_TRUE);
2398 }
2399 
2400 /*
2401  * Make a link layer address (does not include the SAP) from an nce.
2402  * To form the link layer address, use the last four bytes of ipv6
2403  * address passed in and the fixed offset stored in nce.
2404  */
2405 static void
2406 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2407 {
2408 	uchar_t *mask, *to;
2409 	ill_t	*ill = nce->nce_ill;
2410 	int 	len;
2411 
2412 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2413 		return;
2414 	ASSERT(nce->nce_res_mp != NULL);
2415 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2416 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2417 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2418 	ASSERT(addr != NULL);
2419 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2420 	    addrpos, ill->ill_nd_lla_len);
2421 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2422 	    IPV6_ADDR_LEN);
2423 	mask = (uchar_t *)&nce->nce_extract_mask;
2424 	mask += (IPV6_ADDR_LEN - len);
2425 	addr += (IPV6_ADDR_LEN - len);
2426 	to = addrpos + nce->nce_ll_extract_start;
2427 	while (len-- > 0)
2428 		*to++ |= *mask++ & *addr++;
2429 }
2430 
2431 /*
2432  * Pass a cache report back out via NDD.
2433  */
2434 /* ARGSUSED */
2435 int
2436 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
2437 {
2438 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
2439 			"     proto addr/mask");
2440 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp);
2441 	return (0);
2442 }
2443 
2444 /*
2445  * Add a single line to the NDP Cache Entry Report.
2446  */
2447 static void
2448 nce_report1(nce_t *nce, uchar_t *mp_arg)
2449 {
2450 	ill_t		*ill = nce->nce_ill;
2451 	char		local_buf[INET6_ADDRSTRLEN];
2452 	uchar_t		flags_buf[10];
2453 	uint32_t	flags = nce->nce_flags;
2454 	mblk_t		*mp = (mblk_t *)mp_arg;
2455 	uchar_t		*h;
2456 	uchar_t		*m = flags_buf;
2457 	in6_addr_t	v6addr;
2458 
2459 	/*
2460 	 * Lock the nce to protect nce_res_mp from being changed
2461 	 * if an external resolver address resolution completes
2462 	 * while nce_res_mp is being accessed here.
2463 	 *
2464 	 * Deal with all address formats, not just Ethernet-specific
2465 	 * In addition, make sure that the mblk has enough space
2466 	 * before writing to it. If is doesn't, allocate a new one.
2467 	 */
2468 	if (nce->nce_ipversion == IPV4_VERSION)
2469 		/* Don't include v4 nce_ts in NDP cache entry report */
2470 		return;
2471 
2472 	ASSERT(ill != NULL);
2473 	v6addr = nce->nce_mask;
2474 	if (flags & NCE_F_PERMANENT)
2475 		*m++ = 'P';
2476 	if (flags & NCE_F_ISROUTER)
2477 		*m++ = 'R';
2478 	if (flags & NCE_F_MAPPING)
2479 		*m++ = 'M';
2480 	*m = '\0';
2481 
2482 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
2483 		size_t		addrlen;
2484 		char		*addr_buf;
2485 		dl_unitdata_req_t	*dl;
2486 
2487 		mutex_enter(&nce->nce_lock);
2488 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2489 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
2490 		if (ill->ill_flags & ILLF_XRESOLV)
2491 			addrlen = (3 * (dl->dl_dest_addr_length));
2492 		else
2493 			addrlen = (3 * (ill->ill_nd_lla_len));
2494 		if (addrlen <= 0) {
2495 			mutex_exit(&nce->nce_lock);
2496 			(void) mi_mpprintf(mp,
2497 			    "%8s %9s %5s %s/%d",
2498 			    ill->ill_name,
2499 			    "None",
2500 			    (uchar_t *)&flags_buf,
2501 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2502 				(char *)local_buf, sizeof (local_buf)),
2503 				ip_mask_to_plen_v6(&v6addr));
2504 		} else {
2505 			/*
2506 			 * Convert the hardware/lla address to ascii
2507 			 */
2508 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
2509 			if (addr_buf == NULL) {
2510 				mutex_exit(&nce->nce_lock);
2511 				return;
2512 			}
2513 			(void) mac_colon_addr((uint8_t *)h,
2514 			    (ill->ill_flags & ILLF_XRESOLV) ?
2515 			    dl->dl_dest_addr_length : ill->ill_nd_lla_len,
2516 			    addr_buf, addrlen);
2517 			mutex_exit(&nce->nce_lock);
2518 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
2519 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
2520 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2521 				(char *)local_buf, sizeof (local_buf)),
2522 				ip_mask_to_plen_v6(&v6addr));
2523 			kmem_free(addr_buf, addrlen);
2524 		}
2525 	} else {
2526 		(void) mi_mpprintf(mp,
2527 		    "%8s %9s %5s %s/%d",
2528 		    ill->ill_name,
2529 		    "None",
2530 		    (uchar_t *)&flags_buf,
2531 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2532 			(char *)local_buf, sizeof (local_buf)),
2533 			ip_mask_to_plen_v6(&v6addr));
2534 	}
2535 }
2536 
2537 mblk_t *
2538 nce_udreq_alloc(ill_t *ill)
2539 {
2540 	mblk_t	*template_mp = NULL;
2541 	dl_unitdata_req_t *dlur;
2542 	int	sap_length;
2543 
2544 	ASSERT(ill->ill_isv6);
2545 
2546 	sap_length = ill->ill_sap_length;
2547 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2548 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2549 	if (template_mp == NULL)
2550 		return (NULL);
2551 
2552 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2553 	dlur->dl_priority.dl_min = 0;
2554 	dlur->dl_priority.dl_max = 0;
2555 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2556 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2557 
2558 	/* Copy in the SAP value. */
2559 	NCE_LL_SAP_COPY(ill, template_mp);
2560 
2561 	return (template_mp);
2562 }
2563 
2564 /*
2565  * NDP retransmit timer.
2566  * This timer goes off when:
2567  * a. It is time to retransmit NS for resolver.
2568  * b. It is time to send reachability probes.
2569  */
2570 void
2571 ndp_timer(void *arg)
2572 {
2573 	nce_t		*nce = arg;
2574 	ill_t		*ill = nce->nce_ill;
2575 	uint32_t	ms;
2576 	char		addrbuf[INET6_ADDRSTRLEN];
2577 	mblk_t		*mp;
2578 	boolean_t	dropped = B_FALSE;
2579 
2580 	/*
2581 	 * The timer has to be cancelled by ndp_delete before doing the final
2582 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2583 	 * until it clears the timeout_id. Before clearing the timeout_id
2584 	 * bump up the refcnt so that we can continue to use the nce
2585 	 */
2586 	ASSERT(nce != NULL);
2587 
2588 	/*
2589 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2590 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2591 	 */
2592 	rw_enter(&ill_g_lock, RW_READER);
2593 	mutex_enter(&nce->nce_lock);
2594 	NCE_REFHOLD_LOCKED(nce);
2595 	nce->nce_timeout_id = 0;
2596 
2597 	/*
2598 	 * Check the reachability state first.
2599 	 */
2600 	switch (nce->nce_state) {
2601 	case ND_DELAY:
2602 		rw_exit(&ill_g_lock);
2603 		nce->nce_state = ND_PROBE;
2604 		mutex_exit(&nce->nce_lock);
2605 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2606 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2607 		if (ip_debug > 3) {
2608 			/* ip2dbg */
2609 			pr_addr_dbg("ndp_timer: state for %s changed "
2610 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2611 		}
2612 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2613 		NCE_REFRELE(nce);
2614 		return;
2615 	case ND_PROBE:
2616 		/* must be retransmit timer */
2617 		rw_exit(&ill_g_lock);
2618 		nce->nce_pcnt--;
2619 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2620 		    nce->nce_pcnt >= -1);
2621 		if (nce->nce_pcnt > 0) {
2622 			/*
2623 			 * As per RFC2461, the nce gets deleted after
2624 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2625 			 * Note that the first unicast solicitation is sent
2626 			 * during the DELAY state.
2627 			 */
2628 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2629 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2630 			    addrbuf, sizeof (addrbuf))));
2631 			mutex_exit(&nce->nce_lock);
2632 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2633 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2634 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2635 			    NDP_UNICAST);
2636 			if (dropped) {
2637 				mutex_enter(&nce->nce_lock);
2638 				nce->nce_pcnt++;
2639 				mutex_exit(&nce->nce_lock);
2640 			}
2641 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2642 		} else if (nce->nce_pcnt < 0) {
2643 			/* No hope, delete the nce */
2644 			nce->nce_state = ND_UNREACHABLE;
2645 			mutex_exit(&nce->nce_lock);
2646 			if (ip_debug > 2) {
2647 				/* ip1dbg */
2648 				pr_addr_dbg("ndp_timer: Delete IRE for"
2649 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2650 			}
2651 			ndp_delete(nce);
2652 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2653 			/* Wait RetransTimer, before deleting the entry */
2654 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2655 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2656 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2657 			mutex_exit(&nce->nce_lock);
2658 			/* Wait one interval before killing */
2659 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2660 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2661 			ipif_t *ipif;
2662 
2663 			/*
2664 			 * We're done probing, and we can now declare this
2665 			 * address to be usable.  Let IP know that it's ok to
2666 			 * use.
2667 			 */
2668 			nce->nce_state = ND_REACHABLE;
2669 			mutex_exit(&nce->nce_lock);
2670 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2671 			    ALL_ZONES, NULL, NULL, NULL, NULL);
2672 			if (ipif != NULL) {
2673 				if (ipif->ipif_was_dup) {
2674 					char ibuf[LIFNAMSIZ + 10];
2675 					char sbuf[INET6_ADDRSTRLEN];
2676 
2677 					ipif->ipif_was_dup = B_FALSE;
2678 					(void) strlcpy(ibuf, ill->ill_name,
2679 					    sizeof (ibuf));
2680 					(void) inet_ntop(AF_INET6,
2681 					    &ipif->ipif_v6lcl_addr,
2682 					    sbuf, sizeof (sbuf));
2683 					if (ipif->ipif_id != 0) {
2684 						(void) snprintf(ibuf +
2685 						    ill->ill_name_length - 1,
2686 						    sizeof (ibuf) -
2687 						    ill->ill_name_length + 1,
2688 						    ":%d", ipif->ipif_id);
2689 					}
2690 					cmn_err(CE_NOTE, "recovered address "
2691 					    "%s on %s", sbuf, ibuf);
2692 				}
2693 				if ((ipif->ipif_flags & IPIF_UP) &&
2694 				    !ipif->ipif_addr_ready) {
2695 					ip_rts_ifmsg(ipif);
2696 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2697 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2698 				}
2699 				ipif->ipif_addr_ready = 1;
2700 				ipif_refrele(ipif);
2701 			}
2702 			/* Begin defending our new address */
2703 			nce->nce_unsolicit_count = 0;
2704 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2705 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2706 			    nce_advert_flags(nce));
2707 			if (dropped) {
2708 				nce->nce_unsolicit_count = 1;
2709 				NDP_RESTART_TIMER(nce,
2710 				    ip_ndp_unsolicit_interval);
2711 			} else if (ip_ndp_defense_interval != 0) {
2712 				NDP_RESTART_TIMER(nce, ip_ndp_defense_interval);
2713 			}
2714 		} else {
2715 			/*
2716 			 * This is an address we're probing to be our own, but
2717 			 * the ill is down.  Wait until it comes back before
2718 			 * doing anything, but switch to reachable state so
2719 			 * that the restart will work.
2720 			 */
2721 			nce->nce_state = ND_REACHABLE;
2722 			mutex_exit(&nce->nce_lock);
2723 		}
2724 		NCE_REFRELE(nce);
2725 		return;
2726 	case ND_INCOMPLETE:
2727 		/*
2728 		 * Must be resolvers retransmit timer.
2729 		 */
2730 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2731 			ip6i_t	*ip6i;
2732 			ip6_t	*ip6h;
2733 			mblk_t *data_mp;
2734 
2735 			/*
2736 			 * Walk the list of packets queued, and see if there
2737 			 * are any multipathing probe packets. Such packets
2738 			 * are always queued at the head. Since this is a
2739 			 * retransmit timer firing, mark such packets as
2740 			 * delayed in ND resolution. This info will be used
2741 			 * in ip_wput_v6(). Multipathing probe packets will
2742 			 * always have an ip6i_t. Once we hit a packet without
2743 			 * it, we can break out of this loop.
2744 			 */
2745 			if (mp->b_datap->db_type == M_CTL)
2746 				data_mp = mp->b_cont;
2747 			else
2748 				data_mp = mp;
2749 
2750 			ip6h = (ip6_t *)data_mp->b_rptr;
2751 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2752 				break;
2753 
2754 			/*
2755 			 * This message should have been pulled up already in
2756 			 * ip_wput_v6. We can't do pullups here because the
2757 			 * b_next/b_prev is non-NULL.
2758 			 */
2759 			ip6i = (ip6i_t *)ip6h;
2760 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2761 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2762 
2763 			/* Mark this packet as delayed due to ND resolution */
2764 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2765 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2766 		}
2767 		if (nce->nce_qd_mp != NULL) {
2768 			ms = nce_solicit(nce, NULL);
2769 			rw_exit(&ill_g_lock);
2770 			if (ms == 0) {
2771 				if (nce->nce_state != ND_REACHABLE) {
2772 					mutex_exit(&nce->nce_lock);
2773 					nce_resolv_failed(nce);
2774 					ndp_delete(nce);
2775 				} else {
2776 					mutex_exit(&nce->nce_lock);
2777 				}
2778 			} else {
2779 				mutex_exit(&nce->nce_lock);
2780 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2781 			}
2782 			NCE_REFRELE(nce);
2783 			return;
2784 		}
2785 		mutex_exit(&nce->nce_lock);
2786 		rw_exit(&ill_g_lock);
2787 		NCE_REFRELE(nce);
2788 		break;
2789 	case ND_REACHABLE :
2790 		rw_exit(&ill_g_lock);
2791 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2792 		    nce->nce_unsolicit_count != 0) ||
2793 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2794 		    ip_ndp_defense_interval != 0)) {
2795 			if (nce->nce_unsolicit_count > 0)
2796 				nce->nce_unsolicit_count--;
2797 			mutex_exit(&nce->nce_lock);
2798 			dropped = nce_xmit(ill,
2799 			    ND_NEIGHBOR_ADVERT,
2800 			    ill,	/* ill to be used for hw addr */
2801 			    B_FALSE,	/* use ill_phys_addr */
2802 			    &nce->nce_addr,
2803 			    &ipv6_all_hosts_mcast,
2804 			    nce_advert_flags(nce));
2805 			if (dropped) {
2806 				mutex_enter(&nce->nce_lock);
2807 				nce->nce_unsolicit_count++;
2808 				mutex_exit(&nce->nce_lock);
2809 			}
2810 			if (nce->nce_unsolicit_count != 0) {
2811 				NDP_RESTART_TIMER(nce,
2812 				    ip_ndp_unsolicit_interval);
2813 			} else {
2814 				NDP_RESTART_TIMER(nce,
2815 				    ip_ndp_defense_interval);
2816 			}
2817 		} else {
2818 			mutex_exit(&nce->nce_lock);
2819 		}
2820 		NCE_REFRELE(nce);
2821 		break;
2822 	default:
2823 		rw_exit(&ill_g_lock);
2824 		mutex_exit(&nce->nce_lock);
2825 		NCE_REFRELE(nce);
2826 		break;
2827 	}
2828 }
2829 
2830 /*
2831  * Set a link layer address from the ll_addr passed in.
2832  * Copy SAP from ill.
2833  */
2834 static void
2835 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2836 {
2837 	ill_t	*ill = nce->nce_ill;
2838 	uchar_t	*woffset;
2839 
2840 	ASSERT(ll_addr != NULL);
2841 	/* Always called before fast_path_probe */
2842 	ASSERT(nce->nce_fp_mp == NULL);
2843 	if (ill->ill_sap_length != 0) {
2844 		/*
2845 		 * Copy the SAP type specified in the
2846 		 * request into the xmit template.
2847 		 */
2848 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2849 	}
2850 	if (ill->ill_phys_addr_length > 0) {
2851 		/*
2852 		 * The bcopy() below used to be called for the physical address
2853 		 * length rather than the link layer address length. For
2854 		 * ethernet and many other media, the phys_addr and lla are
2855 		 * identical.
2856 		 * However, with xresolv interfaces being introduced, the
2857 		 * phys_addr and lla are no longer the same, and the physical
2858 		 * address may not have any useful meaning, so we use the lla
2859 		 * for IPv6 address resolution and destination addressing.
2860 		 *
2861 		 * For PPP or other interfaces with a zero length
2862 		 * physical address, don't do anything here.
2863 		 * The bcopy() with a zero phys_addr length was previously
2864 		 * a no-op for interfaces with a zero-length physical address.
2865 		 * Using the lla for them would change the way they operate.
2866 		 * Doing nothing in such cases preserves expected behavior.
2867 		 */
2868 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2869 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2870 	}
2871 }
2872 
2873 static boolean_t
2874 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2875 {
2876 	ill_t	*ill = nce->nce_ill;
2877 	uchar_t	*ll_offset;
2878 
2879 	ASSERT(nce->nce_res_mp != NULL);
2880 	if (ll_addr == NULL)
2881 		return (B_FALSE);
2882 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2883 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2884 		return (B_TRUE);
2885 	return (B_FALSE);
2886 }
2887 
2888 /*
2889  * Updates the link layer address or the reachability state of
2890  * a cache entry.  Reset probe counter if needed.
2891  */
2892 static void
2893 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2894 {
2895 	ill_t	*ill = nce->nce_ill;
2896 	boolean_t need_stop_timer = B_FALSE;
2897 	boolean_t need_fastpath_update = B_FALSE;
2898 
2899 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2900 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2901 	/*
2902 	 * If this interface does not do NUD, there is no point
2903 	 * in allowing an update to the cache entry.  Although
2904 	 * we will respond to NS.
2905 	 * The only time we accept an update for a resolver when
2906 	 * NUD is turned off is when it has just been created.
2907 	 * Non-Resolvers will always be created as REACHABLE.
2908 	 */
2909 	if (new_state != ND_UNCHANGED) {
2910 		if ((nce->nce_flags & NCE_F_NONUD) &&
2911 		    (nce->nce_state != ND_INCOMPLETE))
2912 			return;
2913 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2914 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2915 		need_stop_timer = B_TRUE;
2916 		if (new_state == ND_REACHABLE)
2917 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2918 		else {
2919 			/* We force NUD in this case */
2920 			nce->nce_last = 0;
2921 		}
2922 		nce->nce_state = new_state;
2923 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2924 	}
2925 	/*
2926 	 * In case of fast path we need to free the the fastpath
2927 	 * M_DATA and do another probe.  Otherwise we can just
2928 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2929 	 * whatever packets that happens to be transmitting at the time.
2930 	 */
2931 	if (new_ll_addr != NULL) {
2932 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2933 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2934 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2935 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2936 		if (nce->nce_fp_mp != NULL) {
2937 			freemsg(nce->nce_fp_mp);
2938 			nce->nce_fp_mp = NULL;
2939 		}
2940 		need_fastpath_update = B_TRUE;
2941 	}
2942 	mutex_exit(&nce->nce_lock);
2943 	if (need_stop_timer) {
2944 		(void) untimeout(nce->nce_timeout_id);
2945 		nce->nce_timeout_id = 0;
2946 	}
2947 	if (need_fastpath_update)
2948 		nce_fastpath(nce);
2949 	mutex_enter(&nce->nce_lock);
2950 }
2951 
2952 void
2953 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2954 {
2955 	uint_t	count = 0;
2956 	mblk_t  **mpp;
2957 
2958 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2959 
2960 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2961 	    mpp = &(*mpp)->b_next) {
2962 		if (++count >
2963 		    nce->nce_ill->ill_max_buf) {
2964 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2965 
2966 			nce->nce_qd_mp->b_next = NULL;
2967 			nce->nce_qd_mp->b_prev = NULL;
2968 			freemsg(nce->nce_qd_mp);
2969 			nce->nce_qd_mp = tmp;
2970 		}
2971 	}
2972 	/* put this on the list */
2973 	if (head_insert) {
2974 		mp->b_next = nce->nce_qd_mp;
2975 		nce->nce_qd_mp = mp;
2976 	} else {
2977 		*mpp = mp;
2978 	}
2979 }
2980 
2981 static void
2982 nce_queue_mp(nce_t *nce, mblk_t *mp)
2983 {
2984 	boolean_t head_insert = B_FALSE;
2985 	ip6_t	*ip6h;
2986 	ip6i_t	*ip6i;
2987 	mblk_t *data_mp;
2988 
2989 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2990 
2991 	if (mp->b_datap->db_type == M_CTL)
2992 		data_mp = mp->b_cont;
2993 	else
2994 		data_mp = mp;
2995 	ip6h = (ip6_t *)data_mp->b_rptr;
2996 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2997 		/*
2998 		 * This message should have been pulled up already in
2999 		 * ip_wput_v6. We can't do pullups here because the message
3000 		 * could be from the nce_qd_mp which could have b_next/b_prev
3001 		 * non-NULL.
3002 		 */
3003 		ip6i = (ip6i_t *)ip6h;
3004 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
3005 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3006 		/*
3007 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
3008 		 * This has 2 aspects mentioned below.
3009 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
3010 		 * This ensures that next retransmit of ND solicitation
3011 		 * will use the interface specified by the probe packet,
3012 		 * for both NS and NA. This corresponds to the src address
3013 		 * in the IPv6 packet. If we insert at tail, we will be
3014 		 * depending on the packet at the head for successful
3015 		 * ND resolution. This is not reliable, because the interface
3016 		 * on which the NA arrives could be different from the interface
3017 		 * on which the NS was sent, and if the receiving interface is
3018 		 * failed, it will appear that the sending interface is also
3019 		 * failed, causing in.mpathd to misdiagnose this as link
3020 		 * failure.
3021 		 * 2. Drop the original packet, if the ND resolution did not
3022 		 * succeed in the first attempt. However we will create the
3023 		 * nce and the ire, as soon as the ND resolution succeeds.
3024 		 * We don't gain anything by queueing multiple probe packets
3025 		 * and sending them back-to-back once resolution succeeds.
3026 		 * It is sufficient to send just 1 packet after ND resolution
3027 		 * succeeds. Since mpathd is sending down probe packets at a
3028 		 * constant rate, we don't need to send the queued packet. We
3029 		 * need to queue it only for NDP resolution. The benefit of
3030 		 * dropping the probe packets that were delayed in ND
3031 		 * resolution, is that in.mpathd will not see inflated
3032 		 * RTT. If the ND resolution does not succeed within
3033 		 * in.mpathd's failure detection time, mpathd may detect
3034 		 * a failure, and it does not matter whether the packet
3035 		 * was queued or dropped.
3036 		 */
3037 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
3038 			head_insert = B_TRUE;
3039 	}
3040 
3041 	nce_queue_mp_common(nce, mp, head_insert);
3042 }
3043 
3044 /*
3045  * Called when address resolution failed due to a timeout.
3046  * Send an ICMP unreachable in response to all queued packets.
3047  */
3048 void
3049 nce_resolv_failed(nce_t *nce)
3050 {
3051 	mblk_t	*mp, *nxt_mp, *first_mp;
3052 	char	buf[INET6_ADDRSTRLEN];
3053 	ip6_t *ip6h;
3054 	zoneid_t zoneid = GLOBAL_ZONEID;
3055 
3056 	ip1dbg(("nce_resolv_failed: dst %s\n",
3057 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3058 	mutex_enter(&nce->nce_lock);
3059 	mp = nce->nce_qd_mp;
3060 	nce->nce_qd_mp = NULL;
3061 	mutex_exit(&nce->nce_lock);
3062 	while (mp != NULL) {
3063 		nxt_mp = mp->b_next;
3064 		mp->b_next = NULL;
3065 		mp->b_prev = NULL;
3066 
3067 		first_mp = mp;
3068 		if (mp->b_datap->db_type == M_CTL) {
3069 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3070 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3071 			zoneid = io->ipsec_out_zoneid;
3072 			ASSERT(zoneid != ALL_ZONES);
3073 			mp = mp->b_cont;
3074 		}
3075 
3076 		ip6h = (ip6_t *)mp->b_rptr;
3077 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3078 			ip6i_t *ip6i;
3079 			/*
3080 			 * This message should have been pulled up already
3081 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3082 			 * the header is pulled up.
3083 			 */
3084 			ip6i = (ip6i_t *)ip6h;
3085 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3086 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3087 			mp->b_rptr += sizeof (ip6i_t);
3088 		}
3089 		/*
3090 		 * Ignore failure since icmp_unreachable_v6 will silently
3091 		 * drop packets with an unspecified source address.
3092 		 */
3093 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
3094 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3095 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE);
3096 		mp = nxt_mp;
3097 	}
3098 }
3099 
3100 /*
3101  * Called by SIOCSNDP* ioctl to add/change an nce entry
3102  * and the corresponding attributes.
3103  * Disallow states other than ND_REACHABLE or ND_STALE.
3104  */
3105 int
3106 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3107 {
3108 	sin6_t		*sin6;
3109 	in6_addr_t	*addr;
3110 	nce_t		*nce;
3111 	int		err;
3112 	uint16_t	new_flags = 0;
3113 	uint16_t	old_flags = 0;
3114 	int		inflags = lnr->lnr_flags;
3115 
3116 	ASSERT(ill->ill_isv6);
3117 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3118 	    (lnr->lnr_state_create != ND_STALE))
3119 		return (EINVAL);
3120 
3121 	sin6 = (sin6_t *)&lnr->lnr_addr;
3122 	addr = &sin6->sin6_addr;
3123 
3124 	mutex_enter(&ndp6.ndp_g_lock);
3125 	/* We know it can not be mapping so just look in the hash table */
3126 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr));
3127 	nce = nce_lookup_addr(ill, addr, nce);
3128 	if (nce != NULL)
3129 		new_flags = nce->nce_flags;
3130 
3131 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3132 	case NDF_ISROUTER_ON:
3133 		new_flags |= NCE_F_ISROUTER;
3134 		break;
3135 	case NDF_ISROUTER_OFF:
3136 		new_flags &= ~NCE_F_ISROUTER;
3137 		break;
3138 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3139 		mutex_exit(&ndp6.ndp_g_lock);
3140 		if (nce != NULL)
3141 			NCE_REFRELE(nce);
3142 		return (EINVAL);
3143 	}
3144 
3145 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3146 	case NDF_ANYCAST_ON:
3147 		new_flags |= NCE_F_ANYCAST;
3148 		break;
3149 	case NDF_ANYCAST_OFF:
3150 		new_flags &= ~NCE_F_ANYCAST;
3151 		break;
3152 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3153 		mutex_exit(&ndp6.ndp_g_lock);
3154 		if (nce != NULL)
3155 			NCE_REFRELE(nce);
3156 		return (EINVAL);
3157 	}
3158 
3159 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
3160 	case NDF_PROXY_ON:
3161 		new_flags |= NCE_F_PROXY;
3162 		break;
3163 	case NDF_PROXY_OFF:
3164 		new_flags &= ~NCE_F_PROXY;
3165 		break;
3166 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
3167 		mutex_exit(&ndp6.ndp_g_lock);
3168 		if (nce != NULL)
3169 			NCE_REFRELE(nce);
3170 		return (EINVAL);
3171 	}
3172 
3173 	if (nce == NULL) {
3174 		err = ndp_add(ill,
3175 		    (uchar_t *)lnr->lnr_hdw_addr,
3176 		    addr,
3177 		    &ipv6_all_ones,
3178 		    &ipv6_all_zeros,
3179 		    0,
3180 		    new_flags,
3181 		    lnr->lnr_state_create,
3182 		    &nce,
3183 		    NULL,
3184 		    NULL);
3185 		if (err != 0) {
3186 			mutex_exit(&ndp6.ndp_g_lock);
3187 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3188 			return (err);
3189 		}
3190 	}
3191 	old_flags = nce->nce_flags;
3192 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3193 		/*
3194 		 * Router turned to host, delete all ires.
3195 		 * XXX Just delete the entry, but we need to add too.
3196 		 */
3197 		nce->nce_flags &= ~NCE_F_ISROUTER;
3198 		mutex_exit(&ndp6.ndp_g_lock);
3199 		ndp_delete(nce);
3200 		NCE_REFRELE(nce);
3201 		return (0);
3202 	}
3203 	mutex_exit(&ndp6.ndp_g_lock);
3204 
3205 	mutex_enter(&nce->nce_lock);
3206 	nce->nce_flags = new_flags;
3207 	mutex_exit(&nce->nce_lock);
3208 	/*
3209 	 * Note that we ignore the state at this point, which
3210 	 * should be either STALE or REACHABLE.  Instead we let
3211 	 * the link layer address passed in to determine the state
3212 	 * much like incoming packets.
3213 	 */
3214 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3215 	NCE_REFRELE(nce);
3216 	return (0);
3217 }
3218 
3219 /*
3220  * If the device driver supports it, we make nce_fp_mp to have
3221  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3222  * The caller insures there is hold on nce for this function.
3223  * Note that since ill_fastpath_probe() copies the mblk there is
3224  * no need for the hold beyond this function.
3225  */
3226 static void
3227 nce_fastpath(nce_t *nce)
3228 {
3229 	ill_t	*ill = nce->nce_ill;
3230 	int res;
3231 
3232 	ASSERT(ill != NULL);
3233 	if (nce->nce_fp_mp != NULL) {
3234 		/* Already contains fastpath info */
3235 		return;
3236 	}
3237 	if (nce->nce_res_mp != NULL) {
3238 		nce_fastpath_list_add(nce);
3239 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3240 		/*
3241 		 * EAGAIN is an indication of a transient error
3242 		 * i.e. allocation failure etc. leave the nce in the list it
3243 		 * will be updated when another probe happens for another ire
3244 		 * if not it will be taken out of the list when the ire is
3245 		 * deleted.
3246 		 */
3247 
3248 		if (res != 0 && res != EAGAIN)
3249 			nce_fastpath_list_delete(nce);
3250 	}
3251 }
3252 
3253 /*
3254  * Drain the list of nce's waiting for fastpath response.
3255  */
3256 void
3257 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3258     void *arg)
3259 {
3260 
3261 	nce_t *next_nce;
3262 	nce_t *current_nce;
3263 	nce_t *first_nce;
3264 	nce_t *prev_nce = NULL;
3265 
3266 	ASSERT(ill != NULL && ill->ill_isv6);
3267 
3268 	mutex_enter(&ill->ill_lock);
3269 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3270 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3271 		next_nce = current_nce->nce_fastpath;
3272 		/*
3273 		 * Take it off the list if we're flushing, or if the callback
3274 		 * routine tells us to do so.  Otherwise, leave the nce in the
3275 		 * fastpath list to handle any pending response from the lower
3276 		 * layer.  We can't drain the list when the callback routine
3277 		 * comparison failed, because the response is asynchronous in
3278 		 * nature, and may not arrive in the same order as the list
3279 		 * insertion.
3280 		 */
3281 		if (func == NULL || func(current_nce, arg)) {
3282 			current_nce->nce_fastpath = NULL;
3283 			if (current_nce == first_nce)
3284 				ill->ill_fastpath_list = first_nce = next_nce;
3285 			else
3286 				prev_nce->nce_fastpath = next_nce;
3287 		} else {
3288 			/* previous element that is still in the list */
3289 			prev_nce = current_nce;
3290 		}
3291 		current_nce = next_nce;
3292 	}
3293 	mutex_exit(&ill->ill_lock);
3294 }
3295 
3296 /*
3297  * Add nce to the nce fastpath list.
3298  */
3299 void
3300 nce_fastpath_list_add(nce_t *nce)
3301 {
3302 	ill_t *ill;
3303 
3304 	ill = nce->nce_ill;
3305 	ASSERT(ill != NULL && ill->ill_isv6);
3306 
3307 	mutex_enter(&ill->ill_lock);
3308 	mutex_enter(&nce->nce_lock);
3309 
3310 	/*
3311 	 * if nce has not been deleted and
3312 	 * is not already in the list add it.
3313 	 */
3314 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3315 	    (nce->nce_fastpath == NULL)) {
3316 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3317 		ill->ill_fastpath_list = nce;
3318 	}
3319 
3320 	mutex_exit(&nce->nce_lock);
3321 	mutex_exit(&ill->ill_lock);
3322 }
3323 
3324 /*
3325  * remove nce from the nce fastpath list.
3326  */
3327 void
3328 nce_fastpath_list_delete(nce_t *nce)
3329 {
3330 	nce_t *nce_ptr;
3331 
3332 	ill_t *ill;
3333 
3334 	ill = nce->nce_ill;
3335 	ASSERT(ill != NULL);
3336 	if (!ill->ill_isv6)  {
3337 		/*
3338 		 * v4 nce_t's do not have nce_fastpath set.
3339 		 */
3340 		return;
3341 	}
3342 
3343 	mutex_enter(&ill->ill_lock);
3344 	if (nce->nce_fastpath == NULL)
3345 		goto done;
3346 
3347 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3348 
3349 	if (ill->ill_fastpath_list == nce) {
3350 		ill->ill_fastpath_list = nce->nce_fastpath;
3351 	} else {
3352 		nce_ptr = ill->ill_fastpath_list;
3353 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3354 			if (nce_ptr->nce_fastpath == nce) {
3355 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3356 				break;
3357 			}
3358 			nce_ptr = nce_ptr->nce_fastpath;
3359 		}
3360 	}
3361 
3362 	nce->nce_fastpath = NULL;
3363 done:
3364 	mutex_exit(&ill->ill_lock);
3365 }
3366 
3367 /*
3368  * Update all NCE's that are not in fastpath mode and
3369  * have an nce_fp_mp that matches mp. mp->b_cont contains
3370  * the fastpath header.
3371  *
3372  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3373  */
3374 boolean_t
3375 ndp_fastpath_update(nce_t *nce, void *arg)
3376 {
3377 	mblk_t 	*mp, *fp_mp;
3378 	uchar_t	*mp_rptr, *ud_mp_rptr;
3379 	mblk_t	*ud_mp = nce->nce_res_mp;
3380 	ptrdiff_t	cmplen;
3381 
3382 	if (nce->nce_flags & NCE_F_MAPPING)
3383 		return (B_TRUE);
3384 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3385 		return (B_TRUE);
3386 
3387 	ip2dbg(("ndp_fastpath_update: trying\n"));
3388 	mp = (mblk_t *)arg;
3389 	mp_rptr = mp->b_rptr;
3390 	cmplen = mp->b_wptr - mp_rptr;
3391 	ASSERT(cmplen >= 0);
3392 	ud_mp_rptr = ud_mp->b_rptr;
3393 	/*
3394 	 * The nce is locked here to prevent any other threads
3395 	 * from accessing and changing nce_res_mp when the IPv6 address
3396 	 * becomes resolved to an lla while we're in the middle
3397 	 * of looking at and comparing the hardware address (lla).
3398 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3399 	 * from examining nce_res_mp atthe same time.
3400 	 */
3401 	mutex_enter(&nce->nce_lock);
3402 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3403 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3404 		mutex_exit(&nce->nce_lock);
3405 		/*
3406 		 * Don't take the ire off the fastpath list yet,
3407 		 * since the response may come later.
3408 		 */
3409 		return (B_FALSE);
3410 	}
3411 	/* Matched - install mp as the fastpath mp */
3412 	ip1dbg(("ndp_fastpath_update: match\n"));
3413 	fp_mp = dupb(mp->b_cont);
3414 	if (fp_mp != NULL) {
3415 		nce->nce_fp_mp = fp_mp;
3416 	}
3417 	mutex_exit(&nce->nce_lock);
3418 	return (B_TRUE);
3419 }
3420 
3421 /*
3422  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3423  * driver.  Note that it assumes IP is exclusive...
3424  */
3425 /* ARGSUSED */
3426 void
3427 ndp_fastpath_flush(nce_t *nce, char *arg)
3428 {
3429 	if (nce->nce_flags & NCE_F_MAPPING)
3430 		return;
3431 	/* No fastpath info? */
3432 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3433 		return;
3434 
3435 	/* Just delete the NCE... */
3436 	ndp_delete(nce);
3437 }
3438 
3439 /*
3440  * Return a pointer to a given option in the packet.
3441  * Assumes that option part of the packet have already been validated.
3442  */
3443 nd_opt_hdr_t *
3444 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3445 {
3446 	while (optlen > 0) {
3447 		if (opt->nd_opt_type == opt_type)
3448 			return (opt);
3449 		optlen -= 8 * opt->nd_opt_len;
3450 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3451 	}
3452 	return (NULL);
3453 }
3454 
3455 /*
3456  * Verify all option lengths present are > 0, also check to see
3457  * if the option lengths and packet length are consistent.
3458  */
3459 boolean_t
3460 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3461 {
3462 	ASSERT(opt != NULL);
3463 	while (optlen > 0) {
3464 		if (opt->nd_opt_len == 0)
3465 			return (B_FALSE);
3466 		optlen -= 8 * opt->nd_opt_len;
3467 		if (optlen < 0)
3468 			return (B_FALSE);
3469 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3470 	}
3471 	return (B_TRUE);
3472 }
3473 
3474 /*
3475  * ndp_walk function.
3476  * Free a fraction of the NCE cache entries.
3477  * A fraction of zero means to not free any in that category.
3478  */
3479 void
3480 ndp_cache_reclaim(nce_t *nce, char *arg)
3481 {
3482 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3483 	uint_t	rand;
3484 
3485 	if (nce->nce_flags & NCE_F_PERMANENT)
3486 		return;
3487 
3488 	rand = (uint_t)lbolt +
3489 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3490 	if (ncr->ncr_host != 0 &&
3491 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3492 		ndp_delete(nce);
3493 		return;
3494 	}
3495 }
3496 
3497 /*
3498  * ndp_walk function.
3499  * Count the number of NCEs that can be deleted.
3500  * These would be hosts but not routers.
3501  */
3502 void
3503 ndp_cache_count(nce_t *nce, char *arg)
3504 {
3505 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3506 
3507 	if (nce->nce_flags & NCE_F_PERMANENT)
3508 		return;
3509 
3510 	ncc->ncc_total++;
3511 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3512 		ncc->ncc_host++;
3513 }
3514 
3515 #ifdef NCE_DEBUG
3516 th_trace_t *
3517 th_trace_nce_lookup(nce_t *nce)
3518 {
3519 	int bucket_id;
3520 	th_trace_t *th_trace;
3521 
3522 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3523 
3524 	bucket_id = IP_TR_HASH(curthread);
3525 	ASSERT(bucket_id < IP_TR_HASH_MAX);
3526 
3527 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
3528 	    th_trace = th_trace->th_next) {
3529 		if (th_trace->th_id == curthread)
3530 			return (th_trace);
3531 	}
3532 	return (NULL);
3533 }
3534 
3535 void
3536 nce_trace_ref(nce_t *nce)
3537 {
3538 	int bucket_id;
3539 	th_trace_t *th_trace;
3540 
3541 	/*
3542 	 * Attempt to locate the trace buffer for the curthread.
3543 	 * If it does not exist, then allocate a new trace buffer
3544 	 * and link it in list of trace bufs for this ipif, at the head
3545 	 */
3546 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3547 
3548 	if (nce->nce_trace_disable == B_TRUE)
3549 		return;
3550 
3551 	th_trace = th_trace_nce_lookup(nce);
3552 	if (th_trace == NULL) {
3553 		bucket_id = IP_TR_HASH(curthread);
3554 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
3555 		    KM_NOSLEEP);
3556 		if (th_trace == NULL) {
3557 			nce->nce_trace_disable = B_TRUE;
3558 			nce_trace_inactive(nce);
3559 			return;
3560 		}
3561 		th_trace->th_id = curthread;
3562 		th_trace->th_next = nce->nce_trace[bucket_id];
3563 		th_trace->th_prev = &nce->nce_trace[bucket_id];
3564 		if (th_trace->th_next != NULL)
3565 			th_trace->th_next->th_prev = &th_trace->th_next;
3566 		nce->nce_trace[bucket_id] = th_trace;
3567 	}
3568 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
3569 	th_trace->th_refcnt++;
3570 	th_trace_rrecord(th_trace);
3571 }
3572 
3573 void
3574 nce_untrace_ref(nce_t *nce)
3575 {
3576 	th_trace_t *th_trace;
3577 
3578 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3579 
3580 	if (nce->nce_trace_disable == B_TRUE)
3581 		return;
3582 
3583 	th_trace = th_trace_nce_lookup(nce);
3584 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
3585 
3586 	th_trace_rrecord(th_trace);
3587 	th_trace->th_refcnt--;
3588 }
3589 
3590 void
3591 nce_trace_inactive(nce_t *nce)
3592 {
3593 	th_trace_t *th_trace;
3594 	int i;
3595 
3596 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3597 
3598 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
3599 		while (nce->nce_trace[i] != NULL) {
3600 			th_trace = nce->nce_trace[i];
3601 
3602 			/* unlink th_trace and free it */
3603 			nce->nce_trace[i] = th_trace->th_next;
3604 			if (th_trace->th_next != NULL)
3605 				th_trace->th_next->th_prev =
3606 				    &nce->nce_trace[i];
3607 
3608 			th_trace->th_next = NULL;
3609 			th_trace->th_prev = NULL;
3610 			kmem_free(th_trace, sizeof (th_trace_t));
3611 		}
3612 	}
3613 
3614 }
3615 
3616 /* ARGSUSED */
3617 int
3618 nce_thread_exit(nce_t *nce, caddr_t arg)
3619 {
3620 	th_trace_t	*th_trace;
3621 
3622 	mutex_enter(&nce->nce_lock);
3623 	th_trace = th_trace_nce_lookup(nce);
3624 
3625 	if (th_trace == NULL) {
3626 		mutex_exit(&nce->nce_lock);
3627 		return (0);
3628 	}
3629 
3630 	ASSERT(th_trace->th_refcnt == 0);
3631 
3632 	/* unlink th_trace and free it */
3633 	*th_trace->th_prev = th_trace->th_next;
3634 	if (th_trace->th_next != NULL)
3635 		th_trace->th_next->th_prev = th_trace->th_prev;
3636 	th_trace->th_next = NULL;
3637 	th_trace->th_prev = NULL;
3638 	kmem_free(th_trace, sizeof (th_trace_t));
3639 	mutex_exit(&nce->nce_lock);
3640 	return (0);
3641 }
3642 #endif
3643 
3644 /*
3645  * Called when address resolution fails due to a timeout.
3646  * Send an ICMP unreachable in response to all queued packets.
3647  */
3648 void
3649 arp_resolv_failed(nce_t *nce)
3650 {
3651 	mblk_t	*mp, *nxt_mp, *first_mp;
3652 	char	buf[INET6_ADDRSTRLEN];
3653 	zoneid_t zoneid = GLOBAL_ZONEID;
3654 	struct in_addr ipv4addr;
3655 
3656 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3657 	ip3dbg(("arp_resolv_failed: dst %s\n",
3658 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3659 	mutex_enter(&nce->nce_lock);
3660 	mp = nce->nce_qd_mp;
3661 	nce->nce_qd_mp = NULL;
3662 	mutex_exit(&nce->nce_lock);
3663 
3664 	while (mp != NULL) {
3665 		nxt_mp = mp->b_next;
3666 		mp->b_next = NULL;
3667 		mp->b_prev = NULL;
3668 
3669 		first_mp = mp;
3670 		/*
3671 		 * Send icmp unreachable messages
3672 		 * to the hosts.
3673 		 */
3674 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid);
3675 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3676 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3677 		    ICMP_HOST_UNREACHABLE);
3678 		mp = nxt_mp;
3679 	}
3680 }
3681 
3682 static int
3683 ndp_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3684     const in_addr_t *mask, const in_addr_t *extract_mask,
3685     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3686     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3687 {
3688 	int	err = 0;
3689 	nce_t	*nce;
3690 	in6_addr_t addr6;
3691 
3692 	mutex_enter(&ndp4.ndp_g_lock);
3693 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr));
3694 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3695 	nce = nce_lookup_addr(ill, &addr6, nce);
3696 	if (nce == NULL) {
3697 		err = ndp_add_v4(ill,
3698 		    hw_addr,
3699 		    addr,
3700 		    mask,
3701 		    extract_mask,
3702 		    hw_extract_start,
3703 		    flags,
3704 		    state,
3705 		    newnce,
3706 		    fp_mp,
3707 		    res_mp);
3708 	} else {
3709 		*newnce = nce;
3710 		err = EEXIST;
3711 	}
3712 	mutex_exit(&ndp4.ndp_g_lock);
3713 	return (err);
3714 }
3715 
3716 /*
3717  * NDP Cache Entry creation routine for IPv4.
3718  * Mapped entries are handled in arp.
3719  * This routine must always be called with ndp4.ndp_g_lock held.
3720  * Prior to return, nce_refcnt is incremented.
3721  */
3722 static int
3723 ndp_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3724     const in_addr_t *mask, const in_addr_t *extract_mask,
3725     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3726     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3727 {
3728 	static	nce_t		nce_nil;
3729 	nce_t		*nce;
3730 	mblk_t		*mp;
3731 	mblk_t		*template;
3732 	nce_t		**ncep;
3733 
3734 	ASSERT(MUTEX_HELD(&ndp4.ndp_g_lock));
3735 	ASSERT(ill != NULL);
3736 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
3737 		return (EINVAL);
3738 	}
3739 	ASSERT((flags & NCE_F_MAPPING) == 0);
3740 	ASSERT(extract_mask == NULL);
3741 	/*
3742 	 * Allocate the mblk to hold the nce.
3743 	 */
3744 	mp = allocb(sizeof (nce_t), BPRI_MED);
3745 	if (mp == NULL)
3746 		return (ENOMEM);
3747 
3748 	nce = (nce_t *)mp->b_rptr;
3749 	mp->b_wptr = (uchar_t *)&nce[1];
3750 	*nce = nce_nil;
3751 
3752 	/*
3753 	 * This one holds link layer address; if res_mp has been provided
3754 	 * by the caller, accept it without any further checks. Otherwise,
3755 	 * for V4, we fill it up with ill_resolver_mp here, then in
3756 	 * in ire_arpresolve(), we fill it up with the ARP query
3757 	 * once its formulated.
3758 	 */
3759 	if (res_mp != NULL) {
3760 		template = res_mp;
3761 	} else  {
3762 		template = copyb(ill->ill_resolver_mp);
3763 	}
3764 	if (template == NULL) {
3765 		freeb(mp);
3766 		return (ENOMEM);
3767 	}
3768 	nce->nce_ill = ill;
3769 	nce->nce_ipversion = IPV4_VERSION;
3770 	nce->nce_flags = flags;
3771 	nce->nce_state = state;
3772 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3773 	nce->nce_rcnt = ill->ill_xmit_count;
3774 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3775 	if (*mask == IP_HOST_MASK) {
3776 		nce->nce_mask = ipv6_all_ones;
3777 	} else  {
3778 		IN6_IPADDR_TO_V4MAPPED(*mask, &nce->nce_mask);
3779 	}
3780 	nce->nce_extract_mask = ipv6_all_zeros;
3781 	nce->nce_ll_extract_start = hw_extract_start;
3782 	nce->nce_fp_mp = (fp_mp? fp_mp : NULL);
3783 	nce->nce_res_mp = template;
3784 	if (state == ND_REACHABLE)
3785 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3786 	else
3787 		nce->nce_last = 0;
3788 	nce->nce_qd_mp = NULL;
3789 	nce->nce_mp = mp;
3790 	if (hw_addr != NULL)
3791 		nce_set_ll(nce, hw_addr);
3792 	/* This one is for nce getting created */
3793 	nce->nce_refcnt = 1;
3794 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3795 	ncep = ((nce_t **)NCE_HASH_PTR_V4(*addr));
3796 
3797 #ifdef NCE_DEBUG
3798 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
3799 #endif
3800 	/*
3801 	 * Atomically ensure that the ill is not CONDEMNED, before
3802 	 * adding the NCE.
3803 	 */
3804 	mutex_enter(&ill->ill_lock);
3805 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3806 		mutex_exit(&ill->ill_lock);
3807 		freeb(mp);
3808 		if (res_mp == NULL) {
3809 			/*
3810 			 * template was locally allocated. need to free it.
3811 			 */
3812 			freeb(template);
3813 		}
3814 		return (EINVAL);
3815 	}
3816 	if ((nce->nce_next = *ncep) != NULL)
3817 		nce->nce_next->nce_ptpn = &nce->nce_next;
3818 	*ncep = nce;
3819 	nce->nce_ptpn = ncep;
3820 	*newnce = nce;
3821 	/* This one is for nce being used by an active thread */
3822 	NCE_REFHOLD(*newnce);
3823 
3824 	/* Bump up the number of nce's referencing this ill */
3825 	ill->ill_nce_cnt++;
3826 	mutex_exit(&ill->ill_lock);
3827 	return (0);
3828 }
3829 
3830 void
3831 ndp_flush_qd_mp(nce_t *nce)
3832 {
3833 	mblk_t *qd_mp, *qd_next;
3834 
3835 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3836 	qd_mp = nce->nce_qd_mp;
3837 	nce->nce_qd_mp = NULL;
3838 	while (qd_mp != NULL) {
3839 		qd_next = qd_mp->b_next;
3840 		qd_mp->b_next = NULL;
3841 		qd_mp->b_prev = NULL;
3842 		freemsg(qd_mp);
3843 		qd_mp = qd_next;
3844 	}
3845 }
3846 
3847 nce_t *
3848 nce_reinit(nce_t *nce)
3849 {
3850 	nce_t *newnce = NULL;
3851 	in_addr_t nce_addr, nce_mask;
3852 
3853 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3854 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_mask, nce_mask);
3855 	/*
3856 	 * delete the old one. this will get rid of any ire's pointing
3857 	 * at this nce.
3858 	 */
3859 	ndp_delete(nce);
3860 	/*
3861 	 * create a new nce with the same addr and mask.
3862 	 */
3863 	mutex_enter(&ndp4.ndp_g_lock);
3864 	(void) ndp_add_v4(nce->nce_ill, NULL, &nce_addr, &nce_mask, NULL, 0, 0,
3865 	    ND_INITIAL, &newnce, NULL, NULL);
3866 	mutex_exit(&ndp4.ndp_g_lock);
3867 	/*
3868 	 * refrele the old nce.
3869 	 */
3870 	NCE_REFRELE(nce);
3871 	return (newnce);
3872 }
3873 
3874 /*
3875  * ndp_walk routine to delete all entries that have a given destination or
3876  * gateway address and cached link layer (MAC) address.  This is used when ARP
3877  * informs us that a network-to-link-layer mapping may have changed.
3878  */
3879 void
3880 nce_delete_hw_changed(nce_t *nce, void *arg)
3881 {
3882 	nce_hw_map_t *hwm = arg;
3883 	mblk_t *mp;
3884 	dl_unitdata_req_t *dlu;
3885 	uchar_t *macaddr;
3886 	ill_t *ill;
3887 	int saplen;
3888 	ipaddr_t nce_addr;
3889 
3890 	if (nce->nce_state != ND_REACHABLE)
3891 		return;
3892 
3893 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3894 	if (nce_addr != hwm->hwm_addr)
3895 		return;
3896 
3897 	mutex_enter(&nce->nce_lock);
3898 	if ((mp = nce->nce_res_mp) == NULL) {
3899 		mutex_exit(&nce->nce_lock);
3900 		return;
3901 	}
3902 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3903 	macaddr = (uchar_t *)(dlu + 1);
3904 	ill = nce->nce_ill;
3905 	if ((saplen = ill->ill_sap_length) > 0)
3906 		macaddr += saplen;
3907 	else
3908 		saplen = -saplen;
3909 
3910 	/*
3911 	 * If the hardware address is unchanged, then leave this one alone.
3912 	 * Note that saplen == abs(saplen) now.
3913 	 */
3914 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3915 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3916 		mutex_exit(&nce->nce_lock);
3917 		return;
3918 	}
3919 	mutex_exit(&nce->nce_lock);
3920 
3921 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3922 	ndp_delete(nce);
3923 }
3924 
3925 /*
3926  * This function verifies whether a given IPv4 address is potentially known to
3927  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3928  * so that it can continue to look for hardware changes on that address.
3929  */
3930 boolean_t
3931 ndp_lookup_ipaddr(in_addr_t addr)
3932 {
3933 	nce_t		*nce;
3934 	struct in_addr	nceaddr;
3935 
3936 	if (addr == INADDR_ANY)
3937 		return (B_FALSE);
3938 
3939 	mutex_enter(&ndp4.ndp_g_lock);
3940 	nce = *(nce_t **)NCE_HASH_PTR_V4(addr);
3941 	for (; nce != NULL; nce = nce->nce_next) {
3942 		/* Note that only v4 mapped entries are in the table. */
3943 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3944 		if (addr == nceaddr.s_addr &&
3945 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3946 			/* Single flag check; no lock needed */
3947 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3948 				break;
3949 		}
3950 	}
3951 	mutex_exit(&ndp4.ndp_g_lock);
3952 	return (nce != NULL);
3953 }
3954