xref: /titanic_44/usr/src/uts/common/inet/ip/ip_ndp.c (revision 5151fb1220e0ceafdc172203863c73da4285c170)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ipclassifier.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_rts.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_ndp.h>
66 #include <inet/ipsec_impl.h>
67 #include <inet/ipsec_info.h>
68 #include <inet/sctp_ip.h>
69 
70 /*
71  * Function names with nce_ prefix are static while function
72  * names with ndp_ prefix are used by rest of the IP.
73  *
74  * Lock ordering:
75  *
76  *	ndp_g_lock -> ill_lock -> nce_lock
77  *
78  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
79  * nce_next.  Nce_lock protects the contents of the NCE (particularly
80  * nce_refcnt).
81  */
82 
/*
 * Forward declarations for the file-local nce_ helpers.
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
    ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
/* Not defined in this part of the file; declared extern for use here. */
extern void	th_trace_rrecord(th_trace_t *);
/*
 * Per-address-family implementations behind ndp_lookup_then_add() and
 * ndp_add().  These carry the ndp_ prefix but are static to this file.
 */
static	int	ndp_lookup_then_add_v6(ill_t *, uchar_t *,
    const in6_addr_t *, const in6_addr_t *, const in6_addr_t *,
    uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
static	int	ndp_lookup_then_add_v4(ill_t *, uchar_t *,
    const in_addr_t *, const in_addr_t *, const in_addr_t *,
    uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
static	int	ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
    nce_t **);
static	int	ndp_add_v4(ill_t *, uchar_t *, const in_addr_t *,
    const in_addr_t *, const in_addr_t *, uint32_t, uint16_t, uint16_t,
    nce_t **, mblk_t *, mblk_t *);


/* Only built when NCE_DEBUG is defined; called from ndp_inactive(). */
#ifdef NCE_DEBUG
void	nce_trace_inactive(nce_t *);
#endif
120 
/*
 * Address of the hash bucket head for addr in the per-stack IPv4 NCE
 * hash table (ips_ndp4->nce_hash_tbl), indexed by IRE_ADDR_HASH().
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Address of the hash bucket head for addr in the per-stack IPv6 NCE
 * hash table (ips_ndp6->nce_hash_tbl), indexed by NCE_ADDR_HASH_V6().
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))
127 
128 /*
129  * Compute default flags to use for an advertisement of this nce's address.
130  */
131 static int
132 nce_advert_flags(const nce_t *nce)
133 {
134 	int flag = 0;
135 
136 	if (nce->nce_flags & NCE_F_ISROUTER)
137 		flag |= NDP_ISROUTER;
138 	if (!(nce->nce_flags & NCE_F_PROXY))
139 		flag |= NDP_ORIDE;
140 	return (flag);
141 }
142 
143 int
144 ndp_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
145     const void *mask, const void *extract_mask,
146     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
147     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
148 {
149 	int status;
150 
151 	if (ill->ill_isv6)
152 		status = ndp_add_v6(ill, hw_addr, (in6_addr_t *)addr,
153 		    (in6_addr_t *)mask, (in6_addr_t *)extract_mask,
154 		    hw_extract_start, flags, state, newnce);
155 	else
156 		status = ndp_add_v4(ill, hw_addr, (in_addr_t *)addr,
157 		    (in_addr_t *)mask, (in_addr_t *)extract_mask,
158 		    hw_extract_start, flags, state, newnce, fp_mp, res_mp);
159 	return (status);
160 }
161 
/*
 * Non-tunable probe interval, based on link capabilities: 150 when the ill
 * has ill_note_link set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER(); units are presumably milliseconds -- TODO confirm
 * against the NDP_RESTART_TIMER definition.
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
164 
165 /*
166  * NDP Cache Entry creation routine.
167  * Mapped entries will never do NUD .
168  * This routine must always be called with ndp6->ndp_g_lock held.
169  * Prior to return, nce_refcnt is incremented.
170  */
171 static int
172 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
173     const in6_addr_t *mask, const in6_addr_t *extract_mask,
174     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
175     nce_t **newnce)
176 {
177 	static	nce_t		nce_nil;
178 	nce_t		*nce;
179 	mblk_t		*mp;
180 	mblk_t		*template;
181 	nce_t		**ncep;
182 	int		err;
183 	boolean_t	dropped = B_FALSE;
184 	ip_stack_t	*ipst = ill->ill_ipst;
185 
186 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
187 	ASSERT(ill != NULL && ill->ill_isv6);
188 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
189 		ip0dbg(("ndp_add: no addr\n"));
190 		return (EINVAL);
191 	}
192 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
193 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
194 		return (EINVAL);
195 	}
196 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
197 	    (flags & NCE_F_MAPPING)) {
198 		ip0dbg(("ndp_add: extract mask zero for mapping"));
199 		return (EINVAL);
200 	}
201 	/*
202 	 * Allocate the mblk to hold the nce.
203 	 *
204 	 * XXX This can come out of a separate cache - nce_cache.
205 	 * We don't need the mp anymore as there are no more
206 	 * "qwriter"s
207 	 */
208 	mp = allocb(sizeof (nce_t), BPRI_MED);
209 	if (mp == NULL)
210 		return (ENOMEM);
211 
212 	nce = (nce_t *)mp->b_rptr;
213 	mp->b_wptr = (uchar_t *)&nce[1];
214 	*nce = nce_nil;
215 
216 	/*
217 	 * This one holds link layer address
218 	 */
219 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
220 		template = nce_udreq_alloc(ill);
221 	} else {
222 		if (ill->ill_resolver_mp == NULL) {
223 			freeb(mp);
224 			return (EINVAL);
225 		}
226 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
227 		template = copyb(ill->ill_resolver_mp);
228 	}
229 	if (template == NULL) {
230 		freeb(mp);
231 		return (ENOMEM);
232 	}
233 	nce->nce_ill = ill;
234 	nce->nce_ipversion = IPV6_VERSION;
235 	nce->nce_flags = flags;
236 	nce->nce_state = state;
237 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
238 	nce->nce_rcnt = ill->ill_xmit_count;
239 	nce->nce_addr = *addr;
240 	nce->nce_mask = *mask;
241 	nce->nce_extract_mask = *extract_mask;
242 	nce->nce_ll_extract_start = hw_extract_start;
243 	nce->nce_fp_mp = NULL;
244 	nce->nce_res_mp = template;
245 	if (state == ND_REACHABLE)
246 		nce->nce_last = TICK_TO_MSEC(lbolt64);
247 	else
248 		nce->nce_last = 0;
249 	nce->nce_qd_mp = NULL;
250 	nce->nce_mp = mp;
251 	if (hw_addr != NULL)
252 		nce_set_ll(nce, hw_addr);
253 	/* This one is for nce getting created */
254 	nce->nce_refcnt = 1;
255 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
256 	if (nce->nce_flags & NCE_F_MAPPING) {
257 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
258 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
259 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
260 		ncep = &ipst->ips_ndp6->nce_mask_entries;
261 	} else {
262 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
263 	}
264 
265 #ifdef NCE_DEBUG
266 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
267 #endif
268 	/*
269 	 * Atomically ensure that the ill is not CONDEMNED, before
270 	 * adding the NCE.
271 	 */
272 	mutex_enter(&ill->ill_lock);
273 	if (ill->ill_state_flags & ILL_CONDEMNED) {
274 		mutex_exit(&ill->ill_lock);
275 		freeb(mp);
276 		freeb(template);
277 		return (EINVAL);
278 	}
279 	if ((nce->nce_next = *ncep) != NULL)
280 		nce->nce_next->nce_ptpn = &nce->nce_next;
281 	*ncep = nce;
282 	nce->nce_ptpn = ncep;
283 	*newnce = nce;
284 	/* This one is for nce being used by an active thread */
285 	NCE_REFHOLD(*newnce);
286 
287 	/* Bump up the number of nce's referencing this ill */
288 	ill->ill_nce_cnt++;
289 	mutex_exit(&ill->ill_lock);
290 
291 	err = 0;
292 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
293 		mutex_enter(&nce->nce_lock);
294 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
295 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
296 		mutex_exit(&nce->nce_lock);
297 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
298 		    &ipv6_all_zeros, addr, NDP_PROBE);
299 		if (dropped) {
300 			mutex_enter(&nce->nce_lock);
301 			nce->nce_pcnt++;
302 			mutex_exit(&nce->nce_lock);
303 		}
304 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
305 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
306 		err = EINPROGRESS;
307 	} else if (flags & NCE_F_UNSOL_ADV) {
308 		/*
309 		 * We account for the transmit below by assigning one
310 		 * less than the ndd variable. Subsequent decrements
311 		 * are done in ndp_timer.
312 		 */
313 		mutex_enter(&nce->nce_lock);
314 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
315 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
316 		mutex_exit(&nce->nce_lock);
317 		dropped = nce_xmit(ill,
318 		    ND_NEIGHBOR_ADVERT,
319 		    ill,	/* ill to be used for extracting ill_nd_lla */
320 		    B_TRUE,	/* use ill_nd_lla */
321 		    addr,	/* Source and target of the advertisement pkt */
322 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
323 		    nce_advert_flags(nce));
324 		mutex_enter(&nce->nce_lock);
325 		if (dropped)
326 			nce->nce_unsolicit_count++;
327 		if (nce->nce_unsolicit_count != 0) {
328 			nce->nce_timeout_id = timeout(ndp_timer, nce,
329 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
330 		}
331 		mutex_exit(&nce->nce_lock);
332 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
333 	}
334 	/*
335 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
336 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
337 	 * We call nce_fastpath from nce_update if the link layer address of
338 	 * the peer changes from nce_update
339 	 */
340 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
341 		nce_fastpath(nce);
342 	return (err);
343 }
344 
345 int
346 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
347     const void *mask, const void *extract_mask,
348     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
349     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
350 {
351 	int status;
352 
353 	if (ill->ill_isv6) {
354 		status = ndp_lookup_then_add_v6(ill, hw_addr,
355 		    (in6_addr_t *)addr, (in6_addr_t *)mask,
356 		    (in6_addr_t *)extract_mask, hw_extract_start, flags,
357 		    state, newnce, fp_mp, res_mp);
358 	} else  {
359 		status = ndp_lookup_then_add_v4(ill, hw_addr,
360 		    (in_addr_t *)addr, (in_addr_t *)mask,
361 		    (in_addr_t *)extract_mask, hw_extract_start, flags,
362 		    state, newnce, fp_mp, res_mp);
363 	}
364 
365 	return (status);
366 }
367 
368 static int
369 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
370     const in6_addr_t *mask, const in6_addr_t *extract_mask,
371     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
372     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
373 {
374 	int	err = 0;
375 	nce_t	*nce;
376 	ip_stack_t	*ipst = ill->ill_ipst;
377 
378 	ASSERT(ill != NULL && ill->ill_isv6);
379 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
380 
381 	/* Get head of v6 hash table */
382 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
383 	nce = nce_lookup_addr(ill, addr, nce);
384 	if (nce == NULL) {
385 		err = ndp_add(ill,
386 		    hw_addr,
387 		    addr,
388 		    mask,
389 		    extract_mask,
390 		    hw_extract_start,
391 		    flags,
392 		    state,
393 		    newnce,
394 		    fp_mp,
395 		    res_mp);
396 	} else {
397 		*newnce = nce;
398 		err = EEXIST;
399 	}
400 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
401 	return (err);
402 }
403 
404 /*
405  * Remove all the CONDEMNED nces from the appropriate hash table.
406  * We create a private list of NCEs, these may have ires pointing
407  * to them, so the list will be passed through to clean up dependent
408  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
409  */
410 static void
411 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
412 {
413 	nce_t *nce1;
414 	nce_t **ptpn;
415 
416 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
417 	ASSERT(ndp->ndp_g_walker == 0);
418 	for (; nce; nce = nce1) {
419 		nce1 = nce->nce_next;
420 		mutex_enter(&nce->nce_lock);
421 		if (nce->nce_flags & NCE_F_CONDEMNED) {
422 			ptpn = nce->nce_ptpn;
423 			nce1 = nce->nce_next;
424 			if (nce1 != NULL)
425 				nce1->nce_ptpn = ptpn;
426 			*ptpn = nce1;
427 			nce->nce_ptpn = NULL;
428 			nce->nce_next = NULL;
429 			nce->nce_next = *free_nce_list;
430 			*free_nce_list = nce;
431 		}
432 		mutex_exit(&nce->nce_lock);
433 	}
434 }
435 
436 /*
437  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
438  *    will return this NCE. Also no new IREs will be created that
439  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
440  *    be started (See NDP_RESTART_TIMER).
441  * 2. Cancel any currently running timeouts.
442  * 3. If there is an ndp walker, return. The walker will do the cleanup.
443  *    This ensures that walkers see a consistent list of NCEs while walking.
444  * 4. Otherwise remove the NCE from the list of NCEs
445  * 5. Delete all IREs pointing to this NCE.
446  */
447 void
448 ndp_delete(nce_t *nce)
449 {
450 	nce_t	**ptpn;
451 	nce_t	*nce1;
452 	int	ipversion = nce->nce_ipversion;
453 	ndp_g_t *ndp;
454 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
455 
456 	if (ipversion == IPV4_VERSION)
457 		ndp = ipst->ips_ndp4;
458 	else
459 		ndp = ipst->ips_ndp6;
460 
461 	/* Serialize deletes */
462 	mutex_enter(&nce->nce_lock);
463 	if (nce->nce_flags & NCE_F_CONDEMNED) {
464 		/* Some other thread is doing the delete */
465 		mutex_exit(&nce->nce_lock);
466 		return;
467 	}
468 	/*
469 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
470 	 * refcnt has to be >= 2
471 	 */
472 	ASSERT(nce->nce_refcnt >= 2);
473 	nce->nce_flags |= NCE_F_CONDEMNED;
474 	mutex_exit(&nce->nce_lock);
475 
476 	nce_fastpath_list_delete(nce);
477 
478 	/*
479 	 * Cancel any running timer. Timeout can't be restarted
480 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
481 	 * Passing invalid timeout id is fine.
482 	 */
483 	if (nce->nce_timeout_id != 0) {
484 		(void) untimeout(nce->nce_timeout_id);
485 		nce->nce_timeout_id = 0;
486 	}
487 
488 	mutex_enter(&ndp->ndp_g_lock);
489 	if (nce->nce_ptpn == NULL) {
490 		/*
491 		 * The last ndp walker has already removed this nce from
492 		 * the list after we marked the nce CONDEMNED and before
493 		 * we grabbed the global lock.
494 		 */
495 		mutex_exit(&ndp->ndp_g_lock);
496 		return;
497 	}
498 	if (ndp->ndp_g_walker > 0) {
499 		/*
500 		 * Can't unlink. The walker will clean up
501 		 */
502 		ndp->ndp_g_walker_cleanup = B_TRUE;
503 		mutex_exit(&ndp->ndp_g_lock);
504 		return;
505 	}
506 
507 	/*
508 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
509 	 * the timer since it is marked CONDEMNED.
510 	 */
511 	ptpn = nce->nce_ptpn;
512 	nce1 = nce->nce_next;
513 	if (nce1 != NULL)
514 		nce1->nce_ptpn = ptpn;
515 	*ptpn = nce1;
516 	nce->nce_ptpn = NULL;
517 	nce->nce_next = NULL;
518 	mutex_exit(&ndp->ndp_g_lock);
519 
520 	nce_ire_delete(nce);
521 }
522 
523 void
524 ndp_inactive(nce_t *nce)
525 {
526 	mblk_t		**mpp;
527 	ill_t		*ill;
528 
529 	ASSERT(nce->nce_refcnt == 0);
530 	ASSERT(MUTEX_HELD(&nce->nce_lock));
531 	ASSERT(nce->nce_fastpath == NULL);
532 
533 	/* Free all nce allocated messages */
534 	mpp = &nce->nce_first_mp_to_free;
535 	do {
536 		while (*mpp != NULL) {
537 			mblk_t  *mp;
538 
539 			mp = *mpp;
540 			*mpp = mp->b_next;
541 
542 			inet_freemsg(mp);
543 		}
544 	} while (mpp++ != &nce->nce_last_mp_to_free);
545 
546 #ifdef NCE_DEBUG
547 	nce_trace_inactive(nce);
548 #endif
549 
550 	ill = nce->nce_ill;
551 	mutex_enter(&ill->ill_lock);
552 	ill->ill_nce_cnt--;
553 	/*
554 	 * If the number of nce's associated with this ill have dropped
555 	 * to zero, check whether we need to restart any operation that
556 	 * is waiting for this to happen.
557 	 */
558 	if (ill->ill_nce_cnt == 0) {
559 		/* ipif_ill_refrele_tail drops the ill_lock */
560 		ipif_ill_refrele_tail(ill);
561 	} else {
562 		mutex_exit(&ill->ill_lock);
563 	}
564 	mutex_destroy(&nce->nce_lock);
565 	if (nce->nce_mp != NULL)
566 		inet_freemsg(nce->nce_mp);
567 }
568 
569 /*
570  * ndp_walk routine.  Delete the nce if it is associated with the ill
571  * that is going away.  Always called as a writer.
572  */
573 void
574 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
575 {
576 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
577 		ndp_delete(nce);
578 	}
579 }
580 
581 /*
582  * Walk a list of to be inactive NCEs and blow away all the ires.
583  */
584 static void
585 nce_ire_delete_list(nce_t *nce)
586 {
587 	nce_t *nce_next;
588 
589 	ASSERT(nce != NULL);
590 	while (nce != NULL) {
591 		nce_next = nce->nce_next;
592 		nce->nce_next = NULL;
593 
594 		/*
595 		 * It is possible for the last ndp walker (this thread)
596 		 * to come here after ndp_delete has marked the nce CONDEMNED
597 		 * and before it has removed the nce from the fastpath list
598 		 * or called untimeout. So we need to do it here. It is safe
599 		 * for both ndp_delete and this thread to do it twice or
600 		 * even simultaneously since each of the threads has a
601 		 * reference on the nce.
602 		 */
603 		nce_fastpath_list_delete(nce);
604 		/*
605 		 * Cancel any running timer. Timeout can't be restarted
606 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
607 		 * Passing invalid timeout id is fine.
608 		 */
609 		if (nce->nce_timeout_id != 0) {
610 			(void) untimeout(nce->nce_timeout_id);
611 			nce->nce_timeout_id = 0;
612 		}
613 		/*
614 		 * We might hit this func thus in the v4 case:
615 		 * ipif_down->ipif_ndp_down->ndp_walk
616 		 */
617 
618 		if (nce->nce_ipversion == IPV4_VERSION) {
619 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
620 			    IRE_CACHE, nce_ire_delete1,
621 			    (char *)nce, nce->nce_ill);
622 		} else {
623 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
624 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
625 			    IRE_CACHE, nce_ire_delete1,
626 			    (char *)nce, nce->nce_ill);
627 		}
628 		NCE_REFRELE_NOTR(nce);
629 		nce = nce_next;
630 	}
631 }
632 
633 /*
634  * Delete an ire when the nce goes away.
635  */
636 /* ARGSUSED */
637 static void
638 nce_ire_delete(nce_t *nce)
639 {
640 	if (nce->nce_ipversion == IPV6_VERSION) {
641 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
642 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
643 		NCE_REFRELE_NOTR(nce);
644 	} else {
645 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
646 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
647 		NCE_REFRELE_NOTR(nce);
648 	}
649 }
650 
651 /*
652  * ire_walk routine used to delete every IRE that shares this nce
653  */
654 static void
655 nce_ire_delete1(ire_t *ire, char *nce_arg)
656 {
657 	nce_t	*nce = (nce_t *)nce_arg;
658 
659 	ASSERT(ire->ire_type == IRE_CACHE);
660 
661 	if (ire->ire_nce == nce) {
662 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
663 		ire_delete(ire);
664 	}
665 }
666 
667 /*
668  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
669  */
670 boolean_t
671 ndp_restart_dad(nce_t *nce)
672 {
673 	boolean_t started;
674 	boolean_t dropped;
675 
676 	if (nce == NULL)
677 		return (B_FALSE);
678 	mutex_enter(&nce->nce_lock);
679 	if (nce->nce_state == ND_PROBE) {
680 		mutex_exit(&nce->nce_lock);
681 		started = B_TRUE;
682 	} else if (nce->nce_state == ND_REACHABLE) {
683 		nce->nce_state = ND_PROBE;
684 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
685 		mutex_exit(&nce->nce_lock);
686 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
687 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
688 		if (dropped) {
689 			mutex_enter(&nce->nce_lock);
690 			nce->nce_pcnt++;
691 			mutex_exit(&nce->nce_lock);
692 		}
693 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
694 		started = B_TRUE;
695 	} else {
696 		mutex_exit(&nce->nce_lock);
697 		started = B_FALSE;
698 	}
699 	return (started);
700 }
701 
702 /*
703  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
704  * If one is found, the refcnt on the nce will be incremented.
705  */
706 nce_t *
707 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
708 {
709 	nce_t	*nce;
710 	ip_stack_t	*ipst;
711 
712 	ASSERT(ill != NULL);
713 	ipst = ill->ill_ipst;
714 
715 	ASSERT(ill != NULL && ill->ill_isv6);
716 	if (!caller_holds_lock) {
717 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
718 	}
719 
720 	/* Get head of v6 hash table */
721 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
722 	nce = nce_lookup_addr(ill, addr, nce);
723 	if (nce == NULL)
724 		nce = nce_lookup_mapping(ill, addr);
725 	if (!caller_holds_lock)
726 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
727 	return (nce);
728 }
729 /*
730  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
731  * If one is found, the refcnt on the nce will be incremented.
732  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
733  * so we skip the nce_lookup_mapping call.
734  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
735  */
736 nce_t *
737 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
738 {
739 	nce_t	*nce;
740 	in6_addr_t addr6;
741 	ip_stack_t *ipst = ill->ill_ipst;
742 
743 	if (!caller_holds_lock) {
744 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
745 	}
746 
747 	/* Get head of v4 hash table */
748 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
749 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
750 	nce = nce_lookup_addr(ill, &addr6, nce);
751 	if (!caller_holds_lock)
752 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
753 	return (nce);
754 }
755 
756 /*
757  * Cache entry lookup.  Try to find an nce matching the parameters passed.
758  * Look only for exact entries (no mappings).  If an nce is found, increment
759  * the hold count on that nce. The caller passes in the start of the
760  * appropriate hash table, and must be holding the appropriate global
761  * lock (ndp_g_lock).
762  */
763 static nce_t *
764 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
765 {
766 	ndp_g_t		*ndp;
767 	ip_stack_t	*ipst = ill->ill_ipst;
768 
769 	if (ill->ill_isv6)
770 		ndp = ipst->ips_ndp6;
771 	else
772 		ndp = ipst->ips_ndp4;
773 
774 	ASSERT(ill != NULL);
775 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
776 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
777 		return (NULL);
778 	for (; nce != NULL; nce = nce->nce_next) {
779 		if (nce->nce_ill == ill) {
780 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
781 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
782 			    &ipv6_all_ones)) {
783 				mutex_enter(&nce->nce_lock);
784 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
785 					NCE_REFHOLD_LOCKED(nce);
786 					mutex_exit(&nce->nce_lock);
787 					break;
788 				}
789 				mutex_exit(&nce->nce_lock);
790 			}
791 		}
792 	}
793 	return (nce);
794 }
795 
796 /*
797  * Cache entry lookup.  Try to find an nce matching the parameters passed.
798  * Look only for mappings.
799  */
800 static nce_t *
801 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
802 {
803 	nce_t	*nce;
804 	ip_stack_t	*ipst = ill->ill_ipst;
805 
806 	ASSERT(ill != NULL && ill->ill_isv6);
807 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
808 	if (!IN6_IS_ADDR_MULTICAST(addr))
809 		return (NULL);
810 	nce = ipst->ips_ndp6->nce_mask_entries;
811 	for (; nce != NULL; nce = nce->nce_next)
812 		if (nce->nce_ill == ill &&
813 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
814 			mutex_enter(&nce->nce_lock);
815 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
816 				NCE_REFHOLD_LOCKED(nce);
817 				mutex_exit(&nce->nce_lock);
818 				break;
819 			}
820 			mutex_exit(&nce->nce_lock);
821 		}
822 	return (nce);
823 }
824 
825 /*
826  * Process passed in parameters either from an incoming packet or via
827  * user ioctl.
828  */
829 void
830 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
831 {
832 	ill_t	*ill = nce->nce_ill;
833 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
834 	mblk_t	*mp;
835 	boolean_t ll_updated = B_FALSE;
836 	boolean_t ll_changed;
837 	ip_stack_t	*ipst = ill->ill_ipst;
838 
839 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
840 	/*
841 	 * No updates of link layer address or the neighbor state is
842 	 * allowed, when the cache is in NONUD state.  This still
843 	 * allows for responding to reachability solicitation.
844 	 */
845 	mutex_enter(&nce->nce_lock);
846 	if (nce->nce_state == ND_INCOMPLETE) {
847 		if (hw_addr == NULL) {
848 			mutex_exit(&nce->nce_lock);
849 			return;
850 		}
851 		nce_set_ll(nce, hw_addr);
852 		/*
853 		 * Update nce state and send the queued packets
854 		 * back to ip this time ire will be added.
855 		 */
856 		if (flag & ND_NA_FLAG_SOLICITED) {
857 			nce_update(nce, ND_REACHABLE, NULL);
858 		} else {
859 			nce_update(nce, ND_STALE, NULL);
860 		}
861 		mutex_exit(&nce->nce_lock);
862 		nce_fastpath(nce);
863 		mutex_enter(&nce->nce_lock);
864 		mp = nce->nce_qd_mp;
865 		nce->nce_qd_mp = NULL;
866 		mutex_exit(&nce->nce_lock);
867 		while (mp != NULL) {
868 			mblk_t *nxt_mp, *data_mp;
869 
870 			nxt_mp = mp->b_next;
871 			mp->b_next = NULL;
872 
873 			if (mp->b_datap->db_type == M_CTL)
874 				data_mp = mp->b_cont;
875 			else
876 				data_mp = mp;
877 			if (data_mp->b_prev != NULL) {
878 				ill_t   *inbound_ill;
879 				queue_t *fwdq = NULL;
880 				uint_t ifindex;
881 
882 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
883 				inbound_ill = ill_lookup_on_ifindex(ifindex,
884 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
885 				if (inbound_ill == NULL) {
886 					data_mp->b_prev = NULL;
887 					freemsg(mp);
888 					return;
889 				} else {
890 					fwdq = inbound_ill->ill_rq;
891 				}
892 				data_mp->b_prev = NULL;
893 				/*
894 				 * Send a forwarded packet back into ip_rput_v6
895 				 * just as in ire_send_v6().
896 				 * Extract the queue from b_prev (set in
897 				 * ip_rput_data_v6).
898 				 */
899 				if (fwdq != NULL) {
900 					/*
901 					 * Forwarded packets hop count will
902 					 * get decremented in ip_rput_data_v6
903 					 */
904 					if (data_mp != mp)
905 						freeb(mp);
906 					put(fwdq, data_mp);
907 				} else {
908 					/*
909 					 * Send locally originated packets back
910 					 * into * ip_wput_v6.
911 					 */
912 					put(ill->ill_wq, mp);
913 				}
914 				ill_refrele(inbound_ill);
915 			} else {
916 				put(ill->ill_wq, mp);
917 			}
918 			mp = nxt_mp;
919 		}
920 		return;
921 	}
922 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
923 	if (!is_adv) {
924 		/* If this is a SOLICITATION request only */
925 		if (ll_changed)
926 			nce_update(nce, ND_STALE, hw_addr);
927 		mutex_exit(&nce->nce_lock);
928 		return;
929 	}
930 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
931 		/* If in any other state than REACHABLE, ignore */
932 		if (nce->nce_state == ND_REACHABLE) {
933 			nce_update(nce, ND_STALE, NULL);
934 		}
935 		mutex_exit(&nce->nce_lock);
936 		return;
937 	} else {
938 		if (ll_changed) {
939 			nce_update(nce, ND_UNCHANGED, hw_addr);
940 			ll_updated = B_TRUE;
941 		}
942 		if (flag & ND_NA_FLAG_SOLICITED) {
943 			nce_update(nce, ND_REACHABLE, NULL);
944 		} else {
945 			if (ll_updated) {
946 				nce_update(nce, ND_STALE, NULL);
947 			}
948 		}
949 		mutex_exit(&nce->nce_lock);
950 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
951 		    NCE_F_ISROUTER)) {
952 			ire_t *ire;
953 
954 			/*
955 			 * Router turned to host.  We need to remove the
956 			 * entry as well as any default route that may be
957 			 * using this as a next hop.  This is required by
958 			 * section 7.2.5 of RFC 2461.
959 			 */
960 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
961 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
962 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
963 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
964 			    MATCH_IRE_DEFAULT, ipst);
965 			if (ire != NULL) {
966 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
967 				ire_delete(ire);
968 				ire_refrele(ire);
969 			}
970 			ndp_delete(nce);
971 		}
972 	}
973 }
974 
975 /*
976  * Pass arg1 to the pfi supplied, along with each nce in existence.
977  * ndp_walk() places a REFHOLD on the nce and drops the lock when
978  * walking the hash list.
979  */
980 void
981 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
982     boolean_t trace)
983 {
984 
985 	nce_t	*nce;
986 	nce_t	*nce1;
987 	nce_t	**ncep;
988 	nce_t	*free_nce_list = NULL;
989 
990 	mutex_enter(&ndp->ndp_g_lock);
991 	/* Prevent ndp_delete from unlink and free of NCE */
992 	ndp->ndp_g_walker++;
993 	mutex_exit(&ndp->ndp_g_lock);
994 	for (ncep = ndp->nce_hash_tbl;
995 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
996 		for (nce = *ncep; nce != NULL; nce = nce1) {
997 			nce1 = nce->nce_next;
998 			if (ill == NULL || nce->nce_ill == ill) {
999 				if (trace) {
1000 					NCE_REFHOLD(nce);
1001 					(*pfi)(nce, arg1);
1002 					NCE_REFRELE(nce);
1003 				} else {
1004 					NCE_REFHOLD_NOTR(nce);
1005 					(*pfi)(nce, arg1);
1006 					NCE_REFRELE_NOTR(nce);
1007 				}
1008 			}
1009 		}
1010 	}
1011 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
1012 		nce1 = nce->nce_next;
1013 		if (ill == NULL || nce->nce_ill == ill) {
1014 			if (trace) {
1015 				NCE_REFHOLD(nce);
1016 				(*pfi)(nce, arg1);
1017 				NCE_REFRELE(nce);
1018 			} else {
1019 				NCE_REFHOLD_NOTR(nce);
1020 				(*pfi)(nce, arg1);
1021 				NCE_REFRELE_NOTR(nce);
1022 			}
1023 		}
1024 	}
1025 	mutex_enter(&ndp->ndp_g_lock);
1026 	ndp->ndp_g_walker--;
1027 	/*
1028 	 * While NCE's are removed from global list they are placed
1029 	 * in a private list, to be passed to nce_ire_delete_list().
1030 	 * The reason is, there may be ires pointing to this nce
1031 	 * which needs to cleaned up.
1032 	 */
1033 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
1034 		/* Time to delete condemned entries */
1035 		for (ncep = ndp->nce_hash_tbl;
1036 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1037 			nce = *ncep;
1038 			if (nce != NULL) {
1039 				nce_remove(ndp, nce, &free_nce_list);
1040 			}
1041 		}
1042 		nce = ndp->nce_mask_entries;
1043 		if (nce != NULL) {
1044 			nce_remove(ndp, nce, &free_nce_list);
1045 		}
1046 		ndp->ndp_g_walker_cleanup = B_FALSE;
1047 	}
1048 	mutex_exit(&ndp->ndp_g_lock);
1049 
1050 	if (free_nce_list != NULL) {
1051 		nce_ire_delete_list(free_nce_list);
1052 	}
1053 }
1054 
1055 /*
1056  * Walk everything.
1057  * Note that ill can be NULL hence can't derive the ipst from it.
1058  */
1059 void
1060 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1061 {
1062 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1063 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1064 }
1065 
1066 /*
1067  * Process resolve requests.  Handles both mapped entries
1068  * as well as cases that needs to be send out on the wire.
1069  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1070  * or one is created, we defer making ire point to nce until the
1071  * ire is actually added at which point the nce_refcnt on the nce is
1072  * incremented.  This is done primarily to have symmetry between ire_add()
1073  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1074  */
1075 int
1076 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1077 {
1078 	nce_t		*nce;
1079 	int		err = 0;
1080 	uint32_t	ms;
1081 	mblk_t		*mp_nce = NULL;
1082 	ip_stack_t	*ipst = ill->ill_ipst;
1083 
1084 	ASSERT(ill != NULL);
1085 	ASSERT(ill->ill_isv6);
1086 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1087 		err = nce_set_multicast(ill, dst);
1088 		return (err);
1089 	}
1090 	err = ndp_lookup_then_add(ill,
1091 	    NULL,	/* No hardware address */
1092 	    dst,
1093 	    &ipv6_all_ones,
1094 	    &ipv6_all_zeros,
1095 	    0,
1096 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1097 	    ND_INCOMPLETE,
1098 	    &nce,
1099 	    NULL, /* let ndp_add figure out fastpath mp and dlureq_mp for v6 */
1100 	    NULL);
1101 
1102 	switch (err) {
1103 	case 0:
1104 		/*
1105 		 * New cache entry was created. Make sure that the state
1106 		 * is not ND_INCOMPLETE. It can be in some other state
1107 		 * even before we send out the solicitation as we could
1108 		 * get un-solicited advertisements.
1109 		 *
1110 		 * If this is an XRESOLV interface, simply return 0,
1111 		 * since we don't want to solicit just yet.
1112 		 */
1113 		if (ill->ill_flags & ILLF_XRESOLV) {
1114 			NCE_REFRELE(nce);
1115 			return (0);
1116 		}
1117 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1118 		mutex_enter(&nce->nce_lock);
1119 		if (nce->nce_state != ND_INCOMPLETE) {
1120 			mutex_exit(&nce->nce_lock);
1121 			rw_exit(&ipst->ips_ill_g_lock);
1122 			NCE_REFRELE(nce);
1123 			return (0);
1124 		}
1125 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1126 		if (mp_nce == NULL) {
1127 			/* The caller will free mp */
1128 			mutex_exit(&nce->nce_lock);
1129 			rw_exit(&ipst->ips_ill_g_lock);
1130 			ndp_delete(nce);
1131 			NCE_REFRELE(nce);
1132 			return (ENOMEM);
1133 		}
1134 		ms = nce_solicit(nce, mp_nce);
1135 		rw_exit(&ipst->ips_ill_g_lock);
1136 		if (ms == 0) {
1137 			/* The caller will free mp */
1138 			if (mp_nce != mp)
1139 				freeb(mp_nce);
1140 			mutex_exit(&nce->nce_lock);
1141 			ndp_delete(nce);
1142 			NCE_REFRELE(nce);
1143 			return (EBUSY);
1144 		}
1145 		mutex_exit(&nce->nce_lock);
1146 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1147 		NCE_REFRELE(nce);
1148 		return (EINPROGRESS);
1149 	case EEXIST:
1150 		/* Resolution in progress just queue the packet */
1151 		mutex_enter(&nce->nce_lock);
1152 		if (nce->nce_state == ND_INCOMPLETE) {
1153 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1154 			if (mp_nce == NULL) {
1155 				err = ENOMEM;
1156 			} else {
1157 				nce_queue_mp(nce, mp_nce);
1158 				err = EINPROGRESS;
1159 			}
1160 		} else {
1161 			/*
1162 			 * Any other state implies we have
1163 			 * a nce but IRE needs to be added ...
1164 			 * ire_add_v6() will take care of the
1165 			 * the case when the nce becomes CONDEMNED
1166 			 * before the ire is added to the table.
1167 			 */
1168 			err = 0;
1169 		}
1170 		mutex_exit(&nce->nce_lock);
1171 		NCE_REFRELE(nce);
1172 		break;
1173 	default:
1174 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1175 		break;
1176 	}
1177 	return (err);
1178 }
1179 
1180 /*
1181  * When there is no resolver, the link layer template is passed in
1182  * the IRE.
1183  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1184  * or one is created, we defer making ire point to nce until the
1185  * ire is actually added at which point the nce_refcnt on the nce is
1186  * incremented.  This is done primarily to have symmetry between ire_add()
1187  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1188  */
1189 int
1190 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1191 {
1192 	nce_t		*nce;
1193 	int		err = 0;
1194 
1195 	ASSERT(ill != NULL);
1196 	ASSERT(ill->ill_isv6);
1197 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1198 		err = nce_set_multicast(ill, dst);
1199 		return (err);
1200 	}
1201 
1202 	err = ndp_lookup_then_add(ill,
1203 	    NULL,	/* hardware address */
1204 	    dst,
1205 	    &ipv6_all_ones,
1206 	    &ipv6_all_zeros,
1207 	    0,
1208 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1209 	    ND_REACHABLE,
1210 	    &nce,
1211 	    NULL, /* let ndp_add figure out fp_mp/dlureq_mp for v6 */
1212 	    NULL);
1213 
1214 	switch (err) {
1215 	case 0:
1216 		/*
1217 		 * Cache entry with a proper resolver cookie was
1218 		 * created.
1219 		 */
1220 		NCE_REFRELE(nce);
1221 		break;
1222 	case EEXIST:
1223 		err = 0;
1224 		NCE_REFRELE(nce);
1225 		break;
1226 	default:
1227 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1228 		break;
1229 	}
1230 	return (err);
1231 }
1232 
1233 /*
1234  * For each interface an entry is added for the unspecified multicast group.
1235  * Here that mapping is used to form the multicast cache entry for a particular
1236  * multicast destination.
1237  */
1238 static int
1239 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1240 {
1241 	nce_t		*mnce;	/* Multicast mapping entry */
1242 	nce_t		*nce;
1243 	uchar_t		*hw_addr = NULL;
1244 	int		err = 0;
1245 	ip_stack_t	*ipst = ill->ill_ipst;
1246 
1247 	ASSERT(ill != NULL);
1248 	ASSERT(ill->ill_isv6);
1249 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1250 
1251 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1252 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1253 	nce = nce_lookup_addr(ill, dst, nce);
1254 	if (nce != NULL) {
1255 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1256 		NCE_REFRELE(nce);
1257 		return (0);
1258 	}
1259 	/* No entry, now lookup for a mapping this should never fail */
1260 	mnce = nce_lookup_mapping(ill, dst);
1261 	if (mnce == NULL) {
1262 		/* Something broken for the interface. */
1263 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1264 		return (ESRCH);
1265 	}
1266 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1267 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1268 		/*
1269 		 * For IRE_IF_RESOLVER a hardware mapping can be
1270 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1271 		 * in the ill is copied in ndp_add().
1272 		 */
1273 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1274 		if (hw_addr == NULL) {
1275 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1276 			NCE_REFRELE(mnce);
1277 			return (ENOMEM);
1278 		}
1279 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1280 	}
1281 	NCE_REFRELE(mnce);
1282 	/*
1283 	 * IRE_IF_NORESOLVER type simply copies the resolution
1284 	 * cookie passed in.  So no hw_addr is needed.
1285 	 */
1286 	err = ndp_add(ill,
1287 	    hw_addr,
1288 	    dst,
1289 	    &ipv6_all_ones,
1290 	    &ipv6_all_zeros,
1291 	    0,
1292 	    NCE_F_NONUD,
1293 	    ND_REACHABLE,
1294 	    &nce,
1295 	    NULL,
1296 	    NULL);
1297 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1298 	if (hw_addr != NULL)
1299 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1300 	if (err != 0) {
1301 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1302 		return (err);
1303 	}
1304 	NCE_REFRELE(nce);
1305 	return (0);
1306 }
1307 
1308 /*
1309  * Return the link layer address, and any flags of a nce.
1310  */
1311 int
1312 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1313 {
1314 	nce_t		*nce;
1315 	in6_addr_t	*addr;
1316 	sin6_t		*sin6;
1317 	dl_unitdata_req_t	*dl;
1318 
1319 	ASSERT(ill != NULL && ill->ill_isv6);
1320 	sin6 = (sin6_t *)&lnr->lnr_addr;
1321 	addr =  &sin6->sin6_addr;
1322 
1323 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1324 	if (nce == NULL)
1325 		return (ESRCH);
1326 	/* If in INCOMPLETE state, no link layer address is available yet */
1327 	if (nce->nce_state == ND_INCOMPLETE)
1328 		goto done;
1329 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1330 	if (ill->ill_flags & ILLF_XRESOLV)
1331 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1332 	else
1333 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1334 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1335 	    sizeof (lnr->lnr_hdw_addr));
1336 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1337 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1338 	if (nce->nce_flags & NCE_F_ISROUTER)
1339 		lnr->lnr_flags = NDF_ISROUTER_ON;
1340 	if (nce->nce_flags & NCE_F_PROXY)
1341 		lnr->lnr_flags |= NDF_PROXY_ON;
1342 	if (nce->nce_flags & NCE_F_ANYCAST)
1343 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1344 done:
1345 	NCE_REFRELE(nce);
1346 	return (0);
1347 }
1348 
1349 /*
1350  * Send Enable/Disable multicast reqs to driver.
1351  */
1352 int
1353 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1354     uint32_t hw_addr_offset, mblk_t *mp)
1355 {
1356 	nce_t		*nce;
1357 	uchar_t		*hw_addr;
1358 	ip_stack_t	*ipst = ill->ill_ipst;
1359 
1360 	ASSERT(ill != NULL && ill->ill_isv6);
1361 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1362 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1363 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1364 		freemsg(mp);
1365 		return (EINVAL);
1366 	}
1367 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1368 	nce = nce_lookup_mapping(ill, addr);
1369 	if (nce == NULL) {
1370 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1371 		freemsg(mp);
1372 		return (ESRCH);
1373 	}
1374 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1375 	/*
1376 	 * Update dl_addr_length and dl_addr_offset for primitives that
1377 	 * have physical addresses as opposed to full saps
1378 	 */
1379 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1380 	case DL_ENABMULTI_REQ:
1381 		/* Track the state if this is the first enabmulti */
1382 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1383 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1384 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1385 		break;
1386 	case DL_DISABMULTI_REQ:
1387 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1388 		break;
1389 	default:
1390 		NCE_REFRELE(nce);
1391 		ip1dbg(("ndp_mcastreq: default\n"));
1392 		return (EINVAL);
1393 	}
1394 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1395 	NCE_REFRELE(nce);
1396 	putnext(ill->ill_wq, mp);
1397 	return (0);
1398 }
1399 
/*
 * Send a neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt, and
 * also holds ips_ill_g_lock as reader (both are asserted below).
 *
 * If mp is non-NULL it is queued on the nce via nce_queue_mp(); if it is
 * NULL the already queued nce_qd_mp is used instead.  Either way the packet
 * is only inspected here to pick the source address for the solicitation.
 *
 * NOTE: This routine drops nce_lock and ips_ill_g_lock (and later reacquires
 * them) when sending the packet.
 * NOTE: This routine does not consume mp.
 */
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
	ill_t		*ill;
	ill_t		*src_ill;
	ip6_t		*ip6h;
	in6_addr_t	src;
	in6_addr_t	dst;
	ipif_t		*ipif;
	ip6i_t		*ip6i;
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;

	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ill = nce->nce_ill;
	ASSERT(ill != NULL);

	/* No retransmissions left: tell the caller to abort */
	if (nce->nce_rcnt == 0) {
		return (0);
	}

	if (mp == NULL) {
		ASSERT(nce->nce_qd_mp != NULL);
		mp = nce->nce_qd_mp;
	} else {
		nce_queue_mp(nce, mp);
	}

	/* Handle ip_newroute_v6 giving us IPSEC packets */
	if (mp->b_datap->db_type == M_CTL)
		mp = mp->b_cont;

	ip6h = (ip6_t *)mp->b_rptr;
	if (ip6h->ip6_nxt == IPPROTO_RAW) {
		/*
		 * This message should have been pulled up already in
		 * ip_wput_v6. We can't do pullups here because the message
		 * could be from the nce_qd_mp which could have b_next/b_prev
		 * non-NULL.
		 */
		ip6i = (ip6i_t *)ip6h;
		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
			    sizeof (ip6i_t) + IPV6_HDR_LEN);
		/* The real IPv6 header follows the ip6i_t */
		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	}
	src = ip6h->ip6_src;
	/*
	 * If the src of outgoing packet is one of the assigned interface
	 * addresses use it, otherwise we will pick the source address below.
	 */
	src_ill = ill;
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* Search every ill in the group, if the ill belongs to one */
		if (ill->ill_group != NULL)
			src_ill = ill->ill_group->illgrp_ill;
		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
			for (ipif = src_ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				if (IN6_ARE_ADDR_EQUAL(&src,
				    &ipif->ipif_v6lcl_addr)) {
					break;
				}
			}
			if (ipif != NULL)
				break;
		}
		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and let nce_xmit choose a source
		 * address.  If an ipif can be found, but it's not yet done
		 * with DAD verification, then just postpone this transmission
		 * until later.
		 */
		if (src_ill == NULL)
			src = ipv6_all_zeros;
		else if (!ipif->ipif_addr_ready)
			return (ill->ill_reachable_retrans_time);
	}
	dst = nce->nce_addr;
	/*
	 * If source address is unspecified, nce_xmit will choose
	 * one for us and initialize the hardware address also
	 * appropriately.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&src))
		src_ill = NULL;
	/* Charge this attempt now; refunded below if the send is dropped */
	nce->nce_rcnt--;
	mutex_exit(&nce->nce_lock);
	rw_exit(&ipst->ips_ill_g_lock);
	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
	    &dst, 0);
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	if (dropped)
		nce->nce_rcnt++;
	return (ill->ill_reachable_retrans_time);
}
1506 
1507 /*
1508  * Attempt to recover an address on an interface that's been marked as a
1509  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1510  * no easy way to just probe the address and have the right thing happen if
1511  * it's no longer in use.  Instead, we just bring it up normally and allow the
1512  * regular interface start-up logic to probe for a remaining duplicate and take
1513  * us back down if necessary.
1514  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1515  * ip_ndp_excl.
1516  */
1517 /* ARGSUSED */
1518 static void
1519 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1520 {
1521 	ill_t	*ill = rq->q_ptr;
1522 	ipif_t	*ipif;
1523 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1524 
1525 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1526 		/*
1527 		 * We do not support recovery of proxy ARP'd interfaces,
1528 		 * because the system lacks a complete proxy ARP mechanism.
1529 		 */
1530 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1531 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1532 			continue;
1533 		}
1534 
1535 		/*
1536 		 * If we have already recovered or if the interface is going
1537 		 * away, then ignore.
1538 		 */
1539 		mutex_enter(&ill->ill_lock);
1540 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1541 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1542 			mutex_exit(&ill->ill_lock);
1543 			continue;
1544 		}
1545 
1546 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1547 		ill->ill_ipif_dup_count--;
1548 		mutex_exit(&ill->ill_lock);
1549 		ipif->ipif_was_dup = B_TRUE;
1550 
1551 		if (ipif_ndp_up(ipif, addr) != EINPROGRESS)
1552 			(void) ipif_up_done_v6(ipif);
1553 	}
1554 	freeb(mp);
1555 }
1556 
1557 /*
1558  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1559  * As long as someone else holds the address, the interface will stay down.
1560  * When that conflict goes away, the interface is brought back up.  This is
1561  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1562  * server will recover from a failure.
1563  *
1564  * For DHCP and temporary addresses, recovery is not done in the kernel.
1565  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1566  *
1567  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1568  */
1569 static void
1570 ipif6_dup_recovery(void *arg)
1571 {
1572 	ipif_t *ipif = arg;
1573 
1574 	ipif->ipif_recovery_id = 0;
1575 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1576 		return;
1577 
1578 	/*
1579 	 * No lock, because this is just an optimization.
1580 	 */
1581 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1582 		return;
1583 
1584 	/* If the link is down, we'll retry this later */
1585 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1586 		return;
1587 
1588 	ndp_do_recovery(ipif);
1589 }
1590 
1591 /*
1592  * Perform interface recovery by forcing the duplicate interfaces up and
1593  * allowing the system to determine which ones should stay up.
1594  *
1595  * Called both by recovery timer expiry and link-up notification.
1596  */
1597 void
1598 ndp_do_recovery(ipif_t *ipif)
1599 {
1600 	ill_t *ill = ipif->ipif_ill;
1601 	mblk_t *mp;
1602 	ip_stack_t *ipst = ill->ill_ipst;
1603 
1604 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1605 	if (mp == NULL) {
1606 		mutex_enter(&ill->ill_lock);
1607 		if (ipif->ipif_recovery_id == 0 &&
1608 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1609 		    IPIF_CONDEMNED))) {
1610 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1611 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1612 		}
1613 		mutex_exit(&ill->ill_lock);
1614 	} else {
1615 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1616 		    sizeof (ipif->ipif_v6lcl_addr));
1617 		ill_refhold(ill);
1618 		(void) qwriter_ip(NULL, ill, ill->ill_rq, mp, ip_ndp_recover,
1619 		    CUR_OP, B_FALSE);
1620 	}
1621 }
1622 
1623 /*
1624  * Find the solicitation in the given message, and extract printable details
1625  * (MAC and IP addresses) from it.
1626  */
1627 static nd_neighbor_solicit_t *
1628 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1629     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1630 {
1631 	nd_neighbor_solicit_t *ns;
1632 	ip6_t *ip6h;
1633 	uchar_t *addr;
1634 	int alen;
1635 
1636 	alen = 0;
1637 	ip6h = (ip6_t *)mp->b_rptr;
1638 	if (dl_mp == NULL) {
1639 		nd_opt_hdr_t *opt;
1640 		int nslen;
1641 
1642 		/*
1643 		 * If it's from the fast-path, then it can't be a probe
1644 		 * message, and thus must include the source linkaddr option.
1645 		 * Extract that here.
1646 		 */
1647 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1648 		nslen = mp->b_wptr - (uchar_t *)ns;
1649 		if ((nslen -= sizeof (*ns)) > 0) {
1650 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1651 			    ND_OPT_SOURCE_LINKADDR);
1652 			if (opt != NULL &&
1653 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1654 			    ill->ill_nd_lla_len) {
1655 				addr = (uchar_t *)(opt + 1);
1656 				alen = ill->ill_nd_lla_len;
1657 			}
1658 		}
1659 		/*
1660 		 * We cheat a bit here for the sake of printing usable log
1661 		 * messages in the rare case where the reply we got was unicast
1662 		 * without a source linkaddr option, and the interface is in
1663 		 * fastpath mode.  (Sigh.)
1664 		 */
1665 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1666 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1667 			struct ether_header *pether;
1668 
1669 			pether = (struct ether_header *)((char *)ip6h -
1670 			    sizeof (*pether));
1671 			addr = pether->ether_shost.ether_addr_octet;
1672 			alen = ETHERADDRL;
1673 		}
1674 	} else {
1675 		dl_unitdata_ind_t *dlu;
1676 
1677 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1678 		alen = dlu->dl_src_addr_length;
1679 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1680 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1681 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1682 			if (ill->ill_sap_length < 0) {
1683 				alen += ill->ill_sap_length;
1684 			} else {
1685 				addr += ill->ill_sap_length;
1686 				alen -= ill->ill_sap_length;
1687 			}
1688 		}
1689 	}
1690 	if (alen > 0) {
1691 		*haddr = addr;
1692 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1693 	} else {
1694 		*haddr = NULL;
1695 		(void) strcpy(hbuf, "?");
1696 	}
1697 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1698 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1699 	return (ns);
1700 }
1701 
1702 /*
1703  * This is for exclusive changes due to NDP duplicate address detection
1704  * failure.
1705  */
1706 /* ARGSUSED */
1707 static void
1708 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1709 {
1710 	ill_t	*ill = rq->q_ptr;
1711 	ipif_t	*ipif;
1712 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1713 	char hbuf[MAC_STR_LEN];
1714 	char sbuf[INET6_ADDRSTRLEN];
1715 	nd_neighbor_solicit_t *ns;
1716 	mblk_t *dl_mp = NULL;
1717 	uchar_t *haddr;
1718 	ip_stack_t *ipst = ill->ill_ipst;
1719 
1720 	if (DB_TYPE(mp) != M_DATA) {
1721 		dl_mp = mp;
1722 		mp = mp->b_cont;
1723 	}
1724 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1725 	    sizeof (sbuf), &haddr);
1726 	if (haddr != NULL &&
1727 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1728 		/*
1729 		 * Ignore conflicts generated by misbehaving switches that just
1730 		 * reflect our own messages back to us.
1731 		 */
1732 		goto ignore_conflict;
1733 	}
1734 	(void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf));
1735 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1736 
1737 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1738 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1739 		    &ns->nd_ns_target)) {
1740 			continue;
1741 		}
1742 
1743 		/* If it's already marked, then don't do anything. */
1744 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1745 			continue;
1746 
1747 		/*
1748 		 * If this is a failure during duplicate recovery, then don't
1749 		 * complain.  It may take a long time to recover.
1750 		 */
1751 		if (!ipif->ipif_was_dup) {
1752 			if (ipif->ipif_id != 0) {
1753 				(void) snprintf(ibuf + ill->ill_name_length - 1,
1754 				    sizeof (ibuf) - ill->ill_name_length + 1,
1755 				    ":%d", ipif->ipif_id);
1756 			}
1757 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1758 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1759 		}
1760 		mutex_enter(&ill->ill_lock);
1761 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1762 		ipif->ipif_flags |= IPIF_DUPLICATE;
1763 		ill->ill_ipif_dup_count++;
1764 		mutex_exit(&ill->ill_lock);
1765 		(void) ipif_down(ipif, NULL, NULL);
1766 		ipif_down_tail(ipif);
1767 		mutex_enter(&ill->ill_lock);
1768 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1769 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1770 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1771 		    IPIF_CONDEMNED)) &&
1772 		    ipst->ips_ip_dup_recovery > 0) {
1773 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1774 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1775 		}
1776 		mutex_exit(&ill->ill_lock);
1777 	}
1778 ignore_conflict:
1779 	if (dl_mp != NULL)
1780 		freeb(dl_mp);
1781 	freemsg(mp);
1782 }
1783 
1784 /*
1785  * Handle failure by tearing down the ipifs with the specified address.  Note
1786  * that tearing down the ipif also means deleting the nce through ipif_down, so
1787  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1788  * we start a timer on the ipif.
1789  */
1790 static void
1791 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1792 {
1793 	if ((mp = copymsg(mp)) != NULL) {
1794 		if (dl_mp == NULL)
1795 			dl_mp = mp;
1796 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1797 			dl_mp->b_cont = mp;
1798 		if (dl_mp == NULL) {
1799 			freemsg(mp);
1800 		} else {
1801 			ill_refhold(ill);
1802 			(void) qwriter_ip(NULL, ill, ill->ill_rq, dl_mp,
1803 			    ip_ndp_excl, CUR_OP, B_FALSE);
1804 		}
1805 	}
1806 	ndp_delete(nce);
1807 }
1808 
1809 /*
1810  * Handle a discovered conflict: some other system is advertising that it owns
1811  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1812  * interface.
1813  */
1814 static void
1815 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1816 {
1817 	ipif_t *ipif;
1818 	uint32_t now;
1819 	uint_t maxdefense;
1820 	uint_t defs;
1821 	ip_stack_t *ipst = ill->ill_ipst;
1822 
1823 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1824 	    NULL, NULL, ipst);
1825 	if (ipif == NULL)
1826 		return;
1827 	/*
1828 	 * First, figure out if this address is disposable.
1829 	 */
1830 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1831 		maxdefense = ipst->ips_ip_max_temp_defend;
1832 	else
1833 		maxdefense = ipst->ips_ip_max_defend;
1834 
1835 	/*
1836 	 * Now figure out how many times we've defended ourselves.  Ignore
1837 	 * defenses that happened long in the past.
1838 	 */
1839 	now = gethrestime_sec();
1840 	mutex_enter(&nce->nce_lock);
1841 	if ((defs = nce->nce_defense_count) > 0 &&
1842 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1843 		nce->nce_defense_count = defs = 0;
1844 	}
1845 	nce->nce_defense_count++;
1846 	nce->nce_defense_time = now;
1847 	mutex_exit(&nce->nce_lock);
1848 	ipif_refrele(ipif);
1849 
1850 	/*
1851 	 * If we've defended ourselves too many times already, then give up and
1852 	 * tear down the interface(s) using this address.  Otherwise, defend by
1853 	 * sending out an unsolicited Neighbor Advertisement.
1854 	 */
1855 	if (defs >= maxdefense) {
1856 		ip_ndp_failure(ill, mp, dl_mp, nce);
1857 	} else {
1858 		char hbuf[MAC_STR_LEN];
1859 		char sbuf[INET6_ADDRSTRLEN];
1860 		uchar_t *haddr;
1861 
1862 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1863 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1864 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1865 		    hbuf, sbuf, ill->ill_name);
1866 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1867 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1868 		    nce_advert_flags(nce));
1869 	}
1870 }
1871 
1872 static void
1873 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1874 {
1875 	nd_neighbor_solicit_t *ns;
1876 	uint32_t	hlen = ill->ill_nd_lla_len;
1877 	uchar_t		*haddr = NULL;
1878 	icmp6_t		*icmp_nd;
1879 	ip6_t		*ip6h;
1880 	nce_t		*our_nce = NULL;
1881 	in6_addr_t	target;
1882 	in6_addr_t	src;
1883 	int		len;
1884 	int		flag = 0;
1885 	nd_opt_hdr_t	*opt = NULL;
1886 	boolean_t	bad_solicit = B_FALSE;
1887 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1888 
1889 	ip6h = (ip6_t *)mp->b_rptr;
1890 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1891 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1892 	src = ip6h->ip6_src;
1893 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1894 	target = ns->nd_ns_target;
1895 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1896 		if (ip_debug > 2) {
1897 			/* ip1dbg */
1898 			pr_addr_dbg("ndp_input_solicit: Target is"
1899 			    " multicast! %s\n", AF_INET6, &target);
1900 		}
1901 		bad_solicit = B_TRUE;
1902 		goto done;
1903 	}
1904 	if (len > sizeof (nd_neighbor_solicit_t)) {
1905 		/* Options present */
1906 		opt = (nd_opt_hdr_t *)&ns[1];
1907 		len -= sizeof (nd_neighbor_solicit_t);
1908 		if (!ndp_verify_optlen(opt, len)) {
1909 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1910 			bad_solicit = B_TRUE;
1911 			goto done;
1912 		}
1913 	}
1914 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1915 		/* Check to see if this is a valid DAD solicitation */
1916 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1917 			if (ip_debug > 2) {
1918 				/* ip1dbg */
1919 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1920 				    "Destination is not solicited node "
1921 				    "multicast %s\n", AF_INET6,
1922 				    &ip6h->ip6_dst);
1923 			}
1924 			bad_solicit = B_TRUE;
1925 			goto done;
1926 		}
1927 	}
1928 
1929 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1930 	/*
1931 	 * If this is a valid Solicitation, a permanent
1932 	 * entry should exist in the cache
1933 	 */
1934 	if (our_nce == NULL ||
1935 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1936 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1937 		    "ifname=%s ", ill->ill_name));
1938 		if (ip_debug > 2) {
1939 			/* ip1dbg */
1940 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1941 		}
1942 		bad_solicit = B_TRUE;
1943 		goto done;
1944 	}
1945 
1946 	/* At this point we should have a verified NS per spec */
1947 	if (opt != NULL) {
1948 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1949 		if (opt != NULL) {
1950 			haddr = (uchar_t *)&opt[1];
1951 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1952 			    hlen == 0) {
1953 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1954 				bad_solicit = B_TRUE;
1955 				goto done;
1956 			}
1957 		}
1958 	}
1959 
1960 	/* If sending directly to peer, set the unicast flag */
1961 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1962 		flag |= NDP_UNICAST;
1963 
1964 	/*
1965 	 * Create/update the entry for the soliciting node.
1966 	 * or respond to outstanding queries, don't if
1967 	 * the source is unspecified address.
1968 	 */
1969 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1970 		int	err;
1971 		nce_t	*nnce;
1972 
1973 		ASSERT(ill->ill_isv6);
1974 		/*
1975 		 * Regular solicitations *must* include the Source Link-Layer
1976 		 * Address option.  Ignore messages that do not.
1977 		 */
1978 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1979 			ip1dbg(("ndp_input_solicit: source link-layer address "
1980 			    "option missing with a specified source.\n"));
1981 			bad_solicit = B_TRUE;
1982 			goto done;
1983 		}
1984 
1985 		/*
1986 		 * This is a regular solicitation.  If we're still in the
1987 		 * process of verifying the address, then don't respond at all
1988 		 * and don't keep track of the sender.
1989 		 */
1990 		if (our_nce->nce_state == ND_PROBE)
1991 			goto done;
1992 
1993 		/*
1994 		 * If the solicitation doesn't have sender hardware address
1995 		 * (legal for unicast solicitation), then process without
1996 		 * installing the return NCE.  Either we already know it, or
1997 		 * we'll be forced to look it up when (and if) we reply to the
1998 		 * packet.
1999 		 */
2000 		if (haddr == NULL)
2001 			goto no_source;
2002 
2003 		err = ndp_lookup_then_add(ill,
2004 		    haddr,
2005 		    &src,	/* Soliciting nodes address */
2006 		    &ipv6_all_ones,
2007 		    &ipv6_all_zeros,
2008 		    0,
2009 		    0,
2010 		    ND_STALE,
2011 		    &nnce,
2012 		    NULL,
2013 		    NULL);
2014 		switch (err) {
2015 		case 0:
2016 			/* done with this entry */
2017 			NCE_REFRELE(nnce);
2018 			break;
2019 		case EEXIST:
2020 			/*
2021 			 * B_FALSE indicates this is not an
2022 			 * advertisement.
2023 			 */
2024 			ndp_process(nnce, haddr, 0, B_FALSE);
2025 			NCE_REFRELE(nnce);
2026 			break;
2027 		default:
2028 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2029 			    err));
2030 			goto done;
2031 		}
2032 no_source:
2033 		flag |= NDP_SOLICITED;
2034 	} else {
2035 		/*
2036 		 * No source link layer address option should be present in a
2037 		 * valid DAD request.
2038 		 */
2039 		if (haddr != NULL) {
2040 			ip1dbg(("ndp_input_solicit: source link-layer address "
2041 			    "option present with an unspecified source.\n"));
2042 			bad_solicit = B_TRUE;
2043 			goto done;
2044 		}
2045 		if (our_nce->nce_state == ND_PROBE) {
2046 			/*
2047 			 * Internally looped-back probes won't have DLPI
2048 			 * attached to them.  External ones (which are sent by
2049 			 * multicast) always will.  Just ignore our own
2050 			 * transmissions.
2051 			 */
2052 			if (dl_mp != NULL) {
2053 				/*
2054 				 * If someone else is probing our address, then
2055 				 * we've crossed wires.  Declare failure.
2056 				 */
2057 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
2058 			}
2059 			goto done;
2060 		}
2061 		/*
2062 		 * This is a DAD probe.  Multicast the advertisement to the
2063 		 * all-nodes address.
2064 		 */
2065 		src = ipv6_all_hosts_mcast;
2066 	}
2067 	flag |= nce_advert_flags(our_nce);
2068 	/* Response to a solicitation */
2069 	(void) nce_xmit(ill,
2070 	    ND_NEIGHBOR_ADVERT,
2071 	    ill,	/* ill to be used for extracting ill_nd_lla */
2072 	    B_TRUE,	/* use ill_nd_lla */
2073 	    &target,	/* Source and target of the advertisement pkt */
2074 	    &src,	/* IP Destination (source of original pkt) */
2075 	    flag);
2076 done:
2077 	if (bad_solicit)
2078 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2079 	if (our_nce != NULL)
2080 		NCE_REFRELE(our_nce);
2081 }
2082 
2083 void
2084 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2085 {
2086 	nd_neighbor_advert_t *na;
2087 	uint32_t	hlen = ill->ill_nd_lla_len;
2088 	uchar_t		*haddr = NULL;
2089 	icmp6_t		*icmp_nd;
2090 	ip6_t		*ip6h;
2091 	nce_t		*dst_nce = NULL;
2092 	in6_addr_t	target;
2093 	nd_opt_hdr_t	*opt = NULL;
2094 	int		len;
2095 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2096 	ip_stack_t	*ipst = ill->ill_ipst;
2097 
2098 	ip6h = (ip6_t *)mp->b_rptr;
2099 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2100 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2101 	na = (nd_neighbor_advert_t *)icmp_nd;
2102 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2103 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2104 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2105 		    "solicited flag is not zero\n"));
2106 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2107 		return;
2108 	}
2109 	target = na->nd_na_target;
2110 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2111 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2112 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2113 		return;
2114 	}
2115 	if (len > sizeof (nd_neighbor_advert_t)) {
2116 		opt = (nd_opt_hdr_t *)&na[1];
2117 		if (!ndp_verify_optlen(opt,
2118 		    len - sizeof (nd_neighbor_advert_t))) {
2119 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2120 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2121 			return;
2122 		}
2123 		/* At this point we have a verified NA per spec */
2124 		len -= sizeof (nd_neighbor_advert_t);
2125 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2126 		if (opt != NULL) {
2127 			haddr = (uchar_t *)&opt[1];
2128 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2129 			    hlen == 0) {
2130 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2131 				BUMP_MIB(mib,
2132 				    ipv6IfIcmpInBadNeighborAdvertisements);
2133 				return;
2134 			}
2135 		}
2136 	}
2137 
2138 	/*
2139 	 * If this interface is part of the group look at all the
2140 	 * ills in the group.
2141 	 */
2142 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2143 	if (ill->ill_group != NULL)
2144 		ill = ill->ill_group->illgrp_ill;
2145 
2146 	for (; ill != NULL; ill = ill->ill_group_next) {
2147 		mutex_enter(&ill->ill_lock);
2148 		if (!ILL_CAN_LOOKUP(ill)) {
2149 			mutex_exit(&ill->ill_lock);
2150 			continue;
2151 		}
2152 		ill_refhold_locked(ill);
2153 		mutex_exit(&ill->ill_lock);
2154 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2155 		/* We have to drop the lock since ndp_process calls put* */
2156 		rw_exit(&ipst->ips_ill_g_lock);
2157 		if (dst_nce != NULL) {
2158 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2159 			    dst_nce->nce_state == ND_PROBE) {
2160 				/*
2161 				 * Someone else sent an advertisement for an
2162 				 * address that we're trying to configure.
2163 				 * Tear it down.  Note that dl_mp might be NULL
2164 				 * if we're getting a unicast reply.  This
2165 				 * isn't typically done (multicast is the norm
2166 				 * in response to a probe), but ip_ndp_failure
2167 				 * will handle the dl_mp == NULL case as well.
2168 				 */
2169 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2170 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2171 				/*
2172 				 * Someone just announced one of our local
2173 				 * addresses.  If it wasn't us, then this is a
2174 				 * conflict.  Defend the address or shut it
2175 				 * down.
2176 				 */
2177 				if (dl_mp != NULL &&
2178 				    (haddr == NULL ||
2179 				    nce_cmp_ll_addr(dst_nce, haddr,
2180 				    ill->ill_nd_lla_len))) {
2181 					ip_ndp_conflict(ill, mp, dl_mp,
2182 					    dst_nce);
2183 				}
2184 			} else {
2185 				if (na->nd_na_flags_reserved &
2186 				    ND_NA_FLAG_ROUTER) {
2187 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2188 				}
2189 				/* B_TRUE indicates this an advertisement */
2190 				ndp_process(dst_nce, haddr,
2191 				    na->nd_na_flags_reserved, B_TRUE);
2192 			}
2193 			NCE_REFRELE(dst_nce);
2194 		}
2195 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2196 		ill_refrele(ill);
2197 	}
2198 	rw_exit(&ipst->ips_ill_g_lock);
2199 }
2200 
2201 /*
2202  * Process NDP neighbor solicitation/advertisement messages.
2203  * The checksum has already checked o.k before reaching here.
2204  */
2205 void
2206 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2207 {
2208 	icmp6_t		*icmp_nd;
2209 	ip6_t		*ip6h;
2210 	int		len;
2211 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2212 
2213 
2214 	if (!pullupmsg(mp, -1)) {
2215 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2216 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2217 		goto done;
2218 	}
2219 	ip6h = (ip6_t *)mp->b_rptr;
2220 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2221 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2222 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2223 		goto done;
2224 	}
2225 	/*
2226 	 * NDP does not accept any extension headers between the
2227 	 * IP header and the ICMP header since e.g. a routing
2228 	 * header could be dangerous.
2229 	 * This assumes that any AH or ESP headers are removed
2230 	 * by ip prior to passing the packet to ndp_input.
2231 	 */
2232 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2233 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2234 		    ip6h->ip6_nxt));
2235 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2236 		goto done;
2237 	}
2238 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2239 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2240 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2241 	if (icmp_nd->icmp6_code != 0) {
2242 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2243 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2244 		goto done;
2245 	}
2246 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2247 	/*
2248 	 * Make sure packet length is large enough for either
2249 	 * a NS or a NA icmp packet.
2250 	 */
2251 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2252 		ip1dbg(("ndp_input: packet too short\n"));
2253 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2254 		goto done;
2255 	}
2256 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2257 		ndp_input_solicit(ill, mp, dl_mp);
2258 	} else {
2259 		ndp_input_advert(ill, mp, dl_mp);
2260 	}
2261 done:
2262 	freemsg(mp);
2263 }
2264 
2265 /*
2266  * nce_xmit is called to form and transmit a ND solicitation or
2267  * advertisement ICMP packet.
2268  *
2269  * If the source address is unspecified and this isn't a probe (used for
2270  * duplicate address detection), an appropriate source address and link layer
2271  * address will be chosen here.  The link layer address option is included if
2272  * the source is specified (i.e., all non-probe packets), and omitted (per the
2273  * specification) otherwise.
2274  *
2275  * It returns B_FALSE only if it does a successful put() to the
2276  * corresponding ill's ill_wq otherwise returns B_TRUE.
2277  */
2278 static boolean_t
2279 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2280     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2281     int flag)
2282 {
2283 	uint32_t	len;
2284 	icmp6_t 	*icmp6;
2285 	mblk_t		*mp;
2286 	ip6_t		*ip6h;
2287 	nd_opt_hdr_t	*opt;
2288 	uint_t		plen;
2289 	ip6i_t		*ip6i;
2290 	ipif_t		*src_ipif = NULL;
2291 	uint8_t		*hw_addr;
2292 	zoneid_t	zoneid = GLOBAL_ZONEID;
2293 
2294 	/*
2295 	 * If we have a unspecified source(sender) address, select a
2296 	 * proper source address for the solicitation here itself so
2297 	 * that we can initialize the h/w address correctly. This is
2298 	 * needed for interface groups as source address can come from
2299 	 * the whole group and the h/w address initialized from ill will
2300 	 * be wrong if the source address comes from a different ill.
2301 	 *
2302 	 * If the sender is specified then we use this address in order
2303 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2304 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2305 	 * by IP (we cannot guarantee that the global zone has an interface
2306 	 * route to the destination).
2307 	 *
2308 	 * Note that the NA never comes here with the unspecified source
2309 	 * address. The following asserts that whenever the source
2310 	 * address is specified, the haddr also should be specified.
2311 	 */
2312 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2313 
2314 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2315 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2316 		/*
2317 		 * Pick a source address for this solicitation, but
2318 		 * restrict the selection to addresses assigned to the
2319 		 * output interface (or interface group).  We do this
2320 		 * because the destination will create a neighbor cache
2321 		 * entry for the source address of this packet, so the
2322 		 * source address had better be a valid neighbor.
2323 		 */
2324 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2325 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2326 		if (src_ipif == NULL) {
2327 			char buf[INET6_ADDRSTRLEN];
2328 
2329 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2330 			    inet_ntop(AF_INET6, (char *)target, buf,
2331 			    sizeof (buf))));
2332 			return (B_TRUE);
2333 		}
2334 		sender = &src_ipif->ipif_v6src_addr;
2335 		hwaddr_ill = src_ipif->ipif_ill;
2336 	} else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2337 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
2338 		/*
2339 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2340 		 * ALL_ZONES if it cannot find a matching ipif for the address
2341 		 * we are trying to use. In this case we err on the side of
2342 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2343 		 */
2344 		if (zoneid == ALL_ZONES)
2345 			zoneid = GLOBAL_ZONEID;
2346 	}
2347 
2348 	/*
2349 	 * Always make sure that the NS/NA packets don't get load
2350 	 * spread. This is needed so that the probe packets sent
2351 	 * by the in.mpathd daemon can really go out on the desired
2352 	 * interface. Probe packets are made to go out on a desired
2353 	 * interface by including a ip6i with ATTACH_IF flag. As these
2354 	 * packets indirectly end up sending/receiving NS/NA packets
2355 	 * (neighbor doing NUD), we have to make sure that NA
2356 	 * also go out on the same interface.
2357 	 */
2358 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2359 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2360 	    plen * 8;
2361 	mp = allocb(len,  BPRI_LO);
2362 	if (mp == NULL) {
2363 		if (src_ipif != NULL)
2364 			ipif_refrele(src_ipif);
2365 		return (B_TRUE);
2366 	}
2367 	bzero((char *)mp->b_rptr, len);
2368 	mp->b_wptr = mp->b_rptr + len;
2369 
2370 	ip6i = (ip6i_t *)mp->b_rptr;
2371 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2372 	ip6i->ip6i_nxt = IPPROTO_RAW;
2373 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2374 	if (flag & NDP_PROBE)
2375 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2376 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2377 
2378 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2379 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2380 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2381 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2382 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2383 	ip6h->ip6_dst = *target;
2384 	icmp6 = (icmp6_t *)&ip6h[1];
2385 
2386 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2387 	    sizeof (nd_neighbor_advert_t));
2388 
2389 	if (operation == ND_NEIGHBOR_SOLICIT) {
2390 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2391 
2392 		if (!(flag & NDP_PROBE))
2393 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2394 		ip6h->ip6_src = *sender;
2395 		ns->nd_ns_target = *target;
2396 		if (!(flag & NDP_UNICAST)) {
2397 			/* Form multicast address of the target */
2398 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2399 			ip6h->ip6_dst.s6_addr32[3] |=
2400 			    ns->nd_ns_target.s6_addr32[3];
2401 		}
2402 	} else {
2403 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2404 
2405 		ASSERT(!(flag & NDP_PROBE));
2406 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2407 		ip6h->ip6_src = *sender;
2408 		na->nd_na_target = *sender;
2409 		if (flag & NDP_ISROUTER)
2410 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2411 		if (flag & NDP_SOLICITED)
2412 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2413 		if (flag & NDP_ORIDE)
2414 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2415 	}
2416 
2417 	hw_addr = NULL;
2418 	if (!(flag & NDP_PROBE)) {
2419 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2420 		    hwaddr_ill->ill_phys_addr;
2421 		if (hw_addr != NULL) {
2422 			/* Fill in link layer address and option len */
2423 			opt->nd_opt_len = (uint8_t)plen;
2424 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2425 		}
2426 	}
2427 	if (hw_addr == NULL) {
2428 		/* If there's no link layer address option, then strip it. */
2429 		len -= plen * 8;
2430 		mp->b_wptr = mp->b_rptr + len;
2431 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2432 	}
2433 
2434 	icmp6->icmp6_type = (uint8_t)operation;
2435 	icmp6->icmp6_code = 0;
2436 	/*
2437 	 * Prepare for checksum by putting icmp length in the icmp
2438 	 * checksum field. The checksum is calculated in ip_wput_v6.
2439 	 */
2440 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2441 
2442 	if (src_ipif != NULL)
2443 		ipif_refrele(src_ipif);
2444 
2445 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2446 	return (B_FALSE);
2447 }
2448 
2449 /*
2450  * Make a link layer address (does not include the SAP) from an nce.
2451  * To form the link layer address, use the last four bytes of ipv6
2452  * address passed in and the fixed offset stored in nce.
2453  */
2454 static void
2455 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2456 {
2457 	uchar_t *mask, *to;
2458 	ill_t	*ill = nce->nce_ill;
2459 	int 	len;
2460 
2461 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2462 		return;
2463 	ASSERT(nce->nce_res_mp != NULL);
2464 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2465 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2466 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2467 	ASSERT(addr != NULL);
2468 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2469 	    addrpos, ill->ill_nd_lla_len);
2470 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2471 	    IPV6_ADDR_LEN);
2472 	mask = (uchar_t *)&nce->nce_extract_mask;
2473 	mask += (IPV6_ADDR_LEN - len);
2474 	addr += (IPV6_ADDR_LEN - len);
2475 	to = addrpos + nce->nce_ll_extract_start;
2476 	while (len-- > 0)
2477 		*to++ |= *mask++ & *addr++;
2478 }
2479 
2480 /*
2481  * Pass a cache report back out via NDD.
2482  */
2483 /* ARGSUSED */
2484 int
2485 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
2486 {
2487 	ip_stack_t	*ipst;
2488 
2489 	if (CONN_Q(q))
2490 		ipst = CONNQ_TO_IPST(q);
2491 	else
2492 		ipst = ILLQ_TO_IPST(q);
2493 
2494 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
2495 			"     proto addr/mask");
2496 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp, ipst);
2497 	return (0);
2498 }
2499 
2500 /*
2501  * Add a single line to the NDP Cache Entry Report.
2502  */
2503 static void
2504 nce_report1(nce_t *nce, uchar_t *mp_arg)
2505 {
2506 	ill_t		*ill = nce->nce_ill;
2507 	char		local_buf[INET6_ADDRSTRLEN];
2508 	uchar_t		flags_buf[10];
2509 	uint32_t	flags = nce->nce_flags;
2510 	mblk_t		*mp = (mblk_t *)mp_arg;
2511 	uchar_t		*h;
2512 	uchar_t		*m = flags_buf;
2513 	in6_addr_t	v6addr;
2514 
2515 	/*
2516 	 * Lock the nce to protect nce_res_mp from being changed
2517 	 * if an external resolver address resolution completes
2518 	 * while nce_res_mp is being accessed here.
2519 	 *
2520 	 * Deal with all address formats, not just Ethernet-specific
2521 	 * In addition, make sure that the mblk has enough space
2522 	 * before writing to it. If is doesn't, allocate a new one.
2523 	 */
2524 	if (nce->nce_ipversion == IPV4_VERSION)
2525 		/* Don't include v4 nce_ts in NDP cache entry report */
2526 		return;
2527 
2528 	ASSERT(ill != NULL);
2529 	v6addr = nce->nce_mask;
2530 	if (flags & NCE_F_PERMANENT)
2531 		*m++ = 'P';
2532 	if (flags & NCE_F_ISROUTER)
2533 		*m++ = 'R';
2534 	if (flags & NCE_F_MAPPING)
2535 		*m++ = 'M';
2536 	*m = '\0';
2537 
2538 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
2539 		size_t		addrlen;
2540 		char		*addr_buf;
2541 		dl_unitdata_req_t	*dl;
2542 
2543 		mutex_enter(&nce->nce_lock);
2544 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2545 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
2546 		if (ill->ill_flags & ILLF_XRESOLV)
2547 			addrlen = (3 * (dl->dl_dest_addr_length));
2548 		else
2549 			addrlen = (3 * (ill->ill_nd_lla_len));
2550 		if (addrlen <= 0) {
2551 			mutex_exit(&nce->nce_lock);
2552 			(void) mi_mpprintf(mp,
2553 			    "%8s %9s %5s %s/%d",
2554 			    ill->ill_name,
2555 			    "None",
2556 			    (uchar_t *)&flags_buf,
2557 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2558 				(char *)local_buf, sizeof (local_buf)),
2559 				ip_mask_to_plen_v6(&v6addr));
2560 		} else {
2561 			/*
2562 			 * Convert the hardware/lla address to ascii
2563 			 */
2564 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
2565 			if (addr_buf == NULL) {
2566 				mutex_exit(&nce->nce_lock);
2567 				return;
2568 			}
2569 			(void) mac_colon_addr((uint8_t *)h,
2570 			    (ill->ill_flags & ILLF_XRESOLV) ?
2571 			    dl->dl_dest_addr_length : ill->ill_nd_lla_len,
2572 			    addr_buf, addrlen);
2573 			mutex_exit(&nce->nce_lock);
2574 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
2575 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
2576 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2577 				(char *)local_buf, sizeof (local_buf)),
2578 				ip_mask_to_plen_v6(&v6addr));
2579 			kmem_free(addr_buf, addrlen);
2580 		}
2581 	} else {
2582 		(void) mi_mpprintf(mp,
2583 		    "%8s %9s %5s %s/%d",
2584 		    ill->ill_name,
2585 		    "None",
2586 		    (uchar_t *)&flags_buf,
2587 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2588 			(char *)local_buf, sizeof (local_buf)),
2589 			ip_mask_to_plen_v6(&v6addr));
2590 	}
2591 }
2592 
2593 mblk_t *
2594 nce_udreq_alloc(ill_t *ill)
2595 {
2596 	mblk_t	*template_mp = NULL;
2597 	dl_unitdata_req_t *dlur;
2598 	int	sap_length;
2599 
2600 	ASSERT(ill->ill_isv6);
2601 
2602 	sap_length = ill->ill_sap_length;
2603 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2604 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2605 	if (template_mp == NULL)
2606 		return (NULL);
2607 
2608 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2609 	dlur->dl_priority.dl_min = 0;
2610 	dlur->dl_priority.dl_max = 0;
2611 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2612 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2613 
2614 	/* Copy in the SAP value. */
2615 	NCE_LL_SAP_COPY(ill, template_mp);
2616 
2617 	return (template_mp);
2618 }
2619 
2620 /*
2621  * NDP retransmit timer.
2622  * This timer goes off when:
2623  * a. It is time to retransmit NS for resolver.
2624  * b. It is time to send reachability probes.
 *
 * arg is the nce_t whose timer fired.  A reference is taken on the nce
 * for the duration of the handler and released on every exit path.
 * What is done depends on nce_state:
 *   ND_DELAY      - move to ND_PROBE and send a unicast solicitation.
 *   ND_PROBE      - count down nce_pcnt and then retransmit, delete the
 *                   nce, wait one more interval, or (for a permanent
 *                   entry) mark the address usable and advertise it.
 *   ND_INCOMPLETE - if packets are queued on the nce, solicit again or
 *                   give up on resolution and delete the nce.
 *   ND_REACHABLE  - send an unsolicited or defensive advertisement when
 *                   the entry calls for one.
 *   other states  - nothing beyond dropping the locks and the reference.
2625  */
2626 void
2627 ndp_timer(void *arg)
2628 {
2629 	nce_t		*nce = arg;
2630 	ill_t		*ill = nce->nce_ill;
2631 	uint32_t	ms;
2632 	char		addrbuf[INET6_ADDRSTRLEN];
2633 	mblk_t		*mp;
2634 	boolean_t	dropped = B_FALSE;
2635 	ip_stack_t	*ipst = ill->ill_ipst;
2636 
2637 	/*
2638 	 * The timer has to be cancelled by ndp_delete before doing the final
2639 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2640 	 * until it clears the timeout_id. Before clearing the timeout_id
2641 	 * bump up the refcnt so that we can continue to use the nce
2642 	 */
2643 	ASSERT(nce != NULL);
2644 
2645 	/*
2646 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2647 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2648 	 */
2649 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2650 	mutex_enter(&nce->nce_lock);
2651 	NCE_REFHOLD_LOCKED(nce);
2652 	nce->nce_timeout_id = 0;
2653 
2654 	/*
2655 	 * Check the reachability state first.
	 * Only the ND_INCOMPLETE case keeps ill_g_lock (for nce_solicit);
	 * every other case drops it straight away.
2656 	 */
2657 	switch (nce->nce_state) {
2658 	case ND_DELAY:
2659 		rw_exit(&ipst->ips_ill_g_lock);
2660 		nce->nce_state = ND_PROBE;
2661 		mutex_exit(&nce->nce_lock);
		/*
		 * The sender is passed as unspecified without NDP_PROBE, so
		 * nce_xmit selects the source address itself.
		 */
2662 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2663 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2664 		if (ip_debug > 3) {
2665 			/* ip2dbg */
2666 			pr_addr_dbg("ndp_timer: state for %s changed "
2667 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2668 		}
2669 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2670 		NCE_REFRELE(nce);
2671 		return;
2672 	case ND_PROBE:
2673 		/* must be retransmit timer */
2674 		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * nce_pcnt counts the probes still to be sent: positive
		 * means send another one, zero means the last one is out,
		 * negative means the wait after the last one has expired.
		 */
2675 		nce->nce_pcnt--;
2676 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2677 		    nce->nce_pcnt >= -1);
2678 		if (nce->nce_pcnt > 0) {
2679 			/*
2680 			 * As per RFC2461, the nce gets deleted after
2681 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2682 			 * Note that the first unicast solicitation is sent
2683 			 * during the DELAY state.
2684 			 */
2685 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2686 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2687 			    addrbuf, sizeof (addrbuf))));
2688 			mutex_exit(&nce->nce_lock);
			/*
			 * A permanent entry in PROBE state is one of our own
			 * addresses being verified, so it is sent as a probe
			 * (NDP_PROBE); anything else is a unicast
			 * reachability solicitation.
			 */
2689 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2690 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2691 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2692 			    NDP_UNICAST);
2693 			if (dropped) {
				/* Nothing was sent; don't count this attempt */
2694 				mutex_enter(&nce->nce_lock);
2695 				nce->nce_pcnt++;
2696 				mutex_exit(&nce->nce_lock);
2697 			}
2698 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2699 		} else if (nce->nce_pcnt < 0) {
2700 			/* No hope, delete the nce */
2701 			nce->nce_state = ND_UNREACHABLE;
2702 			mutex_exit(&nce->nce_lock);
2703 			if (ip_debug > 2) {
2704 				/* ip1dbg */
2705 				pr_addr_dbg("ndp_timer: Delete IRE for"
2706 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2707 			}
2708 			ndp_delete(nce);
2709 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2710 			/* Wait RetransTimer, before deleting the entry */
2711 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2712 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2713 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2714 			mutex_exit(&nce->nce_lock);
2715 			/* Wait one interval before killing */
2716 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2717 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2718 			ipif_t *ipif;
2719 
2720 			/*
2721 			 * We're done probing, and we can now declare this
2722 			 * address to be usable.  Let IP know that it's ok to
2723 			 * use.
2724 			 */
2725 			nce->nce_state = ND_REACHABLE;
2726 			mutex_exit(&nce->nce_lock);
2727 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2728 			    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
2729 			if (ipif != NULL) {
2730 				if (ipif->ipif_was_dup) {
2731 					char ibuf[LIFNAMSIZ + 10];
2732 					char sbuf[INET6_ADDRSTRLEN];
2733 
					/*
					 * The address had been marked as a
					 * duplicate; log its recovery as
					 * "<ill name>[:<ipif id>]".
					 */
2734 					ipif->ipif_was_dup = B_FALSE;
2735 					(void) strlcpy(ibuf, ill->ill_name,
2736 					    sizeof (ibuf));
2737 					(void) inet_ntop(AF_INET6,
2738 					    &ipif->ipif_v6lcl_addr,
2739 					    sbuf, sizeof (sbuf));
2740 					if (ipif->ipif_id != 0) {
2741 						(void) snprintf(ibuf +
2742 						    ill->ill_name_length - 1,
2743 						    sizeof (ibuf) -
2744 						    ill->ill_name_length + 1,
2745 						    ":%d", ipif->ipif_id);
2746 					}
2747 					cmn_err(CE_NOTE, "recovered address "
2748 					    "%s on %s", sbuf, ibuf);
2749 				}
				/*
				 * Send the notifications only the first time
				 * the address becomes ready on an UP ipif.
				 */
2750 				if ((ipif->ipif_flags & IPIF_UP) &&
2751 				    !ipif->ipif_addr_ready) {
2752 					ip_rts_ifmsg(ipif);
2753 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2754 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2755 				}
2756 				ipif->ipif_addr_ready = 1;
2757 				ipif_refrele(ipif);
2758 			}
2759 			/* Begin defending our new address */
2760 			nce->nce_unsolicit_count = 0;
2761 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2762 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2763 			    nce_advert_flags(nce));
2764 			if (dropped) {
				/*
				 * The advertisement did not go out; leave one
				 * pending so the ND_REACHABLE case retries it.
				 */
2765 				nce->nce_unsolicit_count = 1;
2766 				NDP_RESTART_TIMER(nce,
2767 				    ipst->ips_ip_ndp_unsolicit_interval);
2768 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2769 				NDP_RESTART_TIMER(nce,
2770 				    ipst->ips_ip_ndp_defense_interval);
2771 			}
2772 		} else {
2773 			/*
2774 			 * This is an address we're probing to be our own, but
2775 			 * the ill is down.  Wait until it comes back before
2776 			 * doing anything, but switch to reachable state so
2777 			 * that the restart will work.
2778 			 */
2779 			nce->nce_state = ND_REACHABLE;
2780 			mutex_exit(&nce->nce_lock);
2781 		}
2782 		NCE_REFRELE(nce);
2783 		return;
2784 	case ND_INCOMPLETE:
2785 		/*
2786 		 * Must be resolvers retransmit timer.
2787 		 */
2788 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2789 			ip6i_t	*ip6i;
2790 			ip6_t	*ip6h;
2791 			mblk_t *data_mp;
2792 
2793 			/*
2794 			 * Walk the list of packets queued, and see if there
2795 			 * are any multipathing probe packets. Such packets
2796 			 * are always queued at the head. Since this is a
2797 			 * retransmit timer firing, mark such packets as
2798 			 * delayed in ND resolution. This info will be used
2799 			 * in ip_wput_v6(). Multipathing probe packets will
2800 			 * always have an ip6i_t. Once we hit a packet without
2801 			 * it, we can break out of this loop.
2802 			 */
2803 			if (mp->b_datap->db_type == M_CTL)
2804 				data_mp = mp->b_cont;
2805 			else
2806 				data_mp = mp;
2807 
			/* A next header of IPPROTO_RAW marks a leading ip6i_t */
2808 			ip6h = (ip6_t *)data_mp->b_rptr;
2809 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2810 				break;
2811 
2812 			/*
2813 			 * This message should have been pulled up already in
2814 			 * ip_wput_v6. We can't do pullups here because the
2815 			 * b_next/b_prev is non-NULL.
2816 			 */
2817 			ip6i = (ip6i_t *)ip6h;
2818 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2819 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2820 
2821 			/* Mark this packet as delayed due to ND resolution */
2822 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2823 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2824 		}
2825 		if (nce->nce_qd_mp != NULL) {
			/*
			 * Packets are waiting on this entry.  nce_solicit
			 * returns the delay until the next retransmit; zero
			 * is treated as "no further solicitation".
			 */
2826 			ms = nce_solicit(nce, NULL);
2827 			rw_exit(&ipst->ips_ill_g_lock);
2828 			if (ms == 0) {
2829 				if (nce->nce_state != ND_REACHABLE) {
2830 					mutex_exit(&nce->nce_lock);
2831 					nce_resolv_failed(nce);
2832 					ndp_delete(nce);
2833 				} else {
2834 					mutex_exit(&nce->nce_lock);
2835 				}
2836 			} else {
2837 				mutex_exit(&nce->nce_lock);
2838 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2839 			}
2840 			NCE_REFRELE(nce);
2841 			return;
2842 		}
		/* Nothing is queued on the entry, so there is nothing to do */
2843 		mutex_exit(&nce->nce_lock);
2844 		rw_exit(&ipst->ips_ill_g_lock);
2845 		NCE_REFRELE(nce);
2846 		break;
2847 	case ND_REACHABLE :
2848 		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * Advertise if unsolicited advertisements are still owed for
		 * this entry, or if it is one of our own (permanent)
		 * addresses and periodic defense is enabled.
		 */
2849 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2850 		    nce->nce_unsolicit_count != 0) ||
2851 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2852 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2853 			if (nce->nce_unsolicit_count > 0)
2854 				nce->nce_unsolicit_count--;
2855 			mutex_exit(&nce->nce_lock);
2856 			dropped = nce_xmit(ill,
2857 			    ND_NEIGHBOR_ADVERT,
2858 			    ill,	/* ill to be used for hw addr */
2859 			    B_FALSE,	/* use ill_phys_addr */
2860 			    &nce->nce_addr,
2861 			    &ipv6_all_hosts_mcast,
2862 			    nce_advert_flags(nce));
2863 			if (dropped) {
				/* Not sent; put the advertisement back */
2864 				mutex_enter(&nce->nce_lock);
2865 				nce->nce_unsolicit_count++;
2866 				mutex_exit(&nce->nce_lock);
2867 			}
			/*
			 * Use the short unsolicited interval while
			 * advertisements remain, the defense interval after.
			 */
2868 			if (nce->nce_unsolicit_count != 0) {
2869 				NDP_RESTART_TIMER(nce,
2870 				    ipst->ips_ip_ndp_unsolicit_interval);
2871 			} else {
2872 				NDP_RESTART_TIMER(nce,
2873 				    ipst->ips_ip_ndp_defense_interval);
2874 			}
2875 		} else {
2876 			mutex_exit(&nce->nce_lock);
2877 		}
2878 		NCE_REFRELE(nce);
2879 		break;
2880 	default:
2881 		rw_exit(&ipst->ips_ill_g_lock);
2882 		mutex_exit(&nce->nce_lock);
2883 		NCE_REFRELE(nce);
2884 		break;
2885 	}
2886 }
2887 
2888 /*
2889  * Set a link layer address from the ll_addr passed in.
2890  * Copy SAP from ill.
2891  */
2892 static void
2893 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2894 {
2895 	ill_t	*ill = nce->nce_ill;
2896 	uchar_t	*woffset;
2897 
2898 	ASSERT(ll_addr != NULL);
2899 	/* Always called before fast_path_probe */
2900 	ASSERT(nce->nce_fp_mp == NULL);
2901 	if (ill->ill_sap_length != 0) {
2902 		/*
2903 		 * Copy the SAP type specified in the
2904 		 * request into the xmit template.
2905 		 */
2906 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2907 	}
2908 	if (ill->ill_phys_addr_length > 0) {
2909 		/*
2910 		 * The bcopy() below used to be called for the physical address
2911 		 * length rather than the link layer address length. For
2912 		 * ethernet and many other media, the phys_addr and lla are
2913 		 * identical.
2914 		 * However, with xresolv interfaces being introduced, the
2915 		 * phys_addr and lla are no longer the same, and the physical
2916 		 * address may not have any useful meaning, so we use the lla
2917 		 * for IPv6 address resolution and destination addressing.
2918 		 *
2919 		 * For PPP or other interfaces with a zero length
2920 		 * physical address, don't do anything here.
2921 		 * The bcopy() with a zero phys_addr length was previously
2922 		 * a no-op for interfaces with a zero-length physical address.
2923 		 * Using the lla for them would change the way they operate.
2924 		 * Doing nothing in such cases preserves expected behavior.
2925 		 */
2926 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2927 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2928 	}
2929 }
2930 
2931 static boolean_t
2932 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2933 {
2934 	ill_t	*ill = nce->nce_ill;
2935 	uchar_t	*ll_offset;
2936 
2937 	ASSERT(nce->nce_res_mp != NULL);
2938 	if (ll_addr == NULL)
2939 		return (B_FALSE);
2940 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2941 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2942 		return (B_TRUE);
2943 	return (B_FALSE);
2944 }
2945 
2946 /*
2947  * Updates the link layer address or the reachability state of
2948  * a cache entry.  Reset probe counter if needed.
2949  */
2950 static void
2951 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2952 {
2953 	ill_t	*ill = nce->nce_ill;
2954 	boolean_t need_stop_timer = B_FALSE;
2955 	boolean_t need_fastpath_update = B_FALSE;
2956 
2957 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2958 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2959 	/*
2960 	 * If this interface does not do NUD, there is no point
2961 	 * in allowing an update to the cache entry.  Although
2962 	 * we will respond to NS.
2963 	 * The only time we accept an update for a resolver when
2964 	 * NUD is turned off is when it has just been created.
2965 	 * Non-Resolvers will always be created as REACHABLE.
2966 	 */
2967 	if (new_state != ND_UNCHANGED) {
2968 		if ((nce->nce_flags & NCE_F_NONUD) &&
2969 		    (nce->nce_state != ND_INCOMPLETE))
2970 			return;
2971 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2972 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2973 		need_stop_timer = B_TRUE;
2974 		if (new_state == ND_REACHABLE)
2975 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2976 		else {
2977 			/* We force NUD in this case */
2978 			nce->nce_last = 0;
2979 		}
2980 		nce->nce_state = new_state;
2981 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2982 	}
2983 	/*
2984 	 * In case of fast path we need to free the the fastpath
2985 	 * M_DATA and do another probe.  Otherwise we can just
2986 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2987 	 * whatever packets that happens to be transmitting at the time.
2988 	 */
2989 	if (new_ll_addr != NULL) {
2990 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2991 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2992 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2993 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2994 		if (nce->nce_fp_mp != NULL) {
2995 			freemsg(nce->nce_fp_mp);
2996 			nce->nce_fp_mp = NULL;
2997 		}
2998 		need_fastpath_update = B_TRUE;
2999 	}
3000 	mutex_exit(&nce->nce_lock);
3001 	if (need_stop_timer) {
3002 		(void) untimeout(nce->nce_timeout_id);
3003 		nce->nce_timeout_id = 0;
3004 	}
3005 	if (need_fastpath_update)
3006 		nce_fastpath(nce);
3007 	mutex_enter(&nce->nce_lock);
3008 }
3009 
3010 void
3011 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
3012 {
3013 	uint_t	count = 0;
3014 	mblk_t  **mpp;
3015 
3016 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3017 
3018 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
3019 	    mpp = &(*mpp)->b_next) {
3020 		if (++count >
3021 		    nce->nce_ill->ill_max_buf) {
3022 			mblk_t *tmp = nce->nce_qd_mp->b_next;
3023 
3024 			nce->nce_qd_mp->b_next = NULL;
3025 			nce->nce_qd_mp->b_prev = NULL;
3026 			freemsg(nce->nce_qd_mp);
3027 			nce->nce_qd_mp = tmp;
3028 		}
3029 	}
3030 	/* put this on the list */
3031 	if (head_insert) {
3032 		mp->b_next = nce->nce_qd_mp;
3033 		nce->nce_qd_mp = mp;
3034 	} else {
3035 		*mpp = mp;
3036 	}
3037 }
3038 
3039 static void
3040 nce_queue_mp(nce_t *nce, mblk_t *mp)
3041 {
3042 	boolean_t head_insert = B_FALSE;
3043 	ip6_t	*ip6h;
3044 	ip6i_t	*ip6i;
3045 	mblk_t *data_mp;
3046 
3047 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3048 
3049 	if (mp->b_datap->db_type == M_CTL)
3050 		data_mp = mp->b_cont;
3051 	else
3052 		data_mp = mp;
3053 	ip6h = (ip6_t *)data_mp->b_rptr;
3054 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
3055 		/*
3056 		 * This message should have been pulled up already in
3057 		 * ip_wput_v6. We can't do pullups here because the message
3058 		 * could be from the nce_qd_mp which could have b_next/b_prev
3059 		 * non-NULL.
3060 		 */
3061 		ip6i = (ip6i_t *)ip6h;
3062 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
3063 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3064 		/*
3065 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
3066 		 * This has 2 aspects mentioned below.
3067 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
3068 		 * This ensures that next retransmit of ND solicitation
3069 		 * will use the interface specified by the probe packet,
3070 		 * for both NS and NA. This corresponds to the src address
3071 		 * in the IPv6 packet. If we insert at tail, we will be
3072 		 * depending on the packet at the head for successful
3073 		 * ND resolution. This is not reliable, because the interface
3074 		 * on which the NA arrives could be different from the interface
3075 		 * on which the NS was sent, and if the receiving interface is
3076 		 * failed, it will appear that the sending interface is also
3077 		 * failed, causing in.mpathd to misdiagnose this as link
3078 		 * failure.
3079 		 * 2. Drop the original packet, if the ND resolution did not
3080 		 * succeed in the first attempt. However we will create the
3081 		 * nce and the ire, as soon as the ND resolution succeeds.
3082 		 * We don't gain anything by queueing multiple probe packets
3083 		 * and sending them back-to-back once resolution succeeds.
3084 		 * It is sufficient to send just 1 packet after ND resolution
3085 		 * succeeds. Since mpathd is sending down probe packets at a
3086 		 * constant rate, we don't need to send the queued packet. We
3087 		 * need to queue it only for NDP resolution. The benefit of
3088 		 * dropping the probe packets that were delayed in ND
3089 		 * resolution, is that in.mpathd will not see inflated
3090 		 * RTT. If the ND resolution does not succeed within
3091 		 * in.mpathd's failure detection time, mpathd may detect
3092 		 * a failure, and it does not matter whether the packet
3093 		 * was queued or dropped.
3094 		 */
3095 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
3096 			head_insert = B_TRUE;
3097 	}
3098 
3099 	nce_queue_mp_common(nce, mp, head_insert);
3100 }
3101 
3102 /*
3103  * Called when address resolution failed due to a timeout.
3104  * Send an ICMP unreachable in response to all queued packets.
3105  */
3106 void
3107 nce_resolv_failed(nce_t *nce)
3108 {
3109 	mblk_t	*mp, *nxt_mp, *first_mp;
3110 	char	buf[INET6_ADDRSTRLEN];
3111 	ip6_t *ip6h;
3112 	zoneid_t zoneid = GLOBAL_ZONEID;
3113 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
3114 
3115 	ip1dbg(("nce_resolv_failed: dst %s\n",
3116 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3117 	mutex_enter(&nce->nce_lock);
3118 	mp = nce->nce_qd_mp;
3119 	nce->nce_qd_mp = NULL;
3120 	mutex_exit(&nce->nce_lock);
3121 	while (mp != NULL) {
3122 		nxt_mp = mp->b_next;
3123 		mp->b_next = NULL;
3124 		mp->b_prev = NULL;
3125 
3126 		first_mp = mp;
3127 		if (mp->b_datap->db_type == M_CTL) {
3128 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3129 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3130 			zoneid = io->ipsec_out_zoneid;
3131 			ASSERT(zoneid != ALL_ZONES);
3132 			mp = mp->b_cont;
3133 		}
3134 
3135 		ip6h = (ip6_t *)mp->b_rptr;
3136 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3137 			ip6i_t *ip6i;
3138 			/*
3139 			 * This message should have been pulled up already
3140 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3141 			 * the header is pulled up.
3142 			 */
3143 			ip6i = (ip6i_t *)ip6h;
3144 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3145 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3146 			mp->b_rptr += sizeof (ip6i_t);
3147 		}
3148 		/*
3149 		 * Ignore failure since icmp_unreachable_v6 will silently
3150 		 * drop packets with an unspecified source address.
3151 		 */
3152 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
3153 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3154 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
3155 		mp = nxt_mp;
3156 	}
3157 }
3158 
3159 /*
3160  * Called by SIOCSNDP* ioctl to add/change an nce entry
3161  * and the corresponding attributes.
3162  * Disallow states other than ND_REACHABLE or ND_STALE.
3163  */
3164 int
3165 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3166 {
3167 	sin6_t		*sin6;
3168 	in6_addr_t	*addr;
3169 	nce_t		*nce;
3170 	int		err;
3171 	uint16_t	new_flags = 0;
3172 	uint16_t	old_flags = 0;
3173 	int		inflags = lnr->lnr_flags;
3174 	ip_stack_t	*ipst = ill->ill_ipst;
3175 
3176 	ASSERT(ill->ill_isv6);
3177 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3178 	    (lnr->lnr_state_create != ND_STALE))
3179 		return (EINVAL);
3180 
3181 	sin6 = (sin6_t *)&lnr->lnr_addr;
3182 	addr = &sin6->sin6_addr;
3183 
3184 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3185 	/* We know it can not be mapping so just look in the hash table */
3186 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
3187 	nce = nce_lookup_addr(ill, addr, nce);
3188 	if (nce != NULL)
3189 		new_flags = nce->nce_flags;
3190 
3191 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3192 	case NDF_ISROUTER_ON:
3193 		new_flags |= NCE_F_ISROUTER;
3194 		break;
3195 	case NDF_ISROUTER_OFF:
3196 		new_flags &= ~NCE_F_ISROUTER;
3197 		break;
3198 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3199 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3200 		if (nce != NULL)
3201 			NCE_REFRELE(nce);
3202 		return (EINVAL);
3203 	}
3204 
3205 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3206 	case NDF_ANYCAST_ON:
3207 		new_flags |= NCE_F_ANYCAST;
3208 		break;
3209 	case NDF_ANYCAST_OFF:
3210 		new_flags &= ~NCE_F_ANYCAST;
3211 		break;
3212 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3213 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3214 		if (nce != NULL)
3215 			NCE_REFRELE(nce);
3216 		return (EINVAL);
3217 	}
3218 
3219 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
3220 	case NDF_PROXY_ON:
3221 		new_flags |= NCE_F_PROXY;
3222 		break;
3223 	case NDF_PROXY_OFF:
3224 		new_flags &= ~NCE_F_PROXY;
3225 		break;
3226 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
3227 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3228 		if (nce != NULL)
3229 			NCE_REFRELE(nce);
3230 		return (EINVAL);
3231 	}
3232 
3233 	if (nce == NULL) {
3234 		err = ndp_add(ill,
3235 		    (uchar_t *)lnr->lnr_hdw_addr,
3236 		    addr,
3237 		    &ipv6_all_ones,
3238 		    &ipv6_all_zeros,
3239 		    0,
3240 		    new_flags,
3241 		    lnr->lnr_state_create,
3242 		    &nce,
3243 		    NULL,
3244 		    NULL);
3245 		if (err != 0) {
3246 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3247 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3248 			return (err);
3249 		}
3250 	}
3251 	old_flags = nce->nce_flags;
3252 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3253 		/*
3254 		 * Router turned to host, delete all ires.
3255 		 * XXX Just delete the entry, but we need to add too.
3256 		 */
3257 		nce->nce_flags &= ~NCE_F_ISROUTER;
3258 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3259 		ndp_delete(nce);
3260 		NCE_REFRELE(nce);
3261 		return (0);
3262 	}
3263 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3264 
3265 	mutex_enter(&nce->nce_lock);
3266 	nce->nce_flags = new_flags;
3267 	mutex_exit(&nce->nce_lock);
3268 	/*
3269 	 * Note that we ignore the state at this point, which
3270 	 * should be either STALE or REACHABLE.  Instead we let
3271 	 * the link layer address passed in to determine the state
3272 	 * much like incoming packets.
3273 	 */
3274 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3275 	NCE_REFRELE(nce);
3276 	return (0);
3277 }
3278 
3279 /*
3280  * If the device driver supports it, we make nce_fp_mp to have
3281  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3282  * The caller insures there is hold on nce for this function.
3283  * Note that since ill_fastpath_probe() copies the mblk there is
3284  * no need for the hold beyond this function.
3285  */
3286 void
3287 nce_fastpath(nce_t *nce)
3288 {
3289 	ill_t	*ill = nce->nce_ill;
3290 	int res;
3291 
3292 	ASSERT(ill != NULL);
3293 	if ((nce->nce_fp_mp != NULL) ||
3294 	    !(ire_nce_valid_dlureq_mp(nce->nce_res_mp))) {
3295 		/*
3296 		 * Already contains fastpath info or nce is not
3297 		 * resolved, so cant process fastpath yet.
3298 		 */
3299 		return;
3300 	}
3301 	if (nce->nce_res_mp != NULL) {
3302 		nce_fastpath_list_add(nce);
3303 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3304 		/*
3305 		 * EAGAIN is an indication of a transient error
3306 		 * i.e. allocation failure etc. leave the nce in the list it
3307 		 * will be updated when another probe happens for another ire
3308 		 * if not it will be taken out of the list when the ire is
3309 		 * deleted.
3310 		 */
3311 
3312 		if (res != 0 && res != EAGAIN)
3313 			nce_fastpath_list_delete(nce);
3314 	}
3315 }
3316 
3317 /*
3318  * Drain the list of nce's waiting for fastpath response.
3319  */
3320 void
3321 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3322     void *arg)
3323 {
3324 
3325 	nce_t *next_nce;
3326 	nce_t *current_nce;
3327 	nce_t *first_nce;
3328 	nce_t *prev_nce = NULL;
3329 
3330 	mutex_enter(&ill->ill_lock);
3331 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3332 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3333 		next_nce = current_nce->nce_fastpath;
3334 		/*
3335 		 * Take it off the list if we're flushing, or if the callback
3336 		 * routine tells us to do so.  Otherwise, leave the nce in the
3337 		 * fastpath list to handle any pending response from the lower
3338 		 * layer.  We can't drain the list when the callback routine
3339 		 * comparison failed, because the response is asynchronous in
3340 		 * nature, and may not arrive in the same order as the list
3341 		 * insertion.
3342 		 */
3343 		if (func == NULL || func(current_nce, arg)) {
3344 			current_nce->nce_fastpath = NULL;
3345 			if (current_nce == first_nce)
3346 				ill->ill_fastpath_list = first_nce = next_nce;
3347 			else
3348 				prev_nce->nce_fastpath = next_nce;
3349 		} else {
3350 			/* previous element that is still in the list */
3351 			prev_nce = current_nce;
3352 		}
3353 		current_nce = next_nce;
3354 	}
3355 	mutex_exit(&ill->ill_lock);
3356 }
3357 
3358 /*
3359  * Add nce to the nce fastpath list.
3360  */
3361 void
3362 nce_fastpath_list_add(nce_t *nce)
3363 {
3364 	ill_t *ill;
3365 
3366 	ill = nce->nce_ill;
3367 
3368 	mutex_enter(&ill->ill_lock);
3369 	mutex_enter(&nce->nce_lock);
3370 
3371 	/*
3372 	 * if nce has not been deleted and
3373 	 * is not already in the list add it.
3374 	 */
3375 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3376 	    (nce->nce_fastpath == NULL)) {
3377 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3378 		ill->ill_fastpath_list = nce;
3379 	}
3380 
3381 	mutex_exit(&nce->nce_lock);
3382 	mutex_exit(&ill->ill_lock);
3383 }
3384 
3385 /*
3386  * remove nce from the nce fastpath list.
3387  */
3388 void
3389 nce_fastpath_list_delete(nce_t *nce)
3390 {
3391 	nce_t *nce_ptr;
3392 
3393 	ill_t *ill;
3394 
3395 	ill = nce->nce_ill;
3396 	ASSERT(ill != NULL);
3397 
3398 	mutex_enter(&ill->ill_lock);
3399 	if (nce->nce_fastpath == NULL)
3400 		goto done;
3401 
3402 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3403 
3404 	if (ill->ill_fastpath_list == nce) {
3405 		ill->ill_fastpath_list = nce->nce_fastpath;
3406 	} else {
3407 		nce_ptr = ill->ill_fastpath_list;
3408 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3409 			if (nce_ptr->nce_fastpath == nce) {
3410 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3411 				break;
3412 			}
3413 			nce_ptr = nce_ptr->nce_fastpath;
3414 		}
3415 	}
3416 
3417 	nce->nce_fastpath = NULL;
3418 done:
3419 	mutex_exit(&ill->ill_lock);
3420 }
3421 
3422 /*
3423  * Update all NCE's that are not in fastpath mode and
3424  * have an nce_fp_mp that matches mp. mp->b_cont contains
3425  * the fastpath header.
3426  *
3427  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3428  */
3429 boolean_t
3430 ndp_fastpath_update(nce_t *nce, void *arg)
3431 {
3432 	mblk_t 	*mp, *fp_mp;
3433 	uchar_t	*mp_rptr, *ud_mp_rptr;
3434 	mblk_t	*ud_mp = nce->nce_res_mp;
3435 	ptrdiff_t	cmplen;
3436 
3437 	if (nce->nce_flags & NCE_F_MAPPING)
3438 		return (B_TRUE);
3439 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3440 		return (B_TRUE);
3441 
3442 	ip2dbg(("ndp_fastpath_update: trying\n"));
3443 	mp = (mblk_t *)arg;
3444 	mp_rptr = mp->b_rptr;
3445 	cmplen = mp->b_wptr - mp_rptr;
3446 	ASSERT(cmplen >= 0);
3447 	ud_mp_rptr = ud_mp->b_rptr;
3448 	/*
3449 	 * The nce is locked here to prevent any other threads
3450 	 * from accessing and changing nce_res_mp when the IPv6 address
3451 	 * becomes resolved to an lla while we're in the middle
3452 	 * of looking at and comparing the hardware address (lla).
3453 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3454 	 * from examining nce_res_mp atthe same time.
3455 	 */
3456 	mutex_enter(&nce->nce_lock);
3457 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3458 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3459 		mutex_exit(&nce->nce_lock);
3460 		/*
3461 		 * Don't take the ire off the fastpath list yet,
3462 		 * since the response may come later.
3463 		 */
3464 		return (B_FALSE);
3465 	}
3466 	/* Matched - install mp as the fastpath mp */
3467 	ip1dbg(("ndp_fastpath_update: match\n"));
3468 	fp_mp = dupb(mp->b_cont);
3469 	if (fp_mp != NULL) {
3470 		nce->nce_fp_mp = fp_mp;
3471 	}
3472 	mutex_exit(&nce->nce_lock);
3473 	return (B_TRUE);
3474 }
3475 
3476 /*
3477  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3478  * driver.  Note that it assumes IP is exclusive...
3479  */
3480 /* ARGSUSED */
3481 void
3482 ndp_fastpath_flush(nce_t *nce, char *arg)
3483 {
3484 	if (nce->nce_flags & NCE_F_MAPPING)
3485 		return;
3486 	/* No fastpath info? */
3487 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3488 		return;
3489 
3490 	if (nce->nce_ipversion == IPV4_VERSION &&
3491 	    nce->nce_flags & NCE_F_BCAST) {
3492 		/*
3493 		 * IPv4 BROADCAST entries:
3494 		 * We can't delete the nce since it is difficult to
3495 		 * recreate these without going through the
3496 		 * ipif down/up dance.
3497 		 *
3498 		 * All access to nce->nce_fp_mp in the case of these
3499 		 * is protected by nce_lock.
3500 		 */
3501 		mutex_enter(&nce->nce_lock);
3502 		if (nce->nce_fp_mp != NULL) {
3503 			freeb(nce->nce_fp_mp);
3504 			nce->nce_fp_mp = NULL;
3505 			mutex_exit(&nce->nce_lock);
3506 			nce_fastpath(nce);
3507 		} else {
3508 			mutex_exit(&nce->nce_lock);
3509 		}
3510 	} else {
3511 		/* Just delete the NCE... */
3512 		ndp_delete(nce);
3513 	}
3514 }
3515 
3516 /*
3517  * Return a pointer to a given option in the packet.
3518  * Assumes that option part of the packet have already been validated.
3519  */
3520 nd_opt_hdr_t *
3521 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3522 {
3523 	while (optlen > 0) {
3524 		if (opt->nd_opt_type == opt_type)
3525 			return (opt);
3526 		optlen -= 8 * opt->nd_opt_len;
3527 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3528 	}
3529 	return (NULL);
3530 }
3531 
3532 /*
3533  * Verify all option lengths present are > 0, also check to see
3534  * if the option lengths and packet length are consistent.
3535  */
3536 boolean_t
3537 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3538 {
3539 	ASSERT(opt != NULL);
3540 	while (optlen > 0) {
3541 		if (opt->nd_opt_len == 0)
3542 			return (B_FALSE);
3543 		optlen -= 8 * opt->nd_opt_len;
3544 		if (optlen < 0)
3545 			return (B_FALSE);
3546 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3547 	}
3548 	return (B_TRUE);
3549 }
3550 
3551 /*
3552  * ndp_walk function.
3553  * Free a fraction of the NCE cache entries.
3554  * A fraction of zero means to not free any in that category.
3555  */
3556 void
3557 ndp_cache_reclaim(nce_t *nce, char *arg)
3558 {
3559 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3560 	uint_t	rand;
3561 
3562 	if (nce->nce_flags & NCE_F_PERMANENT)
3563 		return;
3564 
3565 	rand = (uint_t)lbolt +
3566 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3567 	if (ncr->ncr_host != 0 &&
3568 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3569 		ndp_delete(nce);
3570 		return;
3571 	}
3572 }
3573 
3574 /*
3575  * ndp_walk function.
3576  * Count the number of NCEs that can be deleted.
3577  * These would be hosts but not routers.
3578  */
3579 void
3580 ndp_cache_count(nce_t *nce, char *arg)
3581 {
3582 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3583 
3584 	if (nce->nce_flags & NCE_F_PERMANENT)
3585 		return;
3586 
3587 	ncc->ncc_total++;
3588 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3589 		ncc->ncc_host++;
3590 }
3591 
3592 #ifdef NCE_DEBUG
3593 th_trace_t *
3594 th_trace_nce_lookup(nce_t *nce)
3595 {
3596 	int bucket_id;
3597 	th_trace_t *th_trace;
3598 
3599 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3600 
3601 	bucket_id = IP_TR_HASH(curthread);
3602 	ASSERT(bucket_id < IP_TR_HASH_MAX);
3603 
3604 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
3605 	    th_trace = th_trace->th_next) {
3606 		if (th_trace->th_id == curthread)
3607 			return (th_trace);
3608 	}
3609 	return (NULL);
3610 }
3611 
3612 void
3613 nce_trace_ref(nce_t *nce)
3614 {
3615 	int bucket_id;
3616 	th_trace_t *th_trace;
3617 
3618 	/*
3619 	 * Attempt to locate the trace buffer for the curthread.
3620 	 * If it does not exist, then allocate a new trace buffer
3621 	 * and link it in list of trace bufs for this ipif, at the head
3622 	 */
3623 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3624 
3625 	if (nce->nce_trace_disable == B_TRUE)
3626 		return;
3627 
3628 	th_trace = th_trace_nce_lookup(nce);
3629 	if (th_trace == NULL) {
3630 		bucket_id = IP_TR_HASH(curthread);
3631 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
3632 		    KM_NOSLEEP);
3633 		if (th_trace == NULL) {
3634 			nce->nce_trace_disable = B_TRUE;
3635 			nce_trace_inactive(nce);
3636 			return;
3637 		}
3638 		th_trace->th_id = curthread;
3639 		th_trace->th_next = nce->nce_trace[bucket_id];
3640 		th_trace->th_prev = &nce->nce_trace[bucket_id];
3641 		if (th_trace->th_next != NULL)
3642 			th_trace->th_next->th_prev = &th_trace->th_next;
3643 		nce->nce_trace[bucket_id] = th_trace;
3644 	}
3645 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
3646 	th_trace->th_refcnt++;
3647 	th_trace_rrecord(th_trace);
3648 }
3649 
3650 void
3651 nce_untrace_ref(nce_t *nce)
3652 {
3653 	th_trace_t *th_trace;
3654 
3655 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3656 
3657 	if (nce->nce_trace_disable == B_TRUE)
3658 		return;
3659 
3660 	th_trace = th_trace_nce_lookup(nce);
3661 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
3662 
3663 	th_trace_rrecord(th_trace);
3664 	th_trace->th_refcnt--;
3665 }
3666 
3667 void
3668 nce_trace_inactive(nce_t *nce)
3669 {
3670 	th_trace_t *th_trace;
3671 	int i;
3672 
3673 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3674 
3675 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
3676 		while (nce->nce_trace[i] != NULL) {
3677 			th_trace = nce->nce_trace[i];
3678 
3679 			/* unlink th_trace and free it */
3680 			nce->nce_trace[i] = th_trace->th_next;
3681 			if (th_trace->th_next != NULL)
3682 				th_trace->th_next->th_prev =
3683 				    &nce->nce_trace[i];
3684 
3685 			th_trace->th_next = NULL;
3686 			th_trace->th_prev = NULL;
3687 			kmem_free(th_trace, sizeof (th_trace_t));
3688 		}
3689 	}
3690 
3691 }
3692 
3693 /* ARGSUSED */
3694 int
3695 nce_thread_exit(nce_t *nce, caddr_t arg)
3696 {
3697 	th_trace_t	*th_trace;
3698 
3699 	mutex_enter(&nce->nce_lock);
3700 	th_trace = th_trace_nce_lookup(nce);
3701 
3702 	if (th_trace == NULL) {
3703 		mutex_exit(&nce->nce_lock);
3704 		return (0);
3705 	}
3706 
3707 	ASSERT(th_trace->th_refcnt == 0);
3708 
3709 	/* unlink th_trace and free it */
3710 	*th_trace->th_prev = th_trace->th_next;
3711 	if (th_trace->th_next != NULL)
3712 		th_trace->th_next->th_prev = th_trace->th_prev;
3713 	th_trace->th_next = NULL;
3714 	th_trace->th_prev = NULL;
3715 	kmem_free(th_trace, sizeof (th_trace_t));
3716 	mutex_exit(&nce->nce_lock);
3717 	return (0);
3718 }
3719 #endif
3720 
3721 /*
3722  * Called when address resolution fails due to a timeout.
3723  * Send an ICMP unreachable in response to all queued packets.
3724  */
3725 void
3726 arp_resolv_failed(nce_t *nce)
3727 {
3728 	mblk_t	*mp, *nxt_mp, *first_mp;
3729 	char	buf[INET6_ADDRSTRLEN];
3730 	zoneid_t zoneid = GLOBAL_ZONEID;
3731 	struct in_addr ipv4addr;
3732 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3733 
3734 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3735 	ip3dbg(("arp_resolv_failed: dst %s\n",
3736 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3737 	mutex_enter(&nce->nce_lock);
3738 	mp = nce->nce_qd_mp;
3739 	nce->nce_qd_mp = NULL;
3740 	mutex_exit(&nce->nce_lock);
3741 
3742 	while (mp != NULL) {
3743 		nxt_mp = mp->b_next;
3744 		mp->b_next = NULL;
3745 		mp->b_prev = NULL;
3746 
3747 		first_mp = mp;
3748 		/*
3749 		 * Send icmp unreachable messages
3750 		 * to the hosts.
3751 		 */
3752 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3753 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3754 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3755 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3756 		mp = nxt_mp;
3757 	}
3758 }
3759 
3760 static int
3761 ndp_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3762     const in_addr_t *mask, const in_addr_t *extract_mask,
3763     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3764     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3765 {
3766 	int	err = 0;
3767 	nce_t	*nce;
3768 	in6_addr_t addr6;
3769 	ip_stack_t *ipst = ill->ill_ipst;
3770 
3771 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3772 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3773 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3774 	nce = nce_lookup_addr(ill, &addr6, nce);
3775 	if (nce == NULL) {
3776 		err = ndp_add_v4(ill,
3777 		    hw_addr,
3778 		    addr,
3779 		    mask,
3780 		    extract_mask,
3781 		    hw_extract_start,
3782 		    flags,
3783 		    state,
3784 		    newnce,
3785 		    fp_mp,
3786 		    res_mp);
3787 	} else {
3788 		*newnce = nce;
3789 		err = EEXIST;
3790 	}
3791 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3792 	return (err);
3793 }
3794 
3795 /*
3796  * NDP Cache Entry creation routine for IPv4.
3797  * Mapped entries are handled in arp.
3798  * This routine must always be called with ndp4->ndp_g_lock held.
3799  * Prior to return, nce_refcnt is incremented.
3800  */
3801 static int
3802 ndp_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3803     const in_addr_t *mask, const in_addr_t *extract_mask,
3804     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3805     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3806 {
3807 	static	nce_t		nce_nil;
3808 	nce_t		*nce;
3809 	mblk_t		*mp;
3810 	mblk_t		*template;
3811 	nce_t		**ncep;
3812 	ip_stack_t	*ipst = ill->ill_ipst;
3813 
3814 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3815 	ASSERT(ill != NULL);
3816 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
3817 		return (EINVAL);
3818 	}
3819 	ASSERT((flags & NCE_F_MAPPING) == 0);
3820 	ASSERT(extract_mask == NULL);
3821 	/*
3822 	 * Allocate the mblk to hold the nce.
3823 	 */
3824 	mp = allocb(sizeof (nce_t), BPRI_MED);
3825 	if (mp == NULL)
3826 		return (ENOMEM);
3827 
3828 	nce = (nce_t *)mp->b_rptr;
3829 	mp->b_wptr = (uchar_t *)&nce[1];
3830 	*nce = nce_nil;
3831 
3832 	/*
3833 	 * This one holds link layer address; if res_mp has been provided
3834 	 * by the caller, accept it without any further checks. Otherwise,
3835 	 * for V4, we fill it up with ill_resolver_mp here, then in
3836 	 * in ire_arpresolve(), we fill it up with the ARP query
3837 	 * once its formulated.
3838 	 */
3839 	if (res_mp != NULL) {
3840 		template = res_mp;
3841 	} else  {
3842 		if (ill->ill_resolver_mp == NULL) {
3843 			freeb(mp);
3844 			return (EINVAL);
3845 		}
3846 		template = copyb(ill->ill_resolver_mp);
3847 	}
3848 	if (template == NULL) {
3849 		freeb(mp);
3850 		return (ENOMEM);
3851 	}
3852 	nce->nce_ill = ill;
3853 	nce->nce_ipversion = IPV4_VERSION;
3854 	nce->nce_flags = flags;
3855 	nce->nce_state = state;
3856 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3857 	nce->nce_rcnt = ill->ill_xmit_count;
3858 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3859 	if (*mask == IP_HOST_MASK) {
3860 		nce->nce_mask = ipv6_all_ones;
3861 	} else  {
3862 		IN6_IPADDR_TO_V4MAPPED(*mask, &nce->nce_mask);
3863 	}
3864 	nce->nce_extract_mask = ipv6_all_zeros;
3865 	nce->nce_ll_extract_start = hw_extract_start;
3866 	nce->nce_fp_mp = (fp_mp? fp_mp : NULL);
3867 	nce->nce_res_mp = template;
3868 	if (state == ND_REACHABLE)
3869 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3870 	else
3871 		nce->nce_last = 0;
3872 	nce->nce_qd_mp = NULL;
3873 	nce->nce_mp = mp;
3874 	if (hw_addr != NULL)
3875 		nce_set_ll(nce, hw_addr);
3876 	/* This one is for nce getting created */
3877 	nce->nce_refcnt = 1;
3878 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3879 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3880 
3881 #ifdef NCE_DEBUG
3882 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
3883 #endif
3884 	/*
3885 	 * Atomically ensure that the ill is not CONDEMNED, before
3886 	 * adding the NCE.
3887 	 */
3888 	mutex_enter(&ill->ill_lock);
3889 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3890 		mutex_exit(&ill->ill_lock);
3891 		freeb(mp);
3892 		if (res_mp == NULL) {
3893 			/*
3894 			 * template was locally allocated. need to free it.
3895 			 */
3896 			freeb(template);
3897 		}
3898 		return (EINVAL);
3899 	}
3900 	if ((nce->nce_next = *ncep) != NULL)
3901 		nce->nce_next->nce_ptpn = &nce->nce_next;
3902 	*ncep = nce;
3903 	nce->nce_ptpn = ncep;
3904 	*newnce = nce;
3905 	/* This one is for nce being used by an active thread */
3906 	NCE_REFHOLD(*newnce);
3907 
3908 	/* Bump up the number of nce's referencing this ill */
3909 	ill->ill_nce_cnt++;
3910 	mutex_exit(&ill->ill_lock);
3911 	return (0);
3912 }
3913 
3914 void
3915 ndp_flush_qd_mp(nce_t *nce)
3916 {
3917 	mblk_t *qd_mp, *qd_next;
3918 
3919 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3920 	qd_mp = nce->nce_qd_mp;
3921 	nce->nce_qd_mp = NULL;
3922 	while (qd_mp != NULL) {
3923 		qd_next = qd_mp->b_next;
3924 		qd_mp->b_next = NULL;
3925 		qd_mp->b_prev = NULL;
3926 		freemsg(qd_mp);
3927 		qd_mp = qd_next;
3928 	}
3929 }
3930 
3931 nce_t *
3932 nce_reinit(nce_t *nce)
3933 {
3934 	nce_t *newnce = NULL;
3935 	in_addr_t nce_addr, nce_mask;
3936 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3937 
3938 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3939 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_mask, nce_mask);
3940 	/*
3941 	 * delete the old one. this will get rid of any ire's pointing
3942 	 * at this nce.
3943 	 */
3944 	ndp_delete(nce);
3945 	/*
3946 	 * create a new nce with the same addr and mask.
3947 	 */
3948 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3949 	(void) ndp_add_v4(nce->nce_ill, NULL, &nce_addr, &nce_mask, NULL, 0, 0,
3950 	    ND_INITIAL, &newnce, NULL, NULL);
3951 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3952 	/*
3953 	 * refrele the old nce.
3954 	 */
3955 	NCE_REFRELE(nce);
3956 	return (newnce);
3957 }
3958 
3959 /*
3960  * ndp_walk routine to delete all entries that have a given destination or
3961  * gateway address and cached link layer (MAC) address.  This is used when ARP
3962  * informs us that a network-to-link-layer mapping may have changed.
3963  */
3964 void
3965 nce_delete_hw_changed(nce_t *nce, void *arg)
3966 {
3967 	nce_hw_map_t *hwm = arg;
3968 	mblk_t *mp;
3969 	dl_unitdata_req_t *dlu;
3970 	uchar_t *macaddr;
3971 	ill_t *ill;
3972 	int saplen;
3973 	ipaddr_t nce_addr;
3974 
3975 	if (nce->nce_state != ND_REACHABLE)
3976 		return;
3977 
3978 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3979 	if (nce_addr != hwm->hwm_addr)
3980 		return;
3981 
3982 	mutex_enter(&nce->nce_lock);
3983 	if ((mp = nce->nce_res_mp) == NULL) {
3984 		mutex_exit(&nce->nce_lock);
3985 		return;
3986 	}
3987 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3988 	macaddr = (uchar_t *)(dlu + 1);
3989 	ill = nce->nce_ill;
3990 	if ((saplen = ill->ill_sap_length) > 0)
3991 		macaddr += saplen;
3992 	else
3993 		saplen = -saplen;
3994 
3995 	/*
3996 	 * If the hardware address is unchanged, then leave this one alone.
3997 	 * Note that saplen == abs(saplen) now.
3998 	 */
3999 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
4000 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
4001 		mutex_exit(&nce->nce_lock);
4002 		return;
4003 	}
4004 	mutex_exit(&nce->nce_lock);
4005 
4006 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
4007 	ndp_delete(nce);
4008 }
4009 
4010 /*
4011  * This function verifies whether a given IPv4 address is potentially known to
4012  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
4013  * so that it can continue to look for hardware changes on that address.
4014  */
4015 boolean_t
4016 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
4017 {
4018 	nce_t		*nce;
4019 	struct in_addr	nceaddr;
4020 	ip_stack_t	*ipst = ns->netstack_ip;
4021 
4022 	if (addr == INADDR_ANY)
4023 		return (B_FALSE);
4024 
4025 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
4026 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
4027 	for (; nce != NULL; nce = nce->nce_next) {
4028 		/* Note that only v4 mapped entries are in the table. */
4029 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
4030 		if (addr == nceaddr.s_addr &&
4031 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
4032 			/* Single flag check; no lock needed */
4033 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
4034 				break;
4035 		}
4036 	}
4037 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4038 	return (nce != NULL);
4039 }
4040