xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ndp.c (revision 618b6b99eb6eee4272ca949f5ac45efb4425f02c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ipclassifier.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_rts.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_ndp.h>
66 #include <inet/ipsec_impl.h>
67 #include <inet/ipsec_info.h>
68 #include <inet/sctp_ip.h>
69 
70 /*
 * Function names with the nce_ prefix are static, while function
 * names with the ndp_ prefix are used by the rest of IP.
73  *
74  * Lock ordering:
75  *
76  *	ndp_g_lock -> ill_lock -> nce_lock
77  *
78  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
79  * nce_next.  Nce_lock protects the contents of the NCE (particularly
80  * nce_refcnt).
81  */
82 
/*
 * Forward declarations for helpers that are referenced before their
 * definitions appear in this file, plus the externally defined
 * th_trace_rrecord().
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
    ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
extern void	th_trace_rrecord(th_trace_t *);
static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
    nce_t **, nce_t *);

106 /*
107  * We track the time of creation of the nce in the  nce_init_time field
108  * of IPv4 nce_t entries. If an nce is stuck in the ND_INITIAL state for
109  * more than NCE_STUCK_TIMEOUT milliseconds, trigger the nce-stuck dtrace
110  * probe to assist in debugging. This probe will be fired from
111  * nce_thread_exit() for debug kernels, and from nce_report1() when
112  * 'ndd -get /dev/ip ip_ndp_cache_report' is invoked on both debug and
113  * non-debug kernels.
114  */
115 #define	NCE_STUCK_TIMEOUT	120000
116 
117 #ifdef NCE_DEBUG
118 void	nce_trace_inactive(nce_t *);
119 #endif
120 
/*
 * Yield the address of the hash bucket (a slot of nce_hash_tbl) that
 * 'addr' maps to in the per-stack IPv4 NCE table.  'addr' is hashed with
 * IRE_ADDR_HASH over NCE_TABLE_SIZE buckets.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Same as NCE_HASH_PTR_V4, but for the per-stack IPv6 NCE table, hashing
 * 'addr' with NCE_ADDR_HASH_V6.
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))

128 /*
129  * Compute default flags to use for an advertisement of this nce's address.
130  */
131 static int
132 nce_advert_flags(const nce_t *nce)
133 {
134 	int flag = 0;
135 
136 	if (nce->nce_flags & NCE_F_ISROUTER)
137 		flag |= NDP_ISROUTER;
138 	if (!(nce->nce_flags & NCE_F_PROXY))
139 		flag |= NDP_ORIDE;
140 	return (flag);
141 }
142 
/*
 * Non-tunable probe interval, based on link capabilities: 150 when
 * ill_note_link is set on the ill and 1500 otherwise.  The value is handed
 * to NDP_RESTART_TIMER by the DAD probe code in this file (presumably in
 * milliseconds -- confirm against NDP_RESTART_TIMER).
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
145 
146 /*
147  * NDP Cache Entry creation routine.
148  * Mapped entries will never do NUD .
149  * This routine must always be called with ndp6->ndp_g_lock held.
150  * Prior to return, nce_refcnt is incremented.
151  */
152 int
153 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
154     const in6_addr_t *mask, const in6_addr_t *extract_mask,
155     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
156     nce_t **newnce)
157 {
158 	static	nce_t		nce_nil;
159 	nce_t		*nce;
160 	mblk_t		*mp;
161 	mblk_t		*template;
162 	nce_t		**ncep;
163 	int		err;
164 	boolean_t	dropped = B_FALSE;
165 	ip_stack_t	*ipst = ill->ill_ipst;
166 
167 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
168 	ASSERT(ill != NULL && ill->ill_isv6);
169 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
170 		ip0dbg(("ndp_add_v6: no addr\n"));
171 		return (EINVAL);
172 	}
173 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
174 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
175 		return (EINVAL);
176 	}
177 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
178 	    (flags & NCE_F_MAPPING)) {
179 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
180 		return (EINVAL);
181 	}
182 	/*
183 	 * Allocate the mblk to hold the nce.
184 	 *
185 	 * XXX This can come out of a separate cache - nce_cache.
186 	 * We don't need the mp anymore as there are no more
187 	 * "qwriter"s
188 	 */
189 	mp = allocb(sizeof (nce_t), BPRI_MED);
190 	if (mp == NULL)
191 		return (ENOMEM);
192 
193 	nce = (nce_t *)mp->b_rptr;
194 	mp->b_wptr = (uchar_t *)&nce[1];
195 	*nce = nce_nil;
196 
197 	/*
198 	 * This one holds link layer address
199 	 */
200 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
201 		template = nce_udreq_alloc(ill);
202 	} else {
203 		if (ill->ill_resolver_mp == NULL) {
204 			freeb(mp);
205 			return (EINVAL);
206 		}
207 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
208 		template = copyb(ill->ill_resolver_mp);
209 	}
210 	if (template == NULL) {
211 		freeb(mp);
212 		return (ENOMEM);
213 	}
214 	nce->nce_ill = ill;
215 	nce->nce_ipversion = IPV6_VERSION;
216 	nce->nce_flags = flags;
217 	nce->nce_state = state;
218 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
219 	nce->nce_rcnt = ill->ill_xmit_count;
220 	nce->nce_addr = *addr;
221 	nce->nce_mask = *mask;
222 	nce->nce_extract_mask = *extract_mask;
223 	nce->nce_ll_extract_start = hw_extract_start;
224 	nce->nce_fp_mp = NULL;
225 	nce->nce_res_mp = template;
226 	if (state == ND_REACHABLE)
227 		nce->nce_last = TICK_TO_MSEC(lbolt64);
228 	else
229 		nce->nce_last = 0;
230 	nce->nce_qd_mp = NULL;
231 	nce->nce_mp = mp;
232 	if (hw_addr != NULL)
233 		nce_set_ll(nce, hw_addr);
234 	/* This one is for nce getting created */
235 	nce->nce_refcnt = 1;
236 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
237 	if (nce->nce_flags & NCE_F_MAPPING) {
238 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
239 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
240 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
241 		ncep = &ipst->ips_ndp6->nce_mask_entries;
242 	} else {
243 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
244 	}
245 
246 #ifdef NCE_DEBUG
247 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
248 #endif
249 	/*
250 	 * Atomically ensure that the ill is not CONDEMNED, before
251 	 * adding the NCE.
252 	 */
253 	mutex_enter(&ill->ill_lock);
254 	if (ill->ill_state_flags & ILL_CONDEMNED) {
255 		mutex_exit(&ill->ill_lock);
256 		freeb(mp);
257 		freeb(template);
258 		return (EINVAL);
259 	}
260 	if ((nce->nce_next = *ncep) != NULL)
261 		nce->nce_next->nce_ptpn = &nce->nce_next;
262 	*ncep = nce;
263 	nce->nce_ptpn = ncep;
264 	*newnce = nce;
265 	/* This one is for nce being used by an active thread */
266 	NCE_REFHOLD(*newnce);
267 
268 	/* Bump up the number of nce's referencing this ill */
269 	ill->ill_nce_cnt++;
270 	mutex_exit(&ill->ill_lock);
271 
272 	err = 0;
273 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
274 		mutex_enter(&nce->nce_lock);
275 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
276 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
277 		mutex_exit(&nce->nce_lock);
278 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
279 		    &ipv6_all_zeros, addr, NDP_PROBE);
280 		if (dropped) {
281 			mutex_enter(&nce->nce_lock);
282 			nce->nce_pcnt++;
283 			mutex_exit(&nce->nce_lock);
284 		}
285 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
286 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
287 		err = EINPROGRESS;
288 	} else if (flags & NCE_F_UNSOL_ADV) {
289 		/*
290 		 * We account for the transmit below by assigning one
291 		 * less than the ndd variable. Subsequent decrements
292 		 * are done in ndp_timer.
293 		 */
294 		mutex_enter(&nce->nce_lock);
295 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
296 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
297 		mutex_exit(&nce->nce_lock);
298 		dropped = nce_xmit(ill,
299 		    ND_NEIGHBOR_ADVERT,
300 		    ill,	/* ill to be used for extracting ill_nd_lla */
301 		    B_TRUE,	/* use ill_nd_lla */
302 		    addr,	/* Source and target of the advertisement pkt */
303 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
304 		    nce_advert_flags(nce));
305 		mutex_enter(&nce->nce_lock);
306 		if (dropped)
307 			nce->nce_unsolicit_count++;
308 		if (nce->nce_unsolicit_count != 0) {
309 			nce->nce_timeout_id = timeout(ndp_timer, nce,
310 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
311 		}
312 		mutex_exit(&nce->nce_lock);
313 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
314 	}
315 	/*
316 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
317 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
318 	 * We call nce_fastpath from nce_update if the link layer address of
319 	 * the peer changes from nce_update
320 	 */
321 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
322 		nce_fastpath(nce);
323 	return (err);
324 }
325 
326 int
327 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
328     const in6_addr_t *mask, const in6_addr_t *extract_mask,
329     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
330     nce_t **newnce)
331 {
332 	int	err = 0;
333 	nce_t	*nce;
334 	ip_stack_t	*ipst = ill->ill_ipst;
335 
336 	ASSERT(ill->ill_isv6);
337 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
338 
339 	/* Get head of v6 hash table */
340 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
341 	nce = nce_lookup_addr(ill, addr, nce);
342 	if (nce == NULL) {
343 		err = ndp_add_v6(ill,
344 		    hw_addr,
345 		    addr,
346 		    mask,
347 		    extract_mask,
348 		    hw_extract_start,
349 		    flags,
350 		    state,
351 		    newnce);
352 	} else {
353 		*newnce = nce;
354 		err = EEXIST;
355 	}
356 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
357 	return (err);
358 }
359 
360 /*
361  * Remove all the CONDEMNED nces from the appropriate hash table.
362  * We create a private list of NCEs, these may have ires pointing
363  * to them, so the list will be passed through to clean up dependent
364  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
365  */
366 static void
367 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
368 {
369 	nce_t *nce1;
370 	nce_t **ptpn;
371 
372 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
373 	ASSERT(ndp->ndp_g_walker == 0);
374 	for (; nce; nce = nce1) {
375 		nce1 = nce->nce_next;
376 		mutex_enter(&nce->nce_lock);
377 		if (nce->nce_flags & NCE_F_CONDEMNED) {
378 			ptpn = nce->nce_ptpn;
379 			nce1 = nce->nce_next;
380 			if (nce1 != NULL)
381 				nce1->nce_ptpn = ptpn;
382 			*ptpn = nce1;
383 			nce->nce_ptpn = NULL;
384 			nce->nce_next = NULL;
385 			nce->nce_next = *free_nce_list;
386 			*free_nce_list = nce;
387 		}
388 		mutex_exit(&nce->nce_lock);
389 	}
390 }
391 
/*
 * Delete an nce (IPv4 or IPv6, selected by nce_ipversion).
 *
 * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
 *    will return this NCE. Also no new IREs will be created that
 *    point to this NCE (See ire_add_v6).  Also no new timeouts will
 *    be started (See NDP_RESTART_TIMER).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 * 5. Delete all IREs pointing to this NCE.
 *
 * The caller must hold its own reference on the nce.  If another thread
 * has already marked the nce CONDEMNED, this call returns without doing
 * anything.
 */
void
ndp_delete(nce_t *nce)
{
	nce_t	**ptpn;
	nce_t	*nce1;
	int	ipversion = nce->nce_ipversion;
	ndp_g_t *ndp;
	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;

	/* Pick the per-stack table the nce belongs to */
	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&nce->nce_lock);
	if (nce->nce_flags & NCE_F_CONDEMNED) {
		/* Some other thread is doing the delete */
		mutex_exit(&nce->nce_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(nce->nce_refcnt >= 2);
	nce->nce_flags |= NCE_F_CONDEMNED;
	mutex_exit(&nce->nce_lock);

	nce_fastpath_list_delete(nce);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (nce->nce_timeout_id != 0) {
		(void) untimeout(nce->nce_timeout_id);
		nce->nce_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (nce->nce_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this nce from
		 * the list after we marked the nce CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = nce->nce_ptpn;
	nce1 = nce->nce_next;
	if (nce1 != NULL)
		nce1->nce_ptpn = ptpn;
	*ptpn = nce1;
	nce->nce_ptpn = NULL;
	nce->nce_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/*
	 * Deletes the IRE_CACHE entries that use this nce and then drops
	 * one reference on the nce (see nce_ire_delete).
	 */
	nce_ire_delete(nce);
}
478 
/*
 * Final teardown of an nce whose reference count has reached zero.
 *
 * Expects to be entered with nce_lock held, nce_refcnt == 0 and
 * nce_fastpath == NULL (all asserted).  Frees the mblks hanging off the
 * nce, drops the owning ill's nce count, destroys nce_lock and finally
 * frees nce_mp.  For entries built by ndp_add_v6() the nce itself lives
 * inside nce_mp, so the nce must not be touched after this returns.
 */
void
ndp_inactive(nce_t *nce)
{
	mblk_t		**mpp;
	ill_t		*ill;

	ASSERT(nce->nce_refcnt == 0);
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ASSERT(nce->nce_fastpath == NULL);

	/*
	 * Free all nce allocated messages: step through every mblk pointer
	 * slot from nce_first_mp_to_free up to and including
	 * nce_last_mp_to_free, and for each slot free the whole b_next
	 * chain hanging off it.  (Presumably these two names bracket a
	 * contiguous run of mblk_t * fields in nce_t -- confirm against the
	 * nce_t definition.)
	 */
	mpp = &nce->nce_first_mp_to_free;
	do {
		while (*mpp != NULL) {
			mblk_t  *mp;

			mp = *mpp;
			*mpp = mp->b_next;

			inet_freemsg(mp);
		}
	} while (mpp++ != &nce->nce_last_mp_to_free);

#ifdef NCE_DEBUG
	nce_trace_inactive(nce);
#endif

	ill = nce->nce_ill;
	mutex_enter(&ill->ill_lock);
	ill->ill_nce_cnt--;
	/*
	 * If the number of nce's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ill->ill_nce_cnt == 0) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}
	/* The lock is destroyed while still held by this thread */
	mutex_destroy(&nce->nce_lock);
	if (nce->nce_mp != NULL)
		inet_freemsg(nce->nce_mp);
}
524 
525 /*
526  * ndp_walk routine.  Delete the nce if it is associated with the ill
527  * that is going away.  Always called as a writer.
528  */
529 void
530 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
531 {
532 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
533 		ndp_delete(nce);
534 	}
535 }
536 
537 /*
538  * Walk a list of to be inactive NCEs and blow away all the ires.
539  */
540 static void
541 nce_ire_delete_list(nce_t *nce)
542 {
543 	nce_t *nce_next;
544 
545 	ASSERT(nce != NULL);
546 	while (nce != NULL) {
547 		nce_next = nce->nce_next;
548 		nce->nce_next = NULL;
549 
550 		/*
551 		 * It is possible for the last ndp walker (this thread)
552 		 * to come here after ndp_delete has marked the nce CONDEMNED
553 		 * and before it has removed the nce from the fastpath list
554 		 * or called untimeout. So we need to do it here. It is safe
555 		 * for both ndp_delete and this thread to do it twice or
556 		 * even simultaneously since each of the threads has a
557 		 * reference on the nce.
558 		 */
559 		nce_fastpath_list_delete(nce);
560 		/*
561 		 * Cancel any running timer. Timeout can't be restarted
562 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
563 		 * Passing invalid timeout id is fine.
564 		 */
565 		if (nce->nce_timeout_id != 0) {
566 			(void) untimeout(nce->nce_timeout_id);
567 			nce->nce_timeout_id = 0;
568 		}
569 		/*
570 		 * We might hit this func thus in the v4 case:
571 		 * ipif_down->ipif_ndp_down->ndp_walk
572 		 */
573 
574 		if (nce->nce_ipversion == IPV4_VERSION) {
575 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
576 			    IRE_CACHE, nce_ire_delete1,
577 			    (char *)nce, nce->nce_ill);
578 		} else {
579 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
580 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
581 			    IRE_CACHE, nce_ire_delete1,
582 			    (char *)nce, nce->nce_ill);
583 		}
584 		NCE_REFRELE_NOTR(nce);
585 		nce = nce_next;
586 	}
587 }
588 
589 /*
590  * Delete an ire when the nce goes away.
591  */
592 /* ARGSUSED */
593 static void
594 nce_ire_delete(nce_t *nce)
595 {
596 	if (nce->nce_ipversion == IPV6_VERSION) {
597 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
598 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
599 		NCE_REFRELE_NOTR(nce);
600 	} else {
601 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
602 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
603 		NCE_REFRELE_NOTR(nce);
604 	}
605 }
606 
607 /*
608  * ire_walk routine used to delete every IRE that shares this nce
609  */
610 static void
611 nce_ire_delete1(ire_t *ire, char *nce_arg)
612 {
613 	nce_t	*nce = (nce_t *)nce_arg;
614 
615 	ASSERT(ire->ire_type == IRE_CACHE);
616 
617 	if (ire->ire_nce == nce) {
618 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
619 		ire_delete(ire);
620 	}
621 }
622 
623 /*
624  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
625  */
626 boolean_t
627 ndp_restart_dad(nce_t *nce)
628 {
629 	boolean_t started;
630 	boolean_t dropped;
631 
632 	if (nce == NULL)
633 		return (B_FALSE);
634 	mutex_enter(&nce->nce_lock);
635 	if (nce->nce_state == ND_PROBE) {
636 		mutex_exit(&nce->nce_lock);
637 		started = B_TRUE;
638 	} else if (nce->nce_state == ND_REACHABLE) {
639 		nce->nce_state = ND_PROBE;
640 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
641 		mutex_exit(&nce->nce_lock);
642 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
643 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
644 		if (dropped) {
645 			mutex_enter(&nce->nce_lock);
646 			nce->nce_pcnt++;
647 			mutex_exit(&nce->nce_lock);
648 		}
649 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
650 		started = B_TRUE;
651 	} else {
652 		mutex_exit(&nce->nce_lock);
653 		started = B_FALSE;
654 	}
655 	return (started);
656 }
657 
658 /*
659  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
660  * If one is found, the refcnt on the nce will be incremented.
661  */
662 nce_t *
663 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
664 {
665 	nce_t	*nce;
666 	ip_stack_t	*ipst;
667 
668 	ASSERT(ill != NULL);
669 	ipst = ill->ill_ipst;
670 
671 	ASSERT(ill != NULL && ill->ill_isv6);
672 	if (!caller_holds_lock) {
673 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
674 	}
675 
676 	/* Get head of v6 hash table */
677 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
678 	nce = nce_lookup_addr(ill, addr, nce);
679 	if (nce == NULL)
680 		nce = nce_lookup_mapping(ill, addr);
681 	if (!caller_holds_lock)
682 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
683 	return (nce);
684 }
685 /*
686  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
687  * If one is found, the refcnt on the nce will be incremented.
688  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
689  * so we skip the nce_lookup_mapping call.
690  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
691  */
692 nce_t *
693 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
694 {
695 	nce_t	*nce;
696 	in6_addr_t addr6;
697 	ip_stack_t *ipst = ill->ill_ipst;
698 
699 	if (!caller_holds_lock) {
700 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
701 	}
702 
703 	/* Get head of v4 hash table */
704 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
705 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
706 	nce = nce_lookup_addr(ill, &addr6, nce);
707 	if (!caller_holds_lock)
708 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
709 	return (nce);
710 }
711 
712 /*
713  * Cache entry lookup.  Try to find an nce matching the parameters passed.
714  * Look only for exact entries (no mappings).  If an nce is found, increment
715  * the hold count on that nce. The caller passes in the start of the
716  * appropriate hash table, and must be holding the appropriate global
717  * lock (ndp_g_lock).
718  */
719 static nce_t *
720 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
721 {
722 	ndp_g_t		*ndp;
723 	ip_stack_t	*ipst = ill->ill_ipst;
724 
725 	if (ill->ill_isv6)
726 		ndp = ipst->ips_ndp6;
727 	else
728 		ndp = ipst->ips_ndp4;
729 
730 	ASSERT(ill != NULL);
731 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
732 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
733 		return (NULL);
734 	for (; nce != NULL; nce = nce->nce_next) {
735 		if (nce->nce_ill == ill) {
736 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
737 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
738 			    &ipv6_all_ones)) {
739 				mutex_enter(&nce->nce_lock);
740 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
741 					NCE_REFHOLD_LOCKED(nce);
742 					mutex_exit(&nce->nce_lock);
743 					break;
744 				}
745 				mutex_exit(&nce->nce_lock);
746 			}
747 		}
748 	}
749 	return (nce);
750 }
751 
752 /*
753  * Cache entry lookup.  Try to find an nce matching the parameters passed.
754  * Look only for mappings.
755  */
756 static nce_t *
757 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
758 {
759 	nce_t	*nce;
760 	ip_stack_t	*ipst = ill->ill_ipst;
761 
762 	ASSERT(ill != NULL && ill->ill_isv6);
763 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
764 	if (!IN6_IS_ADDR_MULTICAST(addr))
765 		return (NULL);
766 	nce = ipst->ips_ndp6->nce_mask_entries;
767 	for (; nce != NULL; nce = nce->nce_next)
768 		if (nce->nce_ill == ill &&
769 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
770 			mutex_enter(&nce->nce_lock);
771 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
772 				NCE_REFHOLD_LOCKED(nce);
773 				mutex_exit(&nce->nce_lock);
774 				break;
775 			}
776 			mutex_exit(&nce->nce_lock);
777 		}
778 	return (nce);
779 }
780 
781 /*
782  * Process passed in parameters either from an incoming packet or via
783  * user ioctl.
784  */
785 void
786 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
787 {
788 	ill_t	*ill = nce->nce_ill;
789 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
790 	mblk_t	*mp;
791 	boolean_t ll_updated = B_FALSE;
792 	boolean_t ll_changed;
793 	ip_stack_t	*ipst = ill->ill_ipst;
794 
795 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
796 	/*
797 	 * No updates of link layer address or the neighbor state is
798 	 * allowed, when the cache is in NONUD state.  This still
799 	 * allows for responding to reachability solicitation.
800 	 */
801 	mutex_enter(&nce->nce_lock);
802 	if (nce->nce_state == ND_INCOMPLETE) {
803 		if (hw_addr == NULL) {
804 			mutex_exit(&nce->nce_lock);
805 			return;
806 		}
807 		nce_set_ll(nce, hw_addr);
808 		/*
809 		 * Update nce state and send the queued packets
810 		 * back to ip this time ire will be added.
811 		 */
812 		if (flag & ND_NA_FLAG_SOLICITED) {
813 			nce_update(nce, ND_REACHABLE, NULL);
814 		} else {
815 			nce_update(nce, ND_STALE, NULL);
816 		}
817 		mutex_exit(&nce->nce_lock);
818 		nce_fastpath(nce);
819 		mutex_enter(&nce->nce_lock);
820 		mp = nce->nce_qd_mp;
821 		nce->nce_qd_mp = NULL;
822 		mutex_exit(&nce->nce_lock);
823 		while (mp != NULL) {
824 			mblk_t *nxt_mp, *data_mp;
825 
826 			nxt_mp = mp->b_next;
827 			mp->b_next = NULL;
828 
829 			if (mp->b_datap->db_type == M_CTL)
830 				data_mp = mp->b_cont;
831 			else
832 				data_mp = mp;
833 			if (data_mp->b_prev != NULL) {
834 				ill_t   *inbound_ill;
835 				queue_t *fwdq = NULL;
836 				uint_t ifindex;
837 
838 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
839 				inbound_ill = ill_lookup_on_ifindex(ifindex,
840 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
841 				if (inbound_ill == NULL) {
842 					data_mp->b_prev = NULL;
843 					freemsg(mp);
844 					return;
845 				} else {
846 					fwdq = inbound_ill->ill_rq;
847 				}
848 				data_mp->b_prev = NULL;
849 				/*
850 				 * Send a forwarded packet back into ip_rput_v6
851 				 * just as in ire_send_v6().
852 				 * Extract the queue from b_prev (set in
853 				 * ip_rput_data_v6).
854 				 */
855 				if (fwdq != NULL) {
856 					/*
857 					 * Forwarded packets hop count will
858 					 * get decremented in ip_rput_data_v6
859 					 */
860 					if (data_mp != mp)
861 						freeb(mp);
862 					put(fwdq, data_mp);
863 				} else {
864 					/*
865 					 * Send locally originated packets back
866 					 * into * ip_wput_v6.
867 					 */
868 					put(ill->ill_wq, mp);
869 				}
870 				ill_refrele(inbound_ill);
871 			} else {
872 				put(ill->ill_wq, mp);
873 			}
874 			mp = nxt_mp;
875 		}
876 		return;
877 	}
878 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
879 	if (!is_adv) {
880 		/* If this is a SOLICITATION request only */
881 		if (ll_changed)
882 			nce_update(nce, ND_STALE, hw_addr);
883 		mutex_exit(&nce->nce_lock);
884 		return;
885 	}
886 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
887 		/* If in any other state than REACHABLE, ignore */
888 		if (nce->nce_state == ND_REACHABLE) {
889 			nce_update(nce, ND_STALE, NULL);
890 		}
891 		mutex_exit(&nce->nce_lock);
892 		return;
893 	} else {
894 		if (ll_changed) {
895 			nce_update(nce, ND_UNCHANGED, hw_addr);
896 			ll_updated = B_TRUE;
897 		}
898 		if (flag & ND_NA_FLAG_SOLICITED) {
899 			nce_update(nce, ND_REACHABLE, NULL);
900 		} else {
901 			if (ll_updated) {
902 				nce_update(nce, ND_STALE, NULL);
903 			}
904 		}
905 		mutex_exit(&nce->nce_lock);
906 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
907 		    NCE_F_ISROUTER)) {
908 			ire_t *ire;
909 
910 			/*
911 			 * Router turned to host.  We need to remove the
912 			 * entry as well as any default route that may be
913 			 * using this as a next hop.  This is required by
914 			 * section 7.2.5 of RFC 2461.
915 			 */
916 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
917 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
918 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
919 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
920 			    MATCH_IRE_DEFAULT, ipst);
921 			if (ire != NULL) {
922 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
923 				ire_delete(ire);
924 				ire_refrele(ire);
925 			}
926 			ndp_delete(nce);
927 		}
928 	}
929 }
930 
/*
 * Pass arg1 to the pfi supplied, along with each nce in existence.
 * ndp_walk() places a REFHOLD on the nce and drops the lock when
 * walking the hash list.
 *
 *	ndp	the table (v4 or v6) to walk.
 *	ill	when non-NULL, only nces on this ill are visited.
 *	pfi	callback, invoked as (*pfi)(nce, arg1).
 *	arg1	opaque argument handed to the callback.
 *	trace	selects the tracing (NCE_REFHOLD/NCE_REFRELE) or the
 *		non-tracing (_NOTR) reference macros around the callback.
 *
 * The walk itself runs without ndp_g_lock; instead ndp_g_walker is raised
 * so that ndp_delete() will not unlink entries under the walker.  Once the
 * last walker is done and ndp_g_walker_cleanup was requested, CONDEMNED
 * entries are unlinked here and then disposed of with
 * nce_ire_delete_list().
 */
void
ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
    boolean_t trace)
{

	nce_t	*nce;
	nce_t	*nce1;
	nce_t	**ncep;
	nce_t	*free_nce_list = NULL;

	mutex_enter(&ndp->ndp_g_lock);
	/* Prevent ndp_delete from unlink and free of NCE */
	ndp->ndp_g_walker++;
	mutex_exit(&ndp->ndp_g_lock);
	/* First every hash bucket ... */
	for (ncep = ndp->nce_hash_tbl;
	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
		for (nce = *ncep; nce != NULL; nce = nce1) {
			/* Fetch the successor before the callback runs */
			nce1 = nce->nce_next;
			if (ill == NULL || nce->nce_ill == ill) {
				if (trace) {
					NCE_REFHOLD(nce);
					(*pfi)(nce, arg1);
					NCE_REFRELE(nce);
				} else {
					NCE_REFHOLD_NOTR(nce);
					(*pfi)(nce, arg1);
					NCE_REFRELE_NOTR(nce);
				}
			}
		}
	}
	/* ... then the list of mapping entries */
	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
		nce1 = nce->nce_next;
		if (ill == NULL || nce->nce_ill == ill) {
			if (trace) {
				NCE_REFHOLD(nce);
				(*pfi)(nce, arg1);
				NCE_REFRELE(nce);
			} else {
				NCE_REFHOLD_NOTR(nce);
				(*pfi)(nce, arg1);
				NCE_REFRELE_NOTR(nce);
			}
		}
	}
	mutex_enter(&ndp->ndp_g_lock);
	ndp->ndp_g_walker--;
	/*
	 * While NCE's are removed from global list they are placed
	 * in a private list, to be passed to nce_ire_delete_list().
	 * The reason is, there may be ires pointing to this nce
	 * which needs to cleaned up.
	 */
	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
		/* Time to delete condemned entries */
		for (ncep = ndp->nce_hash_tbl;
		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
			nce = *ncep;
			if (nce != NULL) {
				nce_remove(ndp, nce, &free_nce_list);
			}
		}
		nce = ndp->nce_mask_entries;
		if (nce != NULL) {
			nce_remove(ndp, nce, &free_nce_list);
		}
		ndp->ndp_g_walker_cleanup = B_FALSE;
	}

	mutex_exit(&ndp->ndp_g_lock);

	/* Dispose of the unlinked entries without holding ndp_g_lock */
	if (free_nce_list != NULL) {
		nce_ire_delete_list(free_nce_list);
	}
}
1011 
1012 /*
1013  * Walk everything.
1014  * Note that ill can be NULL hence can't derive the ipst from it.
1015  */
1016 void
1017 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1018 {
1019 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1020 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1021 }
1022 
1023 /*
1024  * Process resolve requests.  Handles both mapped entries
1025  * as well as cases that needs to be send out on the wire.
1026  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1027  * or one is created, we defer making ire point to nce until the
1028  * ire is actually added at which point the nce_refcnt on the nce is
1029  * incremented.  This is done primarily to have symmetry between ire_add()
1030  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1031  */
1032 int
1033 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1034 {
1035 	nce_t		*nce;
1036 	int		err = 0;
1037 	uint32_t	ms;
1038 	mblk_t		*mp_nce = NULL;
1039 	ip_stack_t	*ipst = ill->ill_ipst;
1040 
1041 	ASSERT(ill->ill_isv6);
1042 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1043 		err = nce_set_multicast(ill, dst);
1044 		return (err);
1045 	}
1046 	err = ndp_lookup_then_add_v6(ill,
1047 	    NULL,	/* No hardware address */
1048 	    dst,
1049 	    &ipv6_all_ones,
1050 	    &ipv6_all_zeros,
1051 	    0,
1052 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1053 	    ND_INCOMPLETE,
1054 	    &nce);
1055 
1056 	switch (err) {
1057 	case 0:
1058 		/*
1059 		 * New cache entry was created. Make sure that the state
1060 		 * is not ND_INCOMPLETE. It can be in some other state
1061 		 * even before we send out the solicitation as we could
1062 		 * get un-solicited advertisements.
1063 		 *
1064 		 * If this is an XRESOLV interface, simply return 0,
1065 		 * since we don't want to solicit just yet.
1066 		 */
1067 		if (ill->ill_flags & ILLF_XRESOLV) {
1068 			NCE_REFRELE(nce);
1069 			return (0);
1070 		}
1071 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1072 		mutex_enter(&nce->nce_lock);
1073 		if (nce->nce_state != ND_INCOMPLETE) {
1074 			mutex_exit(&nce->nce_lock);
1075 			rw_exit(&ipst->ips_ill_g_lock);
1076 			NCE_REFRELE(nce);
1077 			return (0);
1078 		}
1079 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1080 		if (mp_nce == NULL) {
1081 			/* The caller will free mp */
1082 			mutex_exit(&nce->nce_lock);
1083 			rw_exit(&ipst->ips_ill_g_lock);
1084 			ndp_delete(nce);
1085 			NCE_REFRELE(nce);
1086 			return (ENOMEM);
1087 		}
1088 		ms = nce_solicit(nce, mp_nce);
1089 		rw_exit(&ipst->ips_ill_g_lock);
1090 		if (ms == 0) {
1091 			/* The caller will free mp */
1092 			if (mp_nce != mp)
1093 				freeb(mp_nce);
1094 			mutex_exit(&nce->nce_lock);
1095 			ndp_delete(nce);
1096 			NCE_REFRELE(nce);
1097 			return (EBUSY);
1098 		}
1099 		mutex_exit(&nce->nce_lock);
1100 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1101 		NCE_REFRELE(nce);
1102 		return (EINPROGRESS);
1103 	case EEXIST:
1104 		/* Resolution in progress just queue the packet */
1105 		mutex_enter(&nce->nce_lock);
1106 		if (nce->nce_state == ND_INCOMPLETE) {
1107 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1108 			if (mp_nce == NULL) {
1109 				err = ENOMEM;
1110 			} else {
1111 				nce_queue_mp(nce, mp_nce);
1112 				err = EINPROGRESS;
1113 			}
1114 		} else {
1115 			/*
1116 			 * Any other state implies we have
1117 			 * a nce but IRE needs to be added ...
1118 			 * ire_add_v6() will take care of the
1119 			 * the case when the nce becomes CONDEMNED
1120 			 * before the ire is added to the table.
1121 			 */
1122 			err = 0;
1123 		}
1124 		mutex_exit(&nce->nce_lock);
1125 		NCE_REFRELE(nce);
1126 		break;
1127 	default:
1128 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1129 		break;
1130 	}
1131 	return (err);
1132 }
1133 
1134 /*
1135  * When there is no resolver, the link layer template is passed in
1136  * the IRE.
1137  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1138  * or one is created, we defer making ire point to nce until the
1139  * ire is actually added at which point the nce_refcnt on the nce is
1140  * incremented.  This is done primarily to have symmetry between ire_add()
1141  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1142  */
1143 int
1144 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1145 {
1146 	nce_t		*nce;
1147 	int		err = 0;
1148 
1149 	ASSERT(ill != NULL);
1150 	ASSERT(ill->ill_isv6);
1151 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1152 		err = nce_set_multicast(ill, dst);
1153 		return (err);
1154 	}
1155 
1156 	err = ndp_lookup_then_add_v6(ill,
1157 	    NULL,	/* hardware address */
1158 	    dst,
1159 	    &ipv6_all_ones,
1160 	    &ipv6_all_zeros,
1161 	    0,
1162 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1163 	    ND_REACHABLE,
1164 	    &nce);
1165 
1166 	switch (err) {
1167 	case 0:
1168 		/*
1169 		 * Cache entry with a proper resolver cookie was
1170 		 * created.
1171 		 */
1172 		NCE_REFRELE(nce);
1173 		break;
1174 	case EEXIST:
1175 		err = 0;
1176 		NCE_REFRELE(nce);
1177 		break;
1178 	default:
1179 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1180 		break;
1181 	}
1182 	return (err);
1183 }
1184 
1185 /*
1186  * For each interface an entry is added for the unspecified multicast group.
1187  * Here that mapping is used to form the multicast cache entry for a particular
1188  * multicast destination.
1189  */
1190 static int
1191 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1192 {
1193 	nce_t		*mnce;	/* Multicast mapping entry */
1194 	nce_t		*nce;
1195 	uchar_t		*hw_addr = NULL;
1196 	int		err = 0;
1197 	ip_stack_t	*ipst = ill->ill_ipst;
1198 
1199 	ASSERT(ill != NULL);
1200 	ASSERT(ill->ill_isv6);
1201 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1202 
1203 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1204 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1205 	nce = nce_lookup_addr(ill, dst, nce);
1206 	if (nce != NULL) {
1207 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1208 		NCE_REFRELE(nce);
1209 		return (0);
1210 	}
1211 	/* No entry, now lookup for a mapping this should never fail */
1212 	mnce = nce_lookup_mapping(ill, dst);
1213 	if (mnce == NULL) {
1214 		/* Something broken for the interface. */
1215 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1216 		return (ESRCH);
1217 	}
1218 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1219 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1220 		/*
1221 		 * For IRE_IF_RESOLVER a hardware mapping can be
1222 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1223 		 * in the ill is copied in ndp_add_v6().
1224 		 */
1225 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1226 		if (hw_addr == NULL) {
1227 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1228 			NCE_REFRELE(mnce);
1229 			return (ENOMEM);
1230 		}
1231 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1232 	}
1233 	NCE_REFRELE(mnce);
1234 	/*
1235 	 * IRE_IF_NORESOLVER type simply copies the resolution
1236 	 * cookie passed in.  So no hw_addr is needed.
1237 	 */
1238 	err = ndp_add_v6(ill,
1239 	    hw_addr,
1240 	    dst,
1241 	    &ipv6_all_ones,
1242 	    &ipv6_all_zeros,
1243 	    0,
1244 	    NCE_F_NONUD,
1245 	    ND_REACHABLE,
1246 	    &nce);
1247 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1248 	if (hw_addr != NULL)
1249 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1250 	if (err != 0) {
1251 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1252 		return (err);
1253 	}
1254 	NCE_REFRELE(nce);
1255 	return (0);
1256 }
1257 
1258 /*
1259  * Return the link layer address, and any flags of a nce.
1260  */
1261 int
1262 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1263 {
1264 	nce_t		*nce;
1265 	in6_addr_t	*addr;
1266 	sin6_t		*sin6;
1267 	dl_unitdata_req_t	*dl;
1268 
1269 	ASSERT(ill != NULL && ill->ill_isv6);
1270 	sin6 = (sin6_t *)&lnr->lnr_addr;
1271 	addr =  &sin6->sin6_addr;
1272 
1273 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1274 	if (nce == NULL)
1275 		return (ESRCH);
1276 	/* If in INCOMPLETE state, no link layer address is available yet */
1277 	if (nce->nce_state == ND_INCOMPLETE)
1278 		goto done;
1279 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1280 	if (ill->ill_flags & ILLF_XRESOLV)
1281 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1282 	else
1283 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1284 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1285 	    sizeof (lnr->lnr_hdw_addr));
1286 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1287 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1288 	if (nce->nce_flags & NCE_F_ISROUTER)
1289 		lnr->lnr_flags = NDF_ISROUTER_ON;
1290 	if (nce->nce_flags & NCE_F_PROXY)
1291 		lnr->lnr_flags |= NDF_PROXY_ON;
1292 	if (nce->nce_flags & NCE_F_ANYCAST)
1293 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1294 done:
1295 	NCE_REFRELE(nce);
1296 	return (0);
1297 }
1298 
1299 /*
1300  * Send Enable/Disable multicast reqs to driver.
1301  */
1302 int
1303 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1304     uint32_t hw_addr_offset, mblk_t *mp)
1305 {
1306 	nce_t		*nce;
1307 	uchar_t		*hw_addr;
1308 	ip_stack_t	*ipst = ill->ill_ipst;
1309 
1310 	ASSERT(ill != NULL && ill->ill_isv6);
1311 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1312 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1313 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1314 		freemsg(mp);
1315 		return (EINVAL);
1316 	}
1317 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1318 	nce = nce_lookup_mapping(ill, addr);
1319 	if (nce == NULL) {
1320 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1321 		freemsg(mp);
1322 		return (ESRCH);
1323 	}
1324 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1325 	/*
1326 	 * Update dl_addr_length and dl_addr_offset for primitives that
1327 	 * have physical addresses as opposed to full saps
1328 	 */
1329 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1330 	case DL_ENABMULTI_REQ:
1331 		/* Track the state if this is the first enabmulti */
1332 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1333 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1334 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1335 		break;
1336 	case DL_DISABMULTI_REQ:
1337 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1338 		break;
1339 	default:
1340 		NCE_REFRELE(nce);
1341 		ip1dbg(("ndp_mcastreq: default\n"));
1342 		return (EINVAL);
1343 	}
1344 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1345 	NCE_REFRELE(nce);
1346 	ill_dlpi_send(ill, mp);
1347 	return (0);
1348 }
1349 
1350 /*
1351  * Send a neighbor solicitation.
1352  * Returns number of milliseconds after which we should either rexmit or abort.
1353  * Return of zero means we should abort.
1354  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1355  *
1356  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1357  * the packet.
1358  * NOTE: This routine does not consume mp.
1359  */
1360 uint32_t
1361 nce_solicit(nce_t *nce, mblk_t *mp)
1362 {
1363 	ill_t		*ill;
1364 	ill_t		*src_ill;
1365 	ip6_t		*ip6h;
1366 	in6_addr_t	src;
1367 	in6_addr_t	dst;
1368 	ipif_t		*ipif;
1369 	ip6i_t		*ip6i;
1370 	boolean_t	dropped = B_FALSE;
1371 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
1372 
1373 	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
1374 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1375 	ill = nce->nce_ill;
1376 	ASSERT(ill != NULL);
1377 
1378 	if (nce->nce_rcnt == 0) {
1379 		return (0);
1380 	}
1381 
1382 	if (mp == NULL) {
1383 		ASSERT(nce->nce_qd_mp != NULL);
1384 		mp = nce->nce_qd_mp;
1385 	} else {
1386 		nce_queue_mp(nce, mp);
1387 	}
1388 
1389 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1390 	if (mp->b_datap->db_type == M_CTL)
1391 		mp = mp->b_cont;
1392 
1393 	ip6h = (ip6_t *)mp->b_rptr;
1394 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1395 		/*
1396 		 * This message should have been pulled up already in
1397 		 * ip_wput_v6. We can't do pullups here because the message
1398 		 * could be from the nce_qd_mp which could have b_next/b_prev
1399 		 * non-NULL.
1400 		 */
1401 		ip6i = (ip6i_t *)ip6h;
1402 		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
1403 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
1404 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1405 	}
1406 	src = ip6h->ip6_src;
1407 	/*
1408 	 * If the src of outgoing packet is one of the assigned interface
1409 	 * addresses use it, otherwise we will pick the source address below.
1410 	 */
1411 	src_ill = ill;
1412 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1413 		if (ill->ill_group != NULL)
1414 			src_ill = ill->ill_group->illgrp_ill;
1415 		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
1416 			for (ipif = src_ill->ill_ipif; ipif != NULL;
1417 			    ipif = ipif->ipif_next) {
1418 				if (IN6_ARE_ADDR_EQUAL(&src,
1419 				    &ipif->ipif_v6lcl_addr)) {
1420 					break;
1421 				}
1422 			}
1423 			if (ipif != NULL)
1424 				break;
1425 		}
1426 		/*
1427 		 * If no relevant ipif can be found, then it's not one of our
1428 		 * addresses.  Reset to :: and let nce_xmit.  If an ipif can be
1429 		 * found, but it's not yet done with DAD verification, then
1430 		 * just postpone this transmission until later.
1431 		 */
1432 		if (src_ill == NULL)
1433 			src = ipv6_all_zeros;
1434 		else if (!ipif->ipif_addr_ready)
1435 			return (ill->ill_reachable_retrans_time);
1436 	}
1437 	dst = nce->nce_addr;
1438 	/*
1439 	 * If source address is unspecified, nce_xmit will choose
1440 	 * one for us and initialize the hardware address also
1441 	 * appropriately.
1442 	 */
1443 	if (IN6_IS_ADDR_UNSPECIFIED(&src))
1444 		src_ill = NULL;
1445 	nce->nce_rcnt--;
1446 	mutex_exit(&nce->nce_lock);
1447 	rw_exit(&ipst->ips_ill_g_lock);
1448 	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
1449 	    &dst, 0);
1450 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1451 	mutex_enter(&nce->nce_lock);
1452 	if (dropped)
1453 		nce->nce_rcnt++;
1454 	return (ill->ill_reachable_retrans_time);
1455 }
1456 
1457 /*
1458  * Attempt to recover an address on an interface that's been marked as a
1459  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1460  * no easy way to just probe the address and have the right thing happen if
1461  * it's no longer in use.  Instead, we just bring it up normally and allow the
1462  * regular interface start-up logic to probe for a remaining duplicate and take
1463  * us back down if necessary.
1464  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1465  * ip_ndp_excl.
1466  */
1467 /* ARGSUSED */
1468 static void
1469 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1470 {
1471 	ill_t	*ill = rq->q_ptr;
1472 	ipif_t	*ipif;
1473 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1474 
1475 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1476 		/*
1477 		 * We do not support recovery of proxy ARP'd interfaces,
1478 		 * because the system lacks a complete proxy ARP mechanism.
1479 		 */
1480 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1481 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1482 			continue;
1483 		}
1484 
1485 		/*
1486 		 * If we have already recovered or if the interface is going
1487 		 * away, then ignore.
1488 		 */
1489 		mutex_enter(&ill->ill_lock);
1490 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1491 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1492 			mutex_exit(&ill->ill_lock);
1493 			continue;
1494 		}
1495 
1496 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1497 		ill->ill_ipif_dup_count--;
1498 		mutex_exit(&ill->ill_lock);
1499 		ipif->ipif_was_dup = B_TRUE;
1500 
1501 		if (ipif_ndp_up(ipif, addr) != EINPROGRESS)
1502 			(void) ipif_up_done_v6(ipif);
1503 	}
1504 	freeb(mp);
1505 }
1506 
1507 /*
1508  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1509  * As long as someone else holds the address, the interface will stay down.
1510  * When that conflict goes away, the interface is brought back up.  This is
1511  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1512  * server will recover from a failure.
1513  *
1514  * For DHCP and temporary addresses, recovery is not done in the kernel.
1515  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1516  *
1517  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1518  */
1519 static void
1520 ipif6_dup_recovery(void *arg)
1521 {
1522 	ipif_t *ipif = arg;
1523 
1524 	ipif->ipif_recovery_id = 0;
1525 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1526 		return;
1527 
1528 	/*
1529 	 * No lock, because this is just an optimization.
1530 	 */
1531 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1532 		return;
1533 
1534 	/* If the link is down, we'll retry this later */
1535 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1536 		return;
1537 
1538 	ndp_do_recovery(ipif);
1539 }
1540 
1541 /*
1542  * Perform interface recovery by forcing the duplicate interfaces up and
1543  * allowing the system to determine which ones should stay up.
1544  *
1545  * Called both by recovery timer expiry and link-up notification.
1546  */
1547 void
1548 ndp_do_recovery(ipif_t *ipif)
1549 {
1550 	ill_t *ill = ipif->ipif_ill;
1551 	mblk_t *mp;
1552 	ip_stack_t *ipst = ill->ill_ipst;
1553 
1554 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1555 	if (mp == NULL) {
1556 		mutex_enter(&ill->ill_lock);
1557 		if (ipif->ipif_recovery_id == 0 &&
1558 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1559 		    IPIF_CONDEMNED))) {
1560 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1561 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1562 		}
1563 		mutex_exit(&ill->ill_lock);
1564 	} else {
1565 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1566 		    sizeof (ipif->ipif_v6lcl_addr));
1567 		ill_refhold(ill);
1568 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1569 		    B_FALSE);
1570 	}
1571 }
1572 
1573 /*
1574  * Find the solicitation in the given message, and extract printable details
1575  * (MAC and IP addresses) from it.
1576  */
1577 static nd_neighbor_solicit_t *
1578 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1579     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1580 {
1581 	nd_neighbor_solicit_t *ns;
1582 	ip6_t *ip6h;
1583 	uchar_t *addr;
1584 	int alen;
1585 
1586 	alen = 0;
1587 	ip6h = (ip6_t *)mp->b_rptr;
1588 	if (dl_mp == NULL) {
1589 		nd_opt_hdr_t *opt;
1590 		int nslen;
1591 
1592 		/*
1593 		 * If it's from the fast-path, then it can't be a probe
1594 		 * message, and thus must include the source linkaddr option.
1595 		 * Extract that here.
1596 		 */
1597 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1598 		nslen = mp->b_wptr - (uchar_t *)ns;
1599 		if ((nslen -= sizeof (*ns)) > 0) {
1600 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1601 			    ND_OPT_SOURCE_LINKADDR);
1602 			if (opt != NULL &&
1603 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1604 			    ill->ill_nd_lla_len) {
1605 				addr = (uchar_t *)(opt + 1);
1606 				alen = ill->ill_nd_lla_len;
1607 			}
1608 		}
1609 		/*
1610 		 * We cheat a bit here for the sake of printing usable log
1611 		 * messages in the rare case where the reply we got was unicast
1612 		 * without a source linkaddr option, and the interface is in
1613 		 * fastpath mode.  (Sigh.)
1614 		 */
1615 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1616 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1617 			struct ether_header *pether;
1618 
1619 			pether = (struct ether_header *)((char *)ip6h -
1620 			    sizeof (*pether));
1621 			addr = pether->ether_shost.ether_addr_octet;
1622 			alen = ETHERADDRL;
1623 		}
1624 	} else {
1625 		dl_unitdata_ind_t *dlu;
1626 
1627 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1628 		alen = dlu->dl_src_addr_length;
1629 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1630 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1631 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1632 			if (ill->ill_sap_length < 0) {
1633 				alen += ill->ill_sap_length;
1634 			} else {
1635 				addr += ill->ill_sap_length;
1636 				alen -= ill->ill_sap_length;
1637 			}
1638 		}
1639 	}
1640 	if (alen > 0) {
1641 		*haddr = addr;
1642 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1643 	} else {
1644 		*haddr = NULL;
1645 		(void) strcpy(hbuf, "?");
1646 	}
1647 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1648 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1649 	return (ns);
1650 }
1651 
1652 /*
1653  * This is for exclusive changes due to NDP duplicate address detection
1654  * failure.
1655  */
1656 /* ARGSUSED */
1657 static void
1658 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1659 {
1660 	ill_t	*ill = rq->q_ptr;
1661 	ipif_t	*ipif;
1662 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1663 	char hbuf[MAC_STR_LEN];
1664 	char sbuf[INET6_ADDRSTRLEN];
1665 	nd_neighbor_solicit_t *ns;
1666 	mblk_t *dl_mp = NULL;
1667 	uchar_t *haddr;
1668 	ip_stack_t *ipst = ill->ill_ipst;
1669 
1670 	if (DB_TYPE(mp) != M_DATA) {
1671 		dl_mp = mp;
1672 		mp = mp->b_cont;
1673 	}
1674 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1675 	    sizeof (sbuf), &haddr);
1676 	if (haddr != NULL &&
1677 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1678 		/*
1679 		 * Ignore conflicts generated by misbehaving switches that just
1680 		 * reflect our own messages back to us.
1681 		 */
1682 		goto ignore_conflict;
1683 	}
1684 	(void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf));
1685 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1686 
1687 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1688 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1689 		    &ns->nd_ns_target)) {
1690 			continue;
1691 		}
1692 
1693 		/* If it's already marked, then don't do anything. */
1694 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1695 			continue;
1696 
1697 		/*
1698 		 * If this is a failure during duplicate recovery, then don't
1699 		 * complain.  It may take a long time to recover.
1700 		 */
1701 		if (!ipif->ipif_was_dup) {
1702 			if (ipif->ipif_id != 0) {
1703 				(void) snprintf(ibuf + ill->ill_name_length - 1,
1704 				    sizeof (ibuf) - ill->ill_name_length + 1,
1705 				    ":%d", ipif->ipif_id);
1706 			}
1707 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1708 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1709 		}
1710 		mutex_enter(&ill->ill_lock);
1711 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1712 		ipif->ipif_flags |= IPIF_DUPLICATE;
1713 		ill->ill_ipif_dup_count++;
1714 		mutex_exit(&ill->ill_lock);
1715 		(void) ipif_down(ipif, NULL, NULL);
1716 		ipif_down_tail(ipif);
1717 		mutex_enter(&ill->ill_lock);
1718 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1719 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1720 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1721 		    IPIF_CONDEMNED)) &&
1722 		    ipst->ips_ip_dup_recovery > 0) {
1723 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1724 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1725 		}
1726 		mutex_exit(&ill->ill_lock);
1727 	}
1728 ignore_conflict:
1729 	if (dl_mp != NULL)
1730 		freeb(dl_mp);
1731 	freemsg(mp);
1732 }
1733 
1734 /*
1735  * Handle failure by tearing down the ipifs with the specified address.  Note
1736  * that tearing down the ipif also means deleting the nce through ipif_down, so
1737  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1738  * we start a timer on the ipif.
1739  */
1740 static void
1741 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1742 {
1743 	if ((mp = copymsg(mp)) != NULL) {
1744 		if (dl_mp == NULL)
1745 			dl_mp = mp;
1746 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1747 			dl_mp->b_cont = mp;
1748 		if (dl_mp == NULL) {
1749 			freemsg(mp);
1750 		} else {
1751 			ill_refhold(ill);
1752 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1753 			    B_FALSE);
1754 		}
1755 	}
1756 	ndp_delete(nce);
1757 }
1758 
1759 /*
1760  * Handle a discovered conflict: some other system is advertising that it owns
1761  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1762  * interface.
1763  */
1764 static void
1765 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1766 {
1767 	ipif_t *ipif;
1768 	uint32_t now;
1769 	uint_t maxdefense;
1770 	uint_t defs;
1771 	ip_stack_t *ipst = ill->ill_ipst;
1772 
1773 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1774 	    NULL, NULL, ipst);
1775 	if (ipif == NULL)
1776 		return;
1777 	/*
1778 	 * First, figure out if this address is disposable.
1779 	 */
1780 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1781 		maxdefense = ipst->ips_ip_max_temp_defend;
1782 	else
1783 		maxdefense = ipst->ips_ip_max_defend;
1784 
1785 	/*
1786 	 * Now figure out how many times we've defended ourselves.  Ignore
1787 	 * defenses that happened long in the past.
1788 	 */
1789 	now = gethrestime_sec();
1790 	mutex_enter(&nce->nce_lock);
1791 	if ((defs = nce->nce_defense_count) > 0 &&
1792 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1793 		nce->nce_defense_count = defs = 0;
1794 	}
1795 	nce->nce_defense_count++;
1796 	nce->nce_defense_time = now;
1797 	mutex_exit(&nce->nce_lock);
1798 	ipif_refrele(ipif);
1799 
1800 	/*
1801 	 * If we've defended ourselves too many times already, then give up and
1802 	 * tear down the interface(s) using this address.  Otherwise, defend by
1803 	 * sending out an unsolicited Neighbor Advertisement.
1804 	 */
1805 	if (defs >= maxdefense) {
1806 		ip_ndp_failure(ill, mp, dl_mp, nce);
1807 	} else {
1808 		char hbuf[MAC_STR_LEN];
1809 		char sbuf[INET6_ADDRSTRLEN];
1810 		uchar_t *haddr;
1811 
1812 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1813 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1814 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1815 		    hbuf, sbuf, ill->ill_name);
1816 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1817 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1818 		    nce_advert_flags(nce));
1819 	}
1820 }
1821 
1822 static void
1823 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1824 {
1825 	nd_neighbor_solicit_t *ns;
1826 	uint32_t	hlen = ill->ill_nd_lla_len;
1827 	uchar_t		*haddr = NULL;
1828 	icmp6_t		*icmp_nd;
1829 	ip6_t		*ip6h;
1830 	nce_t		*our_nce = NULL;
1831 	in6_addr_t	target;
1832 	in6_addr_t	src;
1833 	int		len;
1834 	int		flag = 0;
1835 	nd_opt_hdr_t	*opt = NULL;
1836 	boolean_t	bad_solicit = B_FALSE;
1837 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1838 
1839 	ip6h = (ip6_t *)mp->b_rptr;
1840 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1841 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1842 	src = ip6h->ip6_src;
1843 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1844 	target = ns->nd_ns_target;
1845 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1846 		if (ip_debug > 2) {
1847 			/* ip1dbg */
1848 			pr_addr_dbg("ndp_input_solicit: Target is"
1849 			    " multicast! %s\n", AF_INET6, &target);
1850 		}
1851 		bad_solicit = B_TRUE;
1852 		goto done;
1853 	}
1854 	if (len > sizeof (nd_neighbor_solicit_t)) {
1855 		/* Options present */
1856 		opt = (nd_opt_hdr_t *)&ns[1];
1857 		len -= sizeof (nd_neighbor_solicit_t);
1858 		if (!ndp_verify_optlen(opt, len)) {
1859 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1860 			bad_solicit = B_TRUE;
1861 			goto done;
1862 		}
1863 	}
1864 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1865 		/* Check to see if this is a valid DAD solicitation */
1866 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1867 			if (ip_debug > 2) {
1868 				/* ip1dbg */
1869 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1870 				    "Destination is not solicited node "
1871 				    "multicast %s\n", AF_INET6,
1872 				    &ip6h->ip6_dst);
1873 			}
1874 			bad_solicit = B_TRUE;
1875 			goto done;
1876 		}
1877 	}
1878 
1879 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1880 	/*
1881 	 * If this is a valid Solicitation, a permanent
1882 	 * entry should exist in the cache
1883 	 */
1884 	if (our_nce == NULL ||
1885 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1886 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1887 		    "ifname=%s ", ill->ill_name));
1888 		if (ip_debug > 2) {
1889 			/* ip1dbg */
1890 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1891 		}
1892 		bad_solicit = B_TRUE;
1893 		goto done;
1894 	}
1895 
1896 	/* At this point we should have a verified NS per spec */
1897 	if (opt != NULL) {
1898 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1899 		if (opt != NULL) {
1900 			haddr = (uchar_t *)&opt[1];
1901 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1902 			    hlen == 0) {
1903 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1904 				bad_solicit = B_TRUE;
1905 				goto done;
1906 			}
1907 		}
1908 	}
1909 
1910 	/* If sending directly to peer, set the unicast flag */
1911 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1912 		flag |= NDP_UNICAST;
1913 
1914 	/*
1915 	 * Create/update the entry for the soliciting node.
1916 	 * or respond to outstanding queries, don't if
1917 	 * the source is unspecified address.
1918 	 */
1919 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1920 		int	err;
1921 		nce_t	*nnce;
1922 
1923 		ASSERT(ill->ill_isv6);
1924 		/*
1925 		 * Regular solicitations *must* include the Source Link-Layer
1926 		 * Address option.  Ignore messages that do not.
1927 		 */
1928 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1929 			ip1dbg(("ndp_input_solicit: source link-layer address "
1930 			    "option missing with a specified source.\n"));
1931 			bad_solicit = B_TRUE;
1932 			goto done;
1933 		}
1934 
1935 		/*
1936 		 * This is a regular solicitation.  If we're still in the
1937 		 * process of verifying the address, then don't respond at all
1938 		 * and don't keep track of the sender.
1939 		 */
1940 		if (our_nce->nce_state == ND_PROBE)
1941 			goto done;
1942 
1943 		/*
1944 		 * If the solicitation doesn't have sender hardware address
1945 		 * (legal for unicast solicitation), then process without
1946 		 * installing the return NCE.  Either we already know it, or
1947 		 * we'll be forced to look it up when (and if) we reply to the
1948 		 * packet.
1949 		 */
1950 		if (haddr == NULL)
1951 			goto no_source;
1952 
1953 		err = ndp_lookup_then_add_v6(ill,
1954 		    haddr,
1955 		    &src,	/* Soliciting nodes address */
1956 		    &ipv6_all_ones,
1957 		    &ipv6_all_zeros,
1958 		    0,
1959 		    0,
1960 		    ND_STALE,
1961 		    &nnce);
1962 		switch (err) {
1963 		case 0:
1964 			/* done with this entry */
1965 			NCE_REFRELE(nnce);
1966 			break;
1967 		case EEXIST:
1968 			/*
1969 			 * B_FALSE indicates this is not an
1970 			 * an advertisement.
1971 			 */
1972 			ndp_process(nnce, haddr, 0, B_FALSE);
1973 			NCE_REFRELE(nnce);
1974 			break;
1975 		default:
1976 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1977 			    err));
1978 			goto done;
1979 		}
1980 no_source:
1981 		flag |= NDP_SOLICITED;
1982 	} else {
1983 		/*
1984 		 * No source link layer address option should be present in a
1985 		 * valid DAD request.
1986 		 */
1987 		if (haddr != NULL) {
1988 			ip1dbg(("ndp_input_solicit: source link-layer address "
1989 			    "option present with an unspecified source.\n"));
1990 			bad_solicit = B_TRUE;
1991 			goto done;
1992 		}
1993 		if (our_nce->nce_state == ND_PROBE) {
1994 			/*
1995 			 * Internally looped-back probes won't have DLPI
1996 			 * attached to them.  External ones (which are sent by
1997 			 * multicast) always will.  Just ignore our own
1998 			 * transmissions.
1999 			 */
2000 			if (dl_mp != NULL) {
2001 				/*
2002 				 * If someone else is probing our address, then
2003 				 * we've crossed wires.  Declare failure.
2004 				 */
2005 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
2006 			}
2007 			goto done;
2008 		}
2009 		/*
2010 		 * This is a DAD probe.  Multicast the advertisement to the
2011 		 * all-nodes address.
2012 		 */
2013 		src = ipv6_all_hosts_mcast;
2014 	}
2015 	flag |= nce_advert_flags(our_nce);
2016 	/* Response to a solicitation */
2017 	(void) nce_xmit(ill,
2018 	    ND_NEIGHBOR_ADVERT,
2019 	    ill,	/* ill to be used for extracting ill_nd_lla */
2020 	    B_TRUE,	/* use ill_nd_lla */
2021 	    &target,	/* Source and target of the advertisement pkt */
2022 	    &src,	/* IP Destination (source of original pkt) */
2023 	    flag);
2024 done:
2025 	if (bad_solicit)
2026 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2027 	if (our_nce != NULL)
2028 		NCE_REFRELE(our_nce);
2029 }
2030 
2031 void
2032 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2033 {
2034 	nd_neighbor_advert_t *na;
2035 	uint32_t	hlen = ill->ill_nd_lla_len;
2036 	uchar_t		*haddr = NULL;
2037 	icmp6_t		*icmp_nd;
2038 	ip6_t		*ip6h;
2039 	nce_t		*dst_nce = NULL;
2040 	in6_addr_t	target;
2041 	nd_opt_hdr_t	*opt = NULL;
2042 	int		len;
2043 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2044 	ip_stack_t	*ipst = ill->ill_ipst;
2045 
2046 	ip6h = (ip6_t *)mp->b_rptr;
2047 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2048 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2049 	na = (nd_neighbor_advert_t *)icmp_nd;
2050 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2051 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2052 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2053 		    "solicited flag is not zero\n"));
2054 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2055 		return;
2056 	}
2057 	target = na->nd_na_target;
2058 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2059 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2060 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2061 		return;
2062 	}
2063 	if (len > sizeof (nd_neighbor_advert_t)) {
2064 		opt = (nd_opt_hdr_t *)&na[1];
2065 		if (!ndp_verify_optlen(opt,
2066 		    len - sizeof (nd_neighbor_advert_t))) {
2067 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2068 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2069 			return;
2070 		}
2071 		/* At this point we have a verified NA per spec */
2072 		len -= sizeof (nd_neighbor_advert_t);
2073 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2074 		if (opt != NULL) {
2075 			haddr = (uchar_t *)&opt[1];
2076 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2077 			    hlen == 0) {
2078 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2079 				BUMP_MIB(mib,
2080 				    ipv6IfIcmpInBadNeighborAdvertisements);
2081 				return;
2082 			}
2083 		}
2084 	}
2085 
2086 	/*
2087 	 * If this interface is part of the group look at all the
2088 	 * ills in the group.
2089 	 */
2090 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2091 	if (ill->ill_group != NULL)
2092 		ill = ill->ill_group->illgrp_ill;
2093 
2094 	for (; ill != NULL; ill = ill->ill_group_next) {
2095 		mutex_enter(&ill->ill_lock);
2096 		if (!ILL_CAN_LOOKUP(ill)) {
2097 			mutex_exit(&ill->ill_lock);
2098 			continue;
2099 		}
2100 		ill_refhold_locked(ill);
2101 		mutex_exit(&ill->ill_lock);
2102 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2103 		/* We have to drop the lock since ndp_process calls put* */
2104 		rw_exit(&ipst->ips_ill_g_lock);
2105 		if (dst_nce != NULL) {
2106 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2107 			    dst_nce->nce_state == ND_PROBE) {
2108 				/*
2109 				 * Someone else sent an advertisement for an
2110 				 * address that we're trying to configure.
2111 				 * Tear it down.  Note that dl_mp might be NULL
2112 				 * if we're getting a unicast reply.  This
2113 				 * isn't typically done (multicast is the norm
2114 				 * in response to a probe), but ip_ndp_failure
2115 				 * will handle the dl_mp == NULL case as well.
2116 				 */
2117 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2118 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2119 				/*
2120 				 * Someone just announced one of our local
2121 				 * addresses.  If it wasn't us, then this is a
2122 				 * conflict.  Defend the address or shut it
2123 				 * down.
2124 				 */
2125 				if (dl_mp != NULL &&
2126 				    (haddr == NULL ||
2127 				    nce_cmp_ll_addr(dst_nce, haddr,
2128 				    ill->ill_nd_lla_len))) {
2129 					ip_ndp_conflict(ill, mp, dl_mp,
2130 					    dst_nce);
2131 				}
2132 			} else {
2133 				if (na->nd_na_flags_reserved &
2134 				    ND_NA_FLAG_ROUTER) {
2135 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2136 				}
2137 				/* B_TRUE indicates this an advertisement */
2138 				ndp_process(dst_nce, haddr,
2139 				    na->nd_na_flags_reserved, B_TRUE);
2140 			}
2141 			NCE_REFRELE(dst_nce);
2142 		}
2143 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2144 		ill_refrele(ill);
2145 	}
2146 	rw_exit(&ipst->ips_ill_g_lock);
2147 }
2148 
2149 /*
2150  * Process NDP neighbor solicitation/advertisement messages.
2151  * The checksum has already checked o.k before reaching here.
2152  */
2153 void
2154 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2155 {
2156 	icmp6_t		*icmp_nd;
2157 	ip6_t		*ip6h;
2158 	int		len;
2159 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2160 
2161 
2162 	if (!pullupmsg(mp, -1)) {
2163 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2164 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2165 		goto done;
2166 	}
2167 	ip6h = (ip6_t *)mp->b_rptr;
2168 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2169 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2170 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2171 		goto done;
2172 	}
2173 	/*
2174 	 * NDP does not accept any extension headers between the
2175 	 * IP header and the ICMP header since e.g. a routing
2176 	 * header could be dangerous.
2177 	 * This assumes that any AH or ESP headers are removed
2178 	 * by ip prior to passing the packet to ndp_input.
2179 	 */
2180 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2181 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2182 		    ip6h->ip6_nxt));
2183 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2184 		goto done;
2185 	}
2186 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2187 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2188 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2189 	if (icmp_nd->icmp6_code != 0) {
2190 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2191 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2192 		goto done;
2193 	}
2194 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2195 	/*
2196 	 * Make sure packet length is large enough for either
2197 	 * a NS or a NA icmp packet.
2198 	 */
2199 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2200 		ip1dbg(("ndp_input: packet too short\n"));
2201 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2202 		goto done;
2203 	}
2204 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2205 		ndp_input_solicit(ill, mp, dl_mp);
2206 	} else {
2207 		ndp_input_advert(ill, mp, dl_mp);
2208 	}
2209 done:
2210 	freemsg(mp);
2211 }
2212 
2213 /*
2214  * nce_xmit is called to form and transmit a ND solicitation or
2215  * advertisement ICMP packet.
2216  *
2217  * If the source address is unspecified and this isn't a probe (used for
2218  * duplicate address detection), an appropriate source address and link layer
2219  * address will be chosen here.  The link layer address option is included if
2220  * the source is specified (i.e., all non-probe packets), and omitted (per the
2221  * specification) otherwise.
2222  *
2223  * It returns B_FALSE only if it does a successful put() to the
2224  * corresponding ill's ill_wq otherwise returns B_TRUE.
2225  */
2226 static boolean_t
2227 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2228     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2229     int flag)
2230 {
2231 	uint32_t	len;
2232 	icmp6_t 	*icmp6;
2233 	mblk_t		*mp;
2234 	ip6_t		*ip6h;
2235 	nd_opt_hdr_t	*opt;
2236 	uint_t		plen;
2237 	ip6i_t		*ip6i;
2238 	ipif_t		*src_ipif = NULL;
2239 	uint8_t		*hw_addr;
2240 	zoneid_t	zoneid = GLOBAL_ZONEID;
2241 
2242 	/*
2243 	 * If we have a unspecified source(sender) address, select a
2244 	 * proper source address for the solicitation here itself so
2245 	 * that we can initialize the h/w address correctly. This is
2246 	 * needed for interface groups as source address can come from
2247 	 * the whole group and the h/w address initialized from ill will
2248 	 * be wrong if the source address comes from a different ill.
2249 	 *
2250 	 * If the sender is specified then we use this address in order
2251 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2252 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2253 	 * by IP (we cannot guarantee that the global zone has an interface
2254 	 * route to the destination).
2255 	 *
2256 	 * Note that the NA never comes here with the unspecified source
2257 	 * address. The following asserts that whenever the source
2258 	 * address is specified, the haddr also should be specified.
2259 	 */
2260 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2261 
2262 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2263 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2264 		/*
2265 		 * Pick a source address for this solicitation, but
2266 		 * restrict the selection to addresses assigned to the
2267 		 * output interface (or interface group).  We do this
2268 		 * because the destination will create a neighbor cache
2269 		 * entry for the source address of this packet, so the
2270 		 * source address had better be a valid neighbor.
2271 		 */
2272 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2273 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2274 		if (src_ipif == NULL) {
2275 			char buf[INET6_ADDRSTRLEN];
2276 
2277 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2278 			    inet_ntop(AF_INET6, (char *)target, buf,
2279 			    sizeof (buf))));
2280 			return (B_TRUE);
2281 		}
2282 		sender = &src_ipif->ipif_v6src_addr;
2283 		hwaddr_ill = src_ipif->ipif_ill;
2284 	} else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2285 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
2286 		/*
2287 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2288 		 * ALL_ZONES if it cannot find a matching ipif for the address
2289 		 * we are trying to use. In this case we err on the side of
2290 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2291 		 */
2292 		if (zoneid == ALL_ZONES)
2293 			zoneid = GLOBAL_ZONEID;
2294 	}
2295 
2296 	/*
2297 	 * Always make sure that the NS/NA packets don't get load
2298 	 * spread. This is needed so that the probe packets sent
2299 	 * by the in.mpathd daemon can really go out on the desired
2300 	 * interface. Probe packets are made to go out on a desired
2301 	 * interface by including a ip6i with ATTACH_IF flag. As these
2302 	 * packets indirectly end up sending/receiving NS/NA packets
2303 	 * (neighbor doing NUD), we have to make sure that NA
2304 	 * also go out on the same interface.
2305 	 */
2306 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2307 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2308 	    plen * 8;
2309 	mp = allocb(len,  BPRI_LO);
2310 	if (mp == NULL) {
2311 		if (src_ipif != NULL)
2312 			ipif_refrele(src_ipif);
2313 		return (B_TRUE);
2314 	}
2315 	bzero((char *)mp->b_rptr, len);
2316 	mp->b_wptr = mp->b_rptr + len;
2317 
2318 	ip6i = (ip6i_t *)mp->b_rptr;
2319 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2320 	ip6i->ip6i_nxt = IPPROTO_RAW;
2321 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2322 	if (flag & NDP_PROBE)
2323 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2324 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2325 
2326 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2327 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2328 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2329 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2330 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2331 	ip6h->ip6_dst = *target;
2332 	icmp6 = (icmp6_t *)&ip6h[1];
2333 
2334 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2335 	    sizeof (nd_neighbor_advert_t));
2336 
2337 	if (operation == ND_NEIGHBOR_SOLICIT) {
2338 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2339 
2340 		if (!(flag & NDP_PROBE))
2341 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2342 		ip6h->ip6_src = *sender;
2343 		ns->nd_ns_target = *target;
2344 		if (!(flag & NDP_UNICAST)) {
2345 			/* Form multicast address of the target */
2346 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2347 			ip6h->ip6_dst.s6_addr32[3] |=
2348 			    ns->nd_ns_target.s6_addr32[3];
2349 		}
2350 	} else {
2351 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2352 
2353 		ASSERT(!(flag & NDP_PROBE));
2354 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2355 		ip6h->ip6_src = *sender;
2356 		na->nd_na_target = *sender;
2357 		if (flag & NDP_ISROUTER)
2358 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2359 		if (flag & NDP_SOLICITED)
2360 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2361 		if (flag & NDP_ORIDE)
2362 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2363 	}
2364 
2365 	hw_addr = NULL;
2366 	if (!(flag & NDP_PROBE)) {
2367 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2368 		    hwaddr_ill->ill_phys_addr;
2369 		if (hw_addr != NULL) {
2370 			/* Fill in link layer address and option len */
2371 			opt->nd_opt_len = (uint8_t)plen;
2372 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2373 		}
2374 	}
2375 	if (hw_addr == NULL) {
2376 		/* If there's no link layer address option, then strip it. */
2377 		len -= plen * 8;
2378 		mp->b_wptr = mp->b_rptr + len;
2379 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2380 	}
2381 
2382 	icmp6->icmp6_type = (uint8_t)operation;
2383 	icmp6->icmp6_code = 0;
2384 	/*
2385 	 * Prepare for checksum by putting icmp length in the icmp
2386 	 * checksum field. The checksum is calculated in ip_wput_v6.
2387 	 */
2388 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2389 
2390 	if (src_ipif != NULL)
2391 		ipif_refrele(src_ipif);
2392 
2393 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2394 	return (B_FALSE);
2395 }
2396 
2397 /*
2398  * Make a link layer address (does not include the SAP) from an nce.
2399  * To form the link layer address, use the last four bytes of ipv6
2400  * address passed in and the fixed offset stored in nce.
2401  */
2402 static void
2403 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2404 {
2405 	uchar_t *mask, *to;
2406 	ill_t	*ill = nce->nce_ill;
2407 	int 	len;
2408 
2409 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2410 		return;
2411 	ASSERT(nce->nce_res_mp != NULL);
2412 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2413 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2414 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2415 	ASSERT(addr != NULL);
2416 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2417 	    addrpos, ill->ill_nd_lla_len);
2418 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2419 	    IPV6_ADDR_LEN);
2420 	mask = (uchar_t *)&nce->nce_extract_mask;
2421 	mask += (IPV6_ADDR_LEN - len);
2422 	addr += (IPV6_ADDR_LEN - len);
2423 	to = addrpos + nce->nce_ll_extract_start;
2424 	while (len-- > 0)
2425 		*to++ |= *mask++ & *addr++;
2426 }
2427 
2428 /*
2429  * Pass a cache report back out via NDD.
2430  */
2431 /* ARGSUSED */
2432 int
2433 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
2434 {
2435 	ip_stack_t	*ipst;
2436 
2437 	if (CONN_Q(q))
2438 		ipst = CONNQ_TO_IPST(q);
2439 	else
2440 		ipst = ILLQ_TO_IPST(q);
2441 
2442 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
2443 	    "     proto addr/mask");
2444 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp, ipst);
2445 	return (0);
2446 }
2447 
2448 /*
2449  * Add a single line to the NDP Cache Entry Report.
2450  */
2451 static void
2452 nce_report1(nce_t *nce, uchar_t *mp_arg)
2453 {
2454 	ill_t		*ill = nce->nce_ill;
2455 	char		local_buf[INET6_ADDRSTRLEN];
2456 	uchar_t		flags_buf[10];
2457 	uint32_t	flags = nce->nce_flags;
2458 	mblk_t		*mp = (mblk_t *)mp_arg;
2459 	uchar_t		*h;
2460 	uchar_t		*m = flags_buf;
2461 	in6_addr_t	v6addr;
2462 	uint64_t	now;
2463 
2464 	/*
2465 	 * Lock the nce to protect nce_res_mp from being changed
2466 	 * if an external resolver address resolution completes
2467 	 * while nce_res_mp is being accessed here.
2468 	 *
2469 	 * Deal with all address formats, not just Ethernet-specific
2470 	 * In addition, make sure that the mblk has enough space
2471 	 * before writing to it. If is doesn't, allocate a new one.
2472 	 */
2473 	if (nce->nce_ipversion == IPV4_VERSION) {
2474 		/*
2475 		 * Don't include v4 NCEs in NDP cache entry report.
2476 		 * But sanity check for lingering ND_INITIAL entries
2477 		 * when we do 'ndd -get /dev/ip ip_ndp_cache_report'
2478 		 */
2479 		if (nce->nce_state == ND_INITIAL) {
2480 
2481 			now = TICK_TO_MSEC(lbolt64);
2482 			if (now - nce->nce_init_time > NCE_STUCK_TIMEOUT) {
2483 				DTRACE_PROBE1(nce__stuck, nce_t *, nce);
2484 			}
2485 		}
2486 		return;
2487 	}
2488 
2489 	ASSERT(ill != NULL);
2490 	v6addr = nce->nce_mask;
2491 	if (flags & NCE_F_PERMANENT)
2492 		*m++ = 'P';
2493 	if (flags & NCE_F_ISROUTER)
2494 		*m++ = 'R';
2495 	if (flags & NCE_F_MAPPING)
2496 		*m++ = 'M';
2497 	*m = '\0';
2498 
2499 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
2500 		size_t		addrlen;
2501 		char		*addr_buf;
2502 		dl_unitdata_req_t	*dl;
2503 
2504 		mutex_enter(&nce->nce_lock);
2505 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2506 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
2507 		if (ill->ill_flags & ILLF_XRESOLV)
2508 			addrlen = (3 * (dl->dl_dest_addr_length));
2509 		else
2510 			addrlen = (3 * (ill->ill_nd_lla_len));
2511 		if (addrlen <= 0) {
2512 			mutex_exit(&nce->nce_lock);
2513 			(void) mi_mpprintf(mp,
2514 			    "%8s %9s %5s %s/%d",
2515 			    ill->ill_name,
2516 			    "None",
2517 			    (uchar_t *)&flags_buf,
2518 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2519 			    (char *)local_buf, sizeof (local_buf)),
2520 			    ip_mask_to_plen_v6(&v6addr));
2521 		} else {
2522 			/*
2523 			 * Convert the hardware/lla address to ascii
2524 			 */
2525 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
2526 			if (addr_buf == NULL) {
2527 				mutex_exit(&nce->nce_lock);
2528 				return;
2529 			}
2530 			(void) mac_colon_addr((uint8_t *)h,
2531 			    (ill->ill_flags & ILLF_XRESOLV) ?
2532 			    dl->dl_dest_addr_length : ill->ill_nd_lla_len,
2533 			    addr_buf, addrlen);
2534 			mutex_exit(&nce->nce_lock);
2535 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
2536 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
2537 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2538 			    (char *)local_buf, sizeof (local_buf)),
2539 			    ip_mask_to_plen_v6(&v6addr));
2540 			kmem_free(addr_buf, addrlen);
2541 		}
2542 	} else {
2543 		(void) mi_mpprintf(mp,
2544 		    "%8s %9s %5s %s/%d",
2545 		    ill->ill_name,
2546 		    "None",
2547 		    (uchar_t *)&flags_buf,
2548 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2549 		    (char *)local_buf, sizeof (local_buf)),
2550 		    ip_mask_to_plen_v6(&v6addr));
2551 	}
2552 }
2553 
2554 mblk_t *
2555 nce_udreq_alloc(ill_t *ill)
2556 {
2557 	mblk_t	*template_mp = NULL;
2558 	dl_unitdata_req_t *dlur;
2559 	int	sap_length;
2560 
2561 	ASSERT(ill->ill_isv6);
2562 
2563 	sap_length = ill->ill_sap_length;
2564 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2565 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2566 	if (template_mp == NULL)
2567 		return (NULL);
2568 
2569 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2570 	dlur->dl_priority.dl_min = 0;
2571 	dlur->dl_priority.dl_max = 0;
2572 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2573 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2574 
2575 	/* Copy in the SAP value. */
2576 	NCE_LL_SAP_COPY(ill, template_mp);
2577 
2578 	return (template_mp);
2579 }
2580 
2581 /*
2582  * NDP retransmit timer.
2583  * This timer goes off when:
2584  * a. It is time to retransmit NS for resolver.
2585  * b. It is time to send reachability probes.
2586  */
2587 void
2588 ndp_timer(void *arg)
2589 {
2590 	nce_t		*nce = arg;
2591 	ill_t		*ill = nce->nce_ill;
2592 	uint32_t	ms;
2593 	char		addrbuf[INET6_ADDRSTRLEN];
2594 	mblk_t		*mp;
2595 	boolean_t	dropped = B_FALSE;
2596 	ip_stack_t	*ipst = ill->ill_ipst;
2597 
2598 	/*
2599 	 * The timer has to be cancelled by ndp_delete before doing the final
2600 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2601 	 * until it clears the timeout_id. Before clearing the timeout_id
2602 	 * bump up the refcnt so that we can continue to use the nce
2603 	 */
2604 	ASSERT(nce != NULL);
2605 
2606 	/*
2607 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2608 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2609 	 */
2610 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2611 	mutex_enter(&nce->nce_lock);
2612 	NCE_REFHOLD_LOCKED(nce);
2613 	nce->nce_timeout_id = 0;
2614 
2615 	/*
2616 	 * Check the reachability state first.
2617 	 */
2618 	switch (nce->nce_state) {
2619 	case ND_DELAY:
2620 		rw_exit(&ipst->ips_ill_g_lock);
2621 		nce->nce_state = ND_PROBE;
2622 		mutex_exit(&nce->nce_lock);
2623 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2624 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2625 		if (ip_debug > 3) {
2626 			/* ip2dbg */
2627 			pr_addr_dbg("ndp_timer: state for %s changed "
2628 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2629 		}
2630 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2631 		NCE_REFRELE(nce);
2632 		return;
2633 	case ND_PROBE:
2634 		/* must be retransmit timer */
2635 		rw_exit(&ipst->ips_ill_g_lock);
2636 		nce->nce_pcnt--;
2637 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2638 		    nce->nce_pcnt >= -1);
2639 		if (nce->nce_pcnt > 0) {
2640 			/*
2641 			 * As per RFC2461, the nce gets deleted after
2642 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2643 			 * Note that the first unicast solicitation is sent
2644 			 * during the DELAY state.
2645 			 */
2646 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2647 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2648 			    addrbuf, sizeof (addrbuf))));
2649 			mutex_exit(&nce->nce_lock);
2650 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2651 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2652 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2653 			    NDP_UNICAST);
2654 			if (dropped) {
2655 				mutex_enter(&nce->nce_lock);
2656 				nce->nce_pcnt++;
2657 				mutex_exit(&nce->nce_lock);
2658 			}
2659 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2660 		} else if (nce->nce_pcnt < 0) {
2661 			/* No hope, delete the nce */
2662 			nce->nce_state = ND_UNREACHABLE;
2663 			mutex_exit(&nce->nce_lock);
2664 			if (ip_debug > 2) {
2665 				/* ip1dbg */
2666 				pr_addr_dbg("ndp_timer: Delete IRE for"
2667 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2668 			}
2669 			ndp_delete(nce);
2670 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2671 			/* Wait RetransTimer, before deleting the entry */
2672 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2673 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2674 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2675 			mutex_exit(&nce->nce_lock);
2676 			/* Wait one interval before killing */
2677 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2678 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2679 			ipif_t *ipif;
2680 
2681 			/*
2682 			 * We're done probing, and we can now declare this
2683 			 * address to be usable.  Let IP know that it's ok to
2684 			 * use.
2685 			 */
2686 			nce->nce_state = ND_REACHABLE;
2687 			mutex_exit(&nce->nce_lock);
2688 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2689 			    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
2690 			if (ipif != NULL) {
2691 				if (ipif->ipif_was_dup) {
2692 					char ibuf[LIFNAMSIZ + 10];
2693 					char sbuf[INET6_ADDRSTRLEN];
2694 
2695 					ipif->ipif_was_dup = B_FALSE;
2696 					(void) strlcpy(ibuf, ill->ill_name,
2697 					    sizeof (ibuf));
2698 					(void) inet_ntop(AF_INET6,
2699 					    &ipif->ipif_v6lcl_addr,
2700 					    sbuf, sizeof (sbuf));
2701 					if (ipif->ipif_id != 0) {
2702 						(void) snprintf(ibuf +
2703 						    ill->ill_name_length - 1,
2704 						    sizeof (ibuf) -
2705 						    ill->ill_name_length + 1,
2706 						    ":%d", ipif->ipif_id);
2707 					}
2708 					cmn_err(CE_NOTE, "recovered address "
2709 					    "%s on %s", sbuf, ibuf);
2710 				}
2711 				if ((ipif->ipif_flags & IPIF_UP) &&
2712 				    !ipif->ipif_addr_ready) {
2713 					ip_rts_ifmsg(ipif);
2714 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2715 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2716 				}
2717 				ipif->ipif_addr_ready = 1;
2718 				ipif_refrele(ipif);
2719 			}
2720 			/* Begin defending our new address */
2721 			nce->nce_unsolicit_count = 0;
2722 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2723 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2724 			    nce_advert_flags(nce));
2725 			if (dropped) {
2726 				nce->nce_unsolicit_count = 1;
2727 				NDP_RESTART_TIMER(nce,
2728 				    ipst->ips_ip_ndp_unsolicit_interval);
2729 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2730 				NDP_RESTART_TIMER(nce,
2731 				    ipst->ips_ip_ndp_defense_interval);
2732 			}
2733 		} else {
2734 			/*
2735 			 * This is an address we're probing to be our own, but
2736 			 * the ill is down.  Wait until it comes back before
2737 			 * doing anything, but switch to reachable state so
2738 			 * that the restart will work.
2739 			 */
2740 			nce->nce_state = ND_REACHABLE;
2741 			mutex_exit(&nce->nce_lock);
2742 		}
2743 		NCE_REFRELE(nce);
2744 		return;
2745 	case ND_INCOMPLETE:
2746 		/*
2747 		 * Must be resolvers retransmit timer.
2748 		 */
2749 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2750 			ip6i_t	*ip6i;
2751 			ip6_t	*ip6h;
2752 			mblk_t *data_mp;
2753 
2754 			/*
2755 			 * Walk the list of packets queued, and see if there
2756 			 * are any multipathing probe packets. Such packets
2757 			 * are always queued at the head. Since this is a
2758 			 * retransmit timer firing, mark such packets as
2759 			 * delayed in ND resolution. This info will be used
2760 			 * in ip_wput_v6(). Multipathing probe packets will
2761 			 * always have an ip6i_t. Once we hit a packet without
2762 			 * it, we can break out of this loop.
2763 			 */
2764 			if (mp->b_datap->db_type == M_CTL)
2765 				data_mp = mp->b_cont;
2766 			else
2767 				data_mp = mp;
2768 
2769 			ip6h = (ip6_t *)data_mp->b_rptr;
2770 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2771 				break;
2772 
2773 			/*
2774 			 * This message should have been pulled up already in
2775 			 * ip_wput_v6. We can't do pullups here because the
2776 			 * b_next/b_prev is non-NULL.
2777 			 */
2778 			ip6i = (ip6i_t *)ip6h;
2779 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2780 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2781 
2782 			/* Mark this packet as delayed due to ND resolution */
2783 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2784 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2785 		}
2786 		if (nce->nce_qd_mp != NULL) {
2787 			ms = nce_solicit(nce, NULL);
2788 			rw_exit(&ipst->ips_ill_g_lock);
2789 			if (ms == 0) {
2790 				if (nce->nce_state != ND_REACHABLE) {
2791 					mutex_exit(&nce->nce_lock);
2792 					nce_resolv_failed(nce);
2793 					ndp_delete(nce);
2794 				} else {
2795 					mutex_exit(&nce->nce_lock);
2796 				}
2797 			} else {
2798 				mutex_exit(&nce->nce_lock);
2799 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2800 			}
2801 			NCE_REFRELE(nce);
2802 			return;
2803 		}
2804 		mutex_exit(&nce->nce_lock);
2805 		rw_exit(&ipst->ips_ill_g_lock);
2806 		NCE_REFRELE(nce);
2807 		break;
2808 	case ND_REACHABLE :
2809 		rw_exit(&ipst->ips_ill_g_lock);
2810 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2811 		    nce->nce_unsolicit_count != 0) ||
2812 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2813 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2814 			if (nce->nce_unsolicit_count > 0)
2815 				nce->nce_unsolicit_count--;
2816 			mutex_exit(&nce->nce_lock);
2817 			dropped = nce_xmit(ill,
2818 			    ND_NEIGHBOR_ADVERT,
2819 			    ill,	/* ill to be used for hw addr */
2820 			    B_FALSE,	/* use ill_phys_addr */
2821 			    &nce->nce_addr,
2822 			    &ipv6_all_hosts_mcast,
2823 			    nce_advert_flags(nce));
2824 			if (dropped) {
2825 				mutex_enter(&nce->nce_lock);
2826 				nce->nce_unsolicit_count++;
2827 				mutex_exit(&nce->nce_lock);
2828 			}
2829 			if (nce->nce_unsolicit_count != 0) {
2830 				NDP_RESTART_TIMER(nce,
2831 				    ipst->ips_ip_ndp_unsolicit_interval);
2832 			} else {
2833 				NDP_RESTART_TIMER(nce,
2834 				    ipst->ips_ip_ndp_defense_interval);
2835 			}
2836 		} else {
2837 			mutex_exit(&nce->nce_lock);
2838 		}
2839 		NCE_REFRELE(nce);
2840 		break;
2841 	default:
2842 		rw_exit(&ipst->ips_ill_g_lock);
2843 		mutex_exit(&nce->nce_lock);
2844 		NCE_REFRELE(nce);
2845 		break;
2846 	}
2847 }
2848 
2849 /*
2850  * Set a link layer address from the ll_addr passed in.
2851  * Copy SAP from ill.
2852  */
2853 static void
2854 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2855 {
2856 	ill_t	*ill = nce->nce_ill;
2857 	uchar_t	*woffset;
2858 
2859 	ASSERT(ll_addr != NULL);
2860 	/* Always called before fast_path_probe */
2861 	ASSERT(nce->nce_fp_mp == NULL);
2862 	if (ill->ill_sap_length != 0) {
2863 		/*
2864 		 * Copy the SAP type specified in the
2865 		 * request into the xmit template.
2866 		 */
2867 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2868 	}
2869 	if (ill->ill_phys_addr_length > 0) {
2870 		/*
2871 		 * The bcopy() below used to be called for the physical address
2872 		 * length rather than the link layer address length. For
2873 		 * ethernet and many other media, the phys_addr and lla are
2874 		 * identical.
2875 		 * However, with xresolv interfaces being introduced, the
2876 		 * phys_addr and lla are no longer the same, and the physical
2877 		 * address may not have any useful meaning, so we use the lla
2878 		 * for IPv6 address resolution and destination addressing.
2879 		 *
2880 		 * For PPP or other interfaces with a zero length
2881 		 * physical address, don't do anything here.
2882 		 * The bcopy() with a zero phys_addr length was previously
2883 		 * a no-op for interfaces with a zero-length physical address.
2884 		 * Using the lla for them would change the way they operate.
2885 		 * Doing nothing in such cases preserves expected behavior.
2886 		 */
2887 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2888 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2889 	}
2890 }
2891 
2892 static boolean_t
2893 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2894 {
2895 	ill_t	*ill = nce->nce_ill;
2896 	uchar_t	*ll_offset;
2897 
2898 	ASSERT(nce->nce_res_mp != NULL);
2899 	if (ll_addr == NULL)
2900 		return (B_FALSE);
2901 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2902 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2903 		return (B_TRUE);
2904 	return (B_FALSE);
2905 }
2906 
2907 /*
2908  * Updates the link layer address or the reachability state of
2909  * a cache entry.  Reset probe counter if needed.
2910  */
2911 static void
2912 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2913 {
2914 	ill_t	*ill = nce->nce_ill;
2915 	boolean_t need_stop_timer = B_FALSE;
2916 	boolean_t need_fastpath_update = B_FALSE;
2917 
2918 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2919 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2920 	/*
2921 	 * If this interface does not do NUD, there is no point
2922 	 * in allowing an update to the cache entry.  Although
2923 	 * we will respond to NS.
2924 	 * The only time we accept an update for a resolver when
2925 	 * NUD is turned off is when it has just been created.
2926 	 * Non-Resolvers will always be created as REACHABLE.
2927 	 */
2928 	if (new_state != ND_UNCHANGED) {
2929 		if ((nce->nce_flags & NCE_F_NONUD) &&
2930 		    (nce->nce_state != ND_INCOMPLETE))
2931 			return;
2932 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2933 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2934 		need_stop_timer = B_TRUE;
2935 		if (new_state == ND_REACHABLE)
2936 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2937 		else {
2938 			/* We force NUD in this case */
2939 			nce->nce_last = 0;
2940 		}
2941 		nce->nce_state = new_state;
2942 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2943 	}
2944 	/*
2945 	 * In case of fast path we need to free the the fastpath
2946 	 * M_DATA and do another probe.  Otherwise we can just
2947 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2948 	 * whatever packets that happens to be transmitting at the time.
2949 	 */
2950 	if (new_ll_addr != NULL) {
2951 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2952 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2953 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2954 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2955 		if (nce->nce_fp_mp != NULL) {
2956 			freemsg(nce->nce_fp_mp);
2957 			nce->nce_fp_mp = NULL;
2958 		}
2959 		need_fastpath_update = B_TRUE;
2960 	}
2961 	mutex_exit(&nce->nce_lock);
2962 	if (need_stop_timer) {
2963 		(void) untimeout(nce->nce_timeout_id);
2964 		nce->nce_timeout_id = 0;
2965 	}
2966 	if (need_fastpath_update)
2967 		nce_fastpath(nce);
2968 	mutex_enter(&nce->nce_lock);
2969 }
2970 
2971 void
2972 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2973 {
2974 	uint_t	count = 0;
2975 	mblk_t  **mpp;
2976 
2977 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2978 
2979 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2980 	    mpp = &(*mpp)->b_next) {
2981 		if (++count >
2982 		    nce->nce_ill->ill_max_buf) {
2983 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2984 
2985 			nce->nce_qd_mp->b_next = NULL;
2986 			nce->nce_qd_mp->b_prev = NULL;
2987 			freemsg(nce->nce_qd_mp);
2988 			nce->nce_qd_mp = tmp;
2989 		}
2990 	}
2991 	/* put this on the list */
2992 	if (head_insert) {
2993 		mp->b_next = nce->nce_qd_mp;
2994 		nce->nce_qd_mp = mp;
2995 	} else {
2996 		*mpp = mp;
2997 	}
2998 }
2999 
3000 static void
3001 nce_queue_mp(nce_t *nce, mblk_t *mp)
3002 {
3003 	boolean_t head_insert = B_FALSE;
3004 	ip6_t	*ip6h;
3005 	ip6i_t	*ip6i;
3006 	mblk_t *data_mp;
3007 
3008 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3009 
3010 	if (mp->b_datap->db_type == M_CTL)
3011 		data_mp = mp->b_cont;
3012 	else
3013 		data_mp = mp;
3014 	ip6h = (ip6_t *)data_mp->b_rptr;
3015 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
3016 		/*
3017 		 * This message should have been pulled up already in
3018 		 * ip_wput_v6. We can't do pullups here because the message
3019 		 * could be from the nce_qd_mp which could have b_next/b_prev
3020 		 * non-NULL.
3021 		 */
3022 		ip6i = (ip6i_t *)ip6h;
3023 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
3024 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
3025 		/*
3026 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
3027 		 * This has 2 aspects mentioned below.
3028 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
3029 		 * This ensures that next retransmit of ND solicitation
3030 		 * will use the interface specified by the probe packet,
3031 		 * for both NS and NA. This corresponds to the src address
3032 		 * in the IPv6 packet. If we insert at tail, we will be
3033 		 * depending on the packet at the head for successful
3034 		 * ND resolution. This is not reliable, because the interface
3035 		 * on which the NA arrives could be different from the interface
3036 		 * on which the NS was sent, and if the receiving interface is
3037 		 * failed, it will appear that the sending interface is also
3038 		 * failed, causing in.mpathd to misdiagnose this as link
3039 		 * failure.
3040 		 * 2. Drop the original packet, if the ND resolution did not
3041 		 * succeed in the first attempt. However we will create the
3042 		 * nce and the ire, as soon as the ND resolution succeeds.
3043 		 * We don't gain anything by queueing multiple probe packets
3044 		 * and sending them back-to-back once resolution succeeds.
3045 		 * It is sufficient to send just 1 packet after ND resolution
3046 		 * succeeds. Since mpathd is sending down probe packets at a
3047 		 * constant rate, we don't need to send the queued packet. We
3048 		 * need to queue it only for NDP resolution. The benefit of
3049 		 * dropping the probe packets that were delayed in ND
3050 		 * resolution, is that in.mpathd will not see inflated
3051 		 * RTT. If the ND resolution does not succeed within
3052 		 * in.mpathd's failure detection time, mpathd may detect
3053 		 * a failure, and it does not matter whether the packet
3054 		 * was queued or dropped.
3055 		 */
3056 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
3057 			head_insert = B_TRUE;
3058 	}
3059 
3060 	nce_queue_mp_common(nce, mp, head_insert);
3061 }
3062 
3063 /*
3064  * Called when address resolution failed due to a timeout.
3065  * Send an ICMP unreachable in response to all queued packets.
3066  */
3067 void
3068 nce_resolv_failed(nce_t *nce)
3069 {
3070 	mblk_t	*mp, *nxt_mp, *first_mp;
3071 	char	buf[INET6_ADDRSTRLEN];
3072 	ip6_t *ip6h;
3073 	zoneid_t zoneid = GLOBAL_ZONEID;
3074 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
3075 
3076 	ip1dbg(("nce_resolv_failed: dst %s\n",
3077 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3078 	mutex_enter(&nce->nce_lock);
3079 	mp = nce->nce_qd_mp;
3080 	nce->nce_qd_mp = NULL;
3081 	mutex_exit(&nce->nce_lock);
3082 	while (mp != NULL) {
3083 		nxt_mp = mp->b_next;
3084 		mp->b_next = NULL;
3085 		mp->b_prev = NULL;
3086 
3087 		first_mp = mp;
3088 		if (mp->b_datap->db_type == M_CTL) {
3089 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3090 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3091 			zoneid = io->ipsec_out_zoneid;
3092 			ASSERT(zoneid != ALL_ZONES);
3093 			mp = mp->b_cont;
3094 		}
3095 
3096 		ip6h = (ip6_t *)mp->b_rptr;
3097 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3098 			ip6i_t *ip6i;
3099 			/*
3100 			 * This message should have been pulled up already
3101 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3102 			 * the header is pulled up.
3103 			 */
3104 			ip6i = (ip6i_t *)ip6h;
3105 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3106 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3107 			mp->b_rptr += sizeof (ip6i_t);
3108 		}
3109 		/*
3110 		 * Ignore failure since icmp_unreachable_v6 will silently
3111 		 * drop packets with an unspecified source address.
3112 		 */
3113 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
3114 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3115 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
3116 		mp = nxt_mp;
3117 	}
3118 }
3119 
3120 /*
3121  * Called by SIOCSNDP* ioctl to add/change an nce entry
3122  * and the corresponding attributes.
3123  * Disallow states other than ND_REACHABLE or ND_STALE.
3124  */
3125 int
3126 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3127 {
3128 	sin6_t		*sin6;
3129 	in6_addr_t	*addr;
3130 	nce_t		*nce;
3131 	int		err;
3132 	uint16_t	new_flags = 0;
3133 	uint16_t	old_flags = 0;
3134 	int		inflags = lnr->lnr_flags;
3135 	ip_stack_t	*ipst = ill->ill_ipst;
3136 
3137 	ASSERT(ill->ill_isv6);
3138 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3139 	    (lnr->lnr_state_create != ND_STALE))
3140 		return (EINVAL);
3141 
3142 	sin6 = (sin6_t *)&lnr->lnr_addr;
3143 	addr = &sin6->sin6_addr;
3144 
3145 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3146 	/* We know it can not be mapping so just look in the hash table */
3147 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
3148 	nce = nce_lookup_addr(ill, addr, nce);
3149 	if (nce != NULL)
3150 		new_flags = nce->nce_flags;
3151 
3152 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3153 	case NDF_ISROUTER_ON:
3154 		new_flags |= NCE_F_ISROUTER;
3155 		break;
3156 	case NDF_ISROUTER_OFF:
3157 		new_flags &= ~NCE_F_ISROUTER;
3158 		break;
3159 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3160 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3161 		if (nce != NULL)
3162 			NCE_REFRELE(nce);
3163 		return (EINVAL);
3164 	}
3165 
3166 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3167 	case NDF_ANYCAST_ON:
3168 		new_flags |= NCE_F_ANYCAST;
3169 		break;
3170 	case NDF_ANYCAST_OFF:
3171 		new_flags &= ~NCE_F_ANYCAST;
3172 		break;
3173 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3174 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3175 		if (nce != NULL)
3176 			NCE_REFRELE(nce);
3177 		return (EINVAL);
3178 	}
3179 
3180 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
3181 	case NDF_PROXY_ON:
3182 		new_flags |= NCE_F_PROXY;
3183 		break;
3184 	case NDF_PROXY_OFF:
3185 		new_flags &= ~NCE_F_PROXY;
3186 		break;
3187 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
3188 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3189 		if (nce != NULL)
3190 			NCE_REFRELE(nce);
3191 		return (EINVAL);
3192 	}
3193 
3194 	if (nce == NULL) {
3195 		err = ndp_add_v6(ill,
3196 		    (uchar_t *)lnr->lnr_hdw_addr,
3197 		    addr,
3198 		    &ipv6_all_ones,
3199 		    &ipv6_all_zeros,
3200 		    0,
3201 		    new_flags,
3202 		    lnr->lnr_state_create,
3203 		    &nce);
3204 		if (err != 0) {
3205 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3206 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3207 			return (err);
3208 		}
3209 	}
3210 	old_flags = nce->nce_flags;
3211 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3212 		/*
3213 		 * Router turned to host, delete all ires.
3214 		 * XXX Just delete the entry, but we need to add too.
3215 		 */
3216 		nce->nce_flags &= ~NCE_F_ISROUTER;
3217 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3218 		ndp_delete(nce);
3219 		NCE_REFRELE(nce);
3220 		return (0);
3221 	}
3222 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3223 
3224 	mutex_enter(&nce->nce_lock);
3225 	nce->nce_flags = new_flags;
3226 	mutex_exit(&nce->nce_lock);
3227 	/*
3228 	 * Note that we ignore the state at this point, which
3229 	 * should be either STALE or REACHABLE.  Instead we let
3230 	 * the link layer address passed in to determine the state
3231 	 * much like incoming packets.
3232 	 */
3233 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3234 	NCE_REFRELE(nce);
3235 	return (0);
3236 }
3237 
3238 /*
3239  * If the device driver supports it, we make nce_fp_mp to have
3240  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3241  * The caller ensures there is hold on nce for this function.
3242  * Note that since ill_fastpath_probe() copies the mblk there is
3243  * no need for the hold beyond this function.
3244  */
3245 void
3246 nce_fastpath(nce_t *nce)
3247 {
3248 	ill_t	*ill = nce->nce_ill;
3249 	int res;
3250 
3251 	ASSERT(ill != NULL);
3252 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3253 
3254 	if (nce->nce_fp_mp != NULL) {
3255 		/* Already contains fastpath info */
3256 		return;
3257 	}
3258 	if (nce->nce_res_mp != NULL) {
3259 		nce_fastpath_list_add(nce);
3260 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3261 		/*
3262 		 * EAGAIN is an indication of a transient error
3263 		 * i.e. allocation failure etc. leave the nce in the list it
3264 		 * will be updated when another probe happens for another ire
3265 		 * if not it will be taken out of the list when the ire is
3266 		 * deleted.
3267 		 */
3268 
3269 		if (res != 0 && res != EAGAIN)
3270 			nce_fastpath_list_delete(nce);
3271 	}
3272 }
3273 
3274 /*
3275  * Drain the list of nce's waiting for fastpath response.
3276  */
3277 void
3278 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3279     void *arg)
3280 {
3281 
3282 	nce_t *next_nce;
3283 	nce_t *current_nce;
3284 	nce_t *first_nce;
3285 	nce_t *prev_nce = NULL;
3286 
3287 	mutex_enter(&ill->ill_lock);
3288 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3289 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3290 		next_nce = current_nce->nce_fastpath;
3291 		/*
3292 		 * Take it off the list if we're flushing, or if the callback
3293 		 * routine tells us to do so.  Otherwise, leave the nce in the
3294 		 * fastpath list to handle any pending response from the lower
3295 		 * layer.  We can't drain the list when the callback routine
3296 		 * comparison failed, because the response is asynchronous in
3297 		 * nature, and may not arrive in the same order as the list
3298 		 * insertion.
3299 		 */
3300 		if (func == NULL || func(current_nce, arg)) {
3301 			current_nce->nce_fastpath = NULL;
3302 			if (current_nce == first_nce)
3303 				ill->ill_fastpath_list = first_nce = next_nce;
3304 			else
3305 				prev_nce->nce_fastpath = next_nce;
3306 		} else {
3307 			/* previous element that is still in the list */
3308 			prev_nce = current_nce;
3309 		}
3310 		current_nce = next_nce;
3311 	}
3312 	mutex_exit(&ill->ill_lock);
3313 }
3314 
3315 /*
3316  * Add nce to the nce fastpath list.
3317  */
3318 void
3319 nce_fastpath_list_add(nce_t *nce)
3320 {
3321 	ill_t *ill;
3322 
3323 	ill = nce->nce_ill;
3324 
3325 	mutex_enter(&ill->ill_lock);
3326 	mutex_enter(&nce->nce_lock);
3327 
3328 	/*
3329 	 * if nce has not been deleted and
3330 	 * is not already in the list add it.
3331 	 */
3332 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3333 	    (nce->nce_fastpath == NULL)) {
3334 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3335 		ill->ill_fastpath_list = nce;
3336 	}
3337 
3338 	mutex_exit(&nce->nce_lock);
3339 	mutex_exit(&ill->ill_lock);
3340 }
3341 
3342 /*
3343  * remove nce from the nce fastpath list.
3344  */
3345 void
3346 nce_fastpath_list_delete(nce_t *nce)
3347 {
3348 	nce_t *nce_ptr;
3349 
3350 	ill_t *ill;
3351 
3352 	ill = nce->nce_ill;
3353 	ASSERT(ill != NULL);
3354 
3355 	mutex_enter(&ill->ill_lock);
3356 	if (nce->nce_fastpath == NULL)
3357 		goto done;
3358 
3359 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3360 
3361 	if (ill->ill_fastpath_list == nce) {
3362 		ill->ill_fastpath_list = nce->nce_fastpath;
3363 	} else {
3364 		nce_ptr = ill->ill_fastpath_list;
3365 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3366 			if (nce_ptr->nce_fastpath == nce) {
3367 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3368 				break;
3369 			}
3370 			nce_ptr = nce_ptr->nce_fastpath;
3371 		}
3372 	}
3373 
3374 	nce->nce_fastpath = NULL;
3375 done:
3376 	mutex_exit(&ill->ill_lock);
3377 }
3378 
3379 /*
3380  * Update all NCE's that are not in fastpath mode and
3381  * have an nce_fp_mp that matches mp. mp->b_cont contains
3382  * the fastpath header.
3383  *
3384  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3385  */
3386 boolean_t
3387 ndp_fastpath_update(nce_t *nce, void *arg)
3388 {
3389 	mblk_t 	*mp, *fp_mp;
3390 	uchar_t	*mp_rptr, *ud_mp_rptr;
3391 	mblk_t	*ud_mp = nce->nce_res_mp;
3392 	ptrdiff_t	cmplen;
3393 
3394 	if (nce->nce_flags & NCE_F_MAPPING)
3395 		return (B_TRUE);
3396 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3397 		return (B_TRUE);
3398 
3399 	ip2dbg(("ndp_fastpath_update: trying\n"));
3400 	mp = (mblk_t *)arg;
3401 	mp_rptr = mp->b_rptr;
3402 	cmplen = mp->b_wptr - mp_rptr;
3403 	ASSERT(cmplen >= 0);
3404 	ud_mp_rptr = ud_mp->b_rptr;
3405 	/*
3406 	 * The nce is locked here to prevent any other threads
3407 	 * from accessing and changing nce_res_mp when the IPv6 address
3408 	 * becomes resolved to an lla while we're in the middle
3409 	 * of looking at and comparing the hardware address (lla).
3410 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3411 	 * from examining nce_res_mp atthe same time.
3412 	 */
3413 	mutex_enter(&nce->nce_lock);
3414 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3415 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3416 		mutex_exit(&nce->nce_lock);
3417 		/*
3418 		 * Don't take the ire off the fastpath list yet,
3419 		 * since the response may come later.
3420 		 */
3421 		return (B_FALSE);
3422 	}
3423 	/* Matched - install mp as the fastpath mp */
3424 	ip1dbg(("ndp_fastpath_update: match\n"));
3425 	fp_mp = dupb(mp->b_cont);
3426 	if (fp_mp != NULL) {
3427 		nce->nce_fp_mp = fp_mp;
3428 	}
3429 	mutex_exit(&nce->nce_lock);
3430 	return (B_TRUE);
3431 }
3432 
3433 /*
3434  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3435  * driver.  Note that it assumes IP is exclusive...
3436  */
3437 /* ARGSUSED */
3438 void
3439 ndp_fastpath_flush(nce_t *nce, char *arg)
3440 {
3441 	if (nce->nce_flags & NCE_F_MAPPING)
3442 		return;
3443 	/* No fastpath info? */
3444 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3445 		return;
3446 
3447 	if (nce->nce_ipversion == IPV4_VERSION &&
3448 	    nce->nce_flags & NCE_F_BCAST) {
3449 		/*
3450 		 * IPv4 BROADCAST entries:
3451 		 * We can't delete the nce since it is difficult to
3452 		 * recreate these without going through the
3453 		 * ipif down/up dance.
3454 		 *
3455 		 * All access to nce->nce_fp_mp in the case of these
3456 		 * is protected by nce_lock.
3457 		 */
3458 		mutex_enter(&nce->nce_lock);
3459 		if (nce->nce_fp_mp != NULL) {
3460 			freeb(nce->nce_fp_mp);
3461 			nce->nce_fp_mp = NULL;
3462 			mutex_exit(&nce->nce_lock);
3463 			nce_fastpath(nce);
3464 		} else {
3465 			mutex_exit(&nce->nce_lock);
3466 		}
3467 	} else {
3468 		/* Just delete the NCE... */
3469 		ndp_delete(nce);
3470 	}
3471 }
3472 
3473 /*
3474  * Return a pointer to a given option in the packet.
3475  * Assumes that option part of the packet have already been validated.
3476  */
3477 nd_opt_hdr_t *
3478 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3479 {
3480 	while (optlen > 0) {
3481 		if (opt->nd_opt_type == opt_type)
3482 			return (opt);
3483 		optlen -= 8 * opt->nd_opt_len;
3484 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3485 	}
3486 	return (NULL);
3487 }
3488 
3489 /*
3490  * Verify all option lengths present are > 0, also check to see
3491  * if the option lengths and packet length are consistent.
3492  */
3493 boolean_t
3494 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3495 {
3496 	ASSERT(opt != NULL);
3497 	while (optlen > 0) {
3498 		if (opt->nd_opt_len == 0)
3499 			return (B_FALSE);
3500 		optlen -= 8 * opt->nd_opt_len;
3501 		if (optlen < 0)
3502 			return (B_FALSE);
3503 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3504 	}
3505 	return (B_TRUE);
3506 }
3507 
3508 /*
3509  * ndp_walk function.
3510  * Free a fraction of the NCE cache entries.
3511  * A fraction of zero means to not free any in that category.
3512  */
3513 void
3514 ndp_cache_reclaim(nce_t *nce, char *arg)
3515 {
3516 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3517 	uint_t	rand;
3518 
3519 	if (nce->nce_flags & NCE_F_PERMANENT)
3520 		return;
3521 
3522 	rand = (uint_t)lbolt +
3523 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3524 	if (ncr->ncr_host != 0 &&
3525 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3526 		ndp_delete(nce);
3527 		return;
3528 	}
3529 }
3530 
3531 /*
3532  * ndp_walk function.
3533  * Count the number of NCEs that can be deleted.
3534  * These would be hosts but not routers.
3535  */
3536 void
3537 ndp_cache_count(nce_t *nce, char *arg)
3538 {
3539 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3540 
3541 	if (nce->nce_flags & NCE_F_PERMANENT)
3542 		return;
3543 
3544 	ncc->ncc_total++;
3545 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3546 		ncc->ncc_host++;
3547 }
3548 
3549 #ifdef NCE_DEBUG
3550 th_trace_t *
3551 th_trace_nce_lookup(nce_t *nce)
3552 {
3553 	int bucket_id;
3554 	th_trace_t *th_trace;
3555 
3556 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3557 
3558 	bucket_id = IP_TR_HASH(curthread);
3559 	ASSERT(bucket_id < IP_TR_HASH_MAX);
3560 
3561 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
3562 	    th_trace = th_trace->th_next) {
3563 		if (th_trace->th_id == curthread)
3564 			return (th_trace);
3565 	}
3566 	return (NULL);
3567 }
3568 
3569 void
3570 nce_trace_ref(nce_t *nce)
3571 {
3572 	int bucket_id;
3573 	th_trace_t *th_trace;
3574 
3575 	/*
3576 	 * Attempt to locate the trace buffer for the curthread.
3577 	 * If it does not exist, then allocate a new trace buffer
3578 	 * and link it in list of trace bufs for this ipif, at the head
3579 	 */
3580 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3581 
3582 	if (nce->nce_trace_disable == B_TRUE)
3583 		return;
3584 
3585 	th_trace = th_trace_nce_lookup(nce);
3586 	if (th_trace == NULL) {
3587 		bucket_id = IP_TR_HASH(curthread);
3588 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
3589 		    KM_NOSLEEP);
3590 		if (th_trace == NULL) {
3591 			nce->nce_trace_disable = B_TRUE;
3592 			nce_trace_inactive(nce);
3593 			return;
3594 		}
3595 		th_trace->th_id = curthread;
3596 		th_trace->th_next = nce->nce_trace[bucket_id];
3597 		th_trace->th_prev = &nce->nce_trace[bucket_id];
3598 		if (th_trace->th_next != NULL)
3599 			th_trace->th_next->th_prev = &th_trace->th_next;
3600 		nce->nce_trace[bucket_id] = th_trace;
3601 	}
3602 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
3603 	th_trace->th_refcnt++;
3604 	th_trace_rrecord(th_trace);
3605 }
3606 
3607 void
3608 nce_untrace_ref(nce_t *nce)
3609 {
3610 	th_trace_t *th_trace;
3611 
3612 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3613 
3614 	if (nce->nce_trace_disable == B_TRUE)
3615 		return;
3616 
3617 	th_trace = th_trace_nce_lookup(nce);
3618 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
3619 
3620 	th_trace_rrecord(th_trace);
3621 	th_trace->th_refcnt--;
3622 }
3623 
3624 void
3625 nce_trace_inactive(nce_t *nce)
3626 {
3627 	th_trace_t *th_trace;
3628 	int i;
3629 
3630 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3631 
3632 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
3633 		while (nce->nce_trace[i] != NULL) {
3634 			th_trace = nce->nce_trace[i];
3635 
3636 			/* unlink th_trace and free it */
3637 			nce->nce_trace[i] = th_trace->th_next;
3638 			if (th_trace->th_next != NULL)
3639 				th_trace->th_next->th_prev =
3640 				    &nce->nce_trace[i];
3641 
3642 			th_trace->th_next = NULL;
3643 			th_trace->th_prev = NULL;
3644 			kmem_free(th_trace, sizeof (th_trace_t));
3645 		}
3646 	}
3647 
3648 }
3649 
3650 /* ARGSUSED */
3651 int
3652 nce_thread_exit(nce_t *nce, caddr_t arg)
3653 {
3654 	th_trace_t	*th_trace;
3655 	uint64_t	now;
3656 
3657 	mutex_enter(&nce->nce_lock);
3658 	if (nce->nce_state == ND_INITIAL) {
3659 
3660 		now = TICK_TO_MSEC(lbolt64);
3661 		if (now - nce->nce_init_time > NCE_STUCK_TIMEOUT) {
3662 			DTRACE_PROBE1(nce__stuck, nce_t *, nce);
3663 		}
3664 	}
3665 	th_trace = th_trace_nce_lookup(nce);
3666 
3667 	if (th_trace == NULL) {
3668 		mutex_exit(&nce->nce_lock);
3669 		return (0);
3670 	}
3671 
3672 	ASSERT(th_trace->th_refcnt == 0);
3673 
3674 	/* unlink th_trace and free it */
3675 	*th_trace->th_prev = th_trace->th_next;
3676 	if (th_trace->th_next != NULL)
3677 		th_trace->th_next->th_prev = th_trace->th_prev;
3678 	th_trace->th_next = NULL;
3679 	th_trace->th_prev = NULL;
3680 	kmem_free(th_trace, sizeof (th_trace_t));
3681 	mutex_exit(&nce->nce_lock);
3682 	return (0);
3683 }
3684 #endif
3685 
3686 /*
3687  * Called when address resolution fails due to a timeout.
3688  * Send an ICMP unreachable in response to all queued packets.
3689  */
3690 void
3691 arp_resolv_failed(nce_t *nce)
3692 {
3693 	mblk_t	*mp, *nxt_mp, *first_mp;
3694 	char	buf[INET6_ADDRSTRLEN];
3695 	zoneid_t zoneid = GLOBAL_ZONEID;
3696 	struct in_addr ipv4addr;
3697 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3698 
3699 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3700 	ip3dbg(("arp_resolv_failed: dst %s\n",
3701 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3702 	mutex_enter(&nce->nce_lock);
3703 	mp = nce->nce_qd_mp;
3704 	nce->nce_qd_mp = NULL;
3705 	mutex_exit(&nce->nce_lock);
3706 
3707 	while (mp != NULL) {
3708 		nxt_mp = mp->b_next;
3709 		mp->b_next = NULL;
3710 		mp->b_prev = NULL;
3711 
3712 		first_mp = mp;
3713 		/*
3714 		 * Send icmp unreachable messages
3715 		 * to the hosts.
3716 		 */
3717 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3718 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3719 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3720 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3721 		mp = nxt_mp;
3722 	}
3723 }
3724 
3725 int
3726 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3727     nce_t **newnce, nce_t *src_nce)
3728 {
3729 	int	err;
3730 	nce_t	*nce;
3731 	in6_addr_t addr6;
3732 	ip_stack_t *ipst = ill->ill_ipst;
3733 
3734 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3735 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3736 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3737 	nce = nce_lookup_addr(ill, &addr6, nce);
3738 	if (nce == NULL) {
3739 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3740 	} else {
3741 		*newnce = nce;
3742 		err = EEXIST;
3743 	}
3744 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3745 	return (err);
3746 }
3747 
3748 /*
3749  * NDP Cache Entry creation routine for IPv4.
3750  * Mapped entries are handled in arp.
3751  * This routine must always be called with ndp4->ndp_g_lock held.
3752  * Prior to return, nce_refcnt is incremented.
3753  */
3754 static int
3755 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3756     nce_t **newnce, nce_t *src_nce)
3757 {
3758 	static	nce_t		nce_nil;
3759 	nce_t		*nce;
3760 	mblk_t		*mp;
3761 	mblk_t		*template = NULL;
3762 	nce_t		**ncep;
3763 	ip_stack_t	*ipst = ill->ill_ipst;
3764 	uint16_t	state = ND_INITIAL;
3765 	int		err;
3766 
3767 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3768 	ASSERT(!ill->ill_isv6);
3769 	ASSERT((flags & NCE_F_MAPPING) == 0);
3770 
3771 	if (ill->ill_resolver_mp == NULL)
3772 		return (EINVAL);
3773 	/*
3774 	 * Allocate the mblk to hold the nce.
3775 	 */
3776 	mp = allocb(sizeof (nce_t), BPRI_MED);
3777 	if (mp == NULL)
3778 		return (ENOMEM);
3779 
3780 	nce = (nce_t *)mp->b_rptr;
3781 	mp->b_wptr = (uchar_t *)&nce[1];
3782 	*nce = nce_nil;
3783 	nce->nce_ill = ill;
3784 	nce->nce_ipversion = IPV4_VERSION;
3785 	nce->nce_flags = flags;
3786 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3787 	nce->nce_rcnt = ill->ill_xmit_count;
3788 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3789 	nce->nce_mask = ipv6_all_ones;
3790 	nce->nce_extract_mask = ipv6_all_zeros;
3791 	nce->nce_ll_extract_start = 0;
3792 	nce->nce_qd_mp = NULL;
3793 	nce->nce_mp = mp;
3794 	/* This one is for nce getting created */
3795 	nce->nce_refcnt = 1;
3796 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3797 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3798 
3799 #ifdef NCE_DEBUG
3800 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
3801 #endif
3802 	if (src_nce != NULL) {
3803 		/*
3804 		 * src_nce has been provided by the caller. The only
3805 		 * caller who provides a non-null, non-broadcast
3806 		 * src_nce is from ip_newroute() which must pass in
3807 		 * a ND_REACHABLE src_nce (this condition is verified
3808 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3809 		 */
3810 		mutex_enter(&src_nce->nce_lock);
3811 		state = src_nce->nce_state;
3812 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3813 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3814 			/*
3815 			 * src_nce has been deleted, or
3816 			 * ip_arp_news is in the middle of
3817 			 * flushing entries in the the nce.
3818 			 * Fail the add, since we don't know
3819 			 * if it is safe to copy the contents of
3820 			 * src_nce
3821 			 */
3822 			DTRACE_PROBE2(nce__bad__src__nce,
3823 			    nce_t *, src_nce, ill_t *, ill);
3824 			mutex_exit(&src_nce->nce_lock);
3825 			err = EINVAL;
3826 			goto err_ret;
3827 		}
3828 		template = copyb(src_nce->nce_res_mp);
3829 		mutex_exit(&src_nce->nce_lock);
3830 		if (template == NULL) {
3831 			err = ENOMEM;
3832 			goto err_ret;
3833 		}
3834 	} else if (flags & NCE_F_BCAST) {
3835 		/*
3836 		 * broadcast nce.
3837 		 */
3838 		template = copyb(ill->ill_bcast_mp);
3839 		if (template == NULL) {
3840 			err = ENOMEM;
3841 			goto err_ret;
3842 		}
3843 		state = ND_REACHABLE;
3844 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3845 		/*
3846 		 * NORESOLVER entries are always created in the REACHABLE
3847 		 * state. We create a nce_res_mp with the IP nexthop address
3848 		 * in the destination address in the DLPI hdr if the
3849 		 * physical length is exactly 4 bytes.
3850 		 *
3851 		 * XXX not clear which drivers set ill_phys_addr_length to
3852 		 * IP_ADDR_LEN.
3853 		 */
3854 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3855 			template = ill_dlur_gen((uchar_t *)addr,
3856 			    ill->ill_phys_addr_length,
3857 			    ill->ill_sap, ill->ill_sap_length);
3858 		} else {
3859 			template = copyb(ill->ill_resolver_mp);
3860 		}
3861 		if (template == NULL) {
3862 			err = ENOMEM;
3863 			goto err_ret;
3864 		}
3865 		state = ND_REACHABLE;
3866 	}
3867 	nce->nce_fp_mp = NULL;
3868 	nce->nce_res_mp = template;
3869 	nce->nce_state = state;
3870 	if (state == ND_REACHABLE) {
3871 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3872 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3873 	} else {
3874 		nce->nce_last = 0;
3875 		if (state == ND_INITIAL)
3876 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3877 	}
3878 
3879 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3880 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3881 	/*
3882 	 * Atomically ensure that the ill is not CONDEMNED, before
3883 	 * adding the NCE.
3884 	 */
3885 	mutex_enter(&ill->ill_lock);
3886 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3887 		mutex_exit(&ill->ill_lock);
3888 		err = EINVAL;
3889 		goto err_ret;
3890 	}
3891 	if ((nce->nce_next = *ncep) != NULL)
3892 		nce->nce_next->nce_ptpn = &nce->nce_next;
3893 	*ncep = nce;
3894 	nce->nce_ptpn = ncep;
3895 	*newnce = nce;
3896 	/* This one is for nce being used by an active thread */
3897 	NCE_REFHOLD(*newnce);
3898 
3899 	/* Bump up the number of nce's referencing this ill */
3900 	ill->ill_nce_cnt++;
3901 	mutex_exit(&ill->ill_lock);
3902 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3903 	return (0);
3904 err_ret:
3905 	freeb(mp);
3906 	freemsg(template);
3907 	return (err);
3908 }
3909 
3910 void
3911 ndp_flush_qd_mp(nce_t *nce)
3912 {
3913 	mblk_t *qd_mp, *qd_next;
3914 
3915 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3916 	qd_mp = nce->nce_qd_mp;
3917 	nce->nce_qd_mp = NULL;
3918 	while (qd_mp != NULL) {
3919 		qd_next = qd_mp->b_next;
3920 		qd_mp->b_next = NULL;
3921 		qd_mp->b_prev = NULL;
3922 		freemsg(qd_mp);
3923 		qd_mp = qd_next;
3924 	}
3925 }
3926 
3927 
3928 /*
3929  * ndp_walk routine to delete all entries that have a given destination or
3930  * gateway address and cached link layer (MAC) address.  This is used when ARP
3931  * informs us that a network-to-link-layer mapping may have changed.
3932  */
3933 void
3934 nce_delete_hw_changed(nce_t *nce, void *arg)
3935 {
3936 	nce_hw_map_t *hwm = arg;
3937 	mblk_t *mp;
3938 	dl_unitdata_req_t *dlu;
3939 	uchar_t *macaddr;
3940 	ill_t *ill;
3941 	int saplen;
3942 	ipaddr_t nce_addr;
3943 
3944 	if (nce->nce_state != ND_REACHABLE)
3945 		return;
3946 
3947 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3948 	if (nce_addr != hwm->hwm_addr)
3949 		return;
3950 
3951 	mutex_enter(&nce->nce_lock);
3952 	if ((mp = nce->nce_res_mp) == NULL) {
3953 		mutex_exit(&nce->nce_lock);
3954 		return;
3955 	}
3956 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3957 	macaddr = (uchar_t *)(dlu + 1);
3958 	ill = nce->nce_ill;
3959 	if ((saplen = ill->ill_sap_length) > 0)
3960 		macaddr += saplen;
3961 	else
3962 		saplen = -saplen;
3963 
3964 	/*
3965 	 * If the hardware address is unchanged, then leave this one alone.
3966 	 * Note that saplen == abs(saplen) now.
3967 	 */
3968 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3969 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3970 		mutex_exit(&nce->nce_lock);
3971 		return;
3972 	}
3973 	mutex_exit(&nce->nce_lock);
3974 
3975 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3976 	ndp_delete(nce);
3977 }
3978 
3979 /*
3980  * This function verifies whether a given IPv4 address is potentially known to
3981  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3982  * so that it can continue to look for hardware changes on that address.
3983  */
3984 boolean_t
3985 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3986 {
3987 	nce_t		*nce;
3988 	struct in_addr	nceaddr;
3989 	ip_stack_t	*ipst = ns->netstack_ip;
3990 
3991 	if (addr == INADDR_ANY)
3992 		return (B_FALSE);
3993 
3994 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3995 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3996 	for (; nce != NULL; nce = nce->nce_next) {
3997 		/* Note that only v4 mapped entries are in the table. */
3998 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3999 		if (addr == nceaddr.s_addr &&
4000 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
4001 			/* Single flag check; no lock needed */
4002 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
4003 				break;
4004 		}
4005 	}
4006 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4007 	return (nce != NULL);
4008 }
4009