xref: /titanic_50/usr/src/uts/common/inet/ip/ip_ndp.c (revision 2a8164df8a5f42c8a00f10c67d7bc84f80ae9c41)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ipsec_info.h>
66 #include <inet/sctp_ip.h>
67 
68 /*
69  * Function names with nce_ prefix are static while function
70  * names with ndp_ prefix are used by rest of the IP.
71  *
72  * Lock ordering:
73  *
74  *	ndp_g_lock -> ill_lock -> nce_lock
75  *
76  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
77  * nce_next.  Nce_lock protects the contents of the NCE (particularly
78  * nce_refcnt).
79  */
80 
81 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
82     uint32_t ll_addr_len);
83 static	void	nce_ire_delete(nce_t *nce);
84 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
85 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
86 static	nce_t	*nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *,
87     nce_t *);
88 static	nce_t	*nce_lookup_mapping(ill_t *, const in6_addr_t *);
89 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
90     uchar_t *addr);
91 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
92 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
93 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
94 static	void	nce_update(nce_t *nce, uint16_t new_state,
95     uchar_t *new_ll_addr);
96 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
97 static	boolean_t	nce_xmit(ill_t *ill, uint8_t type,
98     boolean_t use_lla_addr, const in6_addr_t *sender,
99     const in6_addr_t *target, int flag);
100 static boolean_t	nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla,
101     const in6_addr_t *target, uint_t flags);
102 static boolean_t	nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla,
103     const in6_addr_t *src, uint_t flags);
104 static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
105     nce_t **, nce_t *);
106 static ipif_t	*ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill);
107 
108 #ifdef DEBUG
109 static void	nce_trace_cleanup(const nce_t *);
110 #endif
111 
/* Address of the IPv4 NCE hash-table slot that `addr' hashes to. */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[				\
	    IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/* Address of the IPv6 NCE hash-table slot that `addr' hashes to. */
#define	NCE_HASH_PTR_V6(ipst, addr)					\
	(&((ipst)->ips_ndp6->nce_hash_tbl[				\
	    NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))

/*
 * Non-tunable probe interval, based on link capabilities: 150 when the
 * ill has ill_note_link set, 1500 otherwise.
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
121 
122 /*
123  * NDP Cache Entry creation routine.
124  * Mapped entries will never do NUD .
125  * This routine must always be called with ndp6->ndp_g_lock held.
126  * Prior to return, nce_refcnt is incremented.
127  */
128 int
129 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
130     const in6_addr_t *mask, const in6_addr_t *extract_mask,
131     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
132     nce_t **newnce)
133 {
134 	static	nce_t		nce_nil;
135 	nce_t		*nce;
136 	mblk_t		*mp;
137 	mblk_t		*template;
138 	nce_t		**ncep;
139 	int		err;
140 	boolean_t	dropped = B_FALSE;
141 	ip_stack_t	*ipst = ill->ill_ipst;
142 
143 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
144 	ASSERT(ill != NULL && ill->ill_isv6);
145 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
146 		ip0dbg(("ndp_add_v6: no addr\n"));
147 		return (EINVAL);
148 	}
149 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
150 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
151 		return (EINVAL);
152 	}
153 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
154 	    (flags & NCE_F_MAPPING)) {
155 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
156 		return (EINVAL);
157 	}
158 	/*
159 	 * Allocate the mblk to hold the nce.
160 	 *
161 	 * XXX This can come out of a separate cache - nce_cache.
162 	 * We don't need the mp anymore as there are no more
163 	 * "qwriter"s
164 	 */
165 	mp = allocb(sizeof (nce_t), BPRI_MED);
166 	if (mp == NULL)
167 		return (ENOMEM);
168 
169 	nce = (nce_t *)mp->b_rptr;
170 	mp->b_wptr = (uchar_t *)&nce[1];
171 	*nce = nce_nil;
172 
173 	/*
174 	 * This one holds link layer address
175 	 */
176 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
177 		template = nce_udreq_alloc(ill);
178 	} else {
179 		if (ill->ill_resolver_mp == NULL) {
180 			freeb(mp);
181 			return (EINVAL);
182 		}
183 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
184 		template = copyb(ill->ill_resolver_mp);
185 	}
186 	if (template == NULL) {
187 		freeb(mp);
188 		return (ENOMEM);
189 	}
190 	nce->nce_ill = ill;
191 	nce->nce_ipversion = IPV6_VERSION;
192 	nce->nce_flags = flags;
193 	nce->nce_state = state;
194 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
195 	nce->nce_rcnt = ill->ill_xmit_count;
196 	nce->nce_addr = *addr;
197 	nce->nce_mask = *mask;
198 	nce->nce_extract_mask = *extract_mask;
199 	nce->nce_ll_extract_start = hw_extract_start;
200 	nce->nce_fp_mp = NULL;
201 	nce->nce_res_mp = template;
202 	if (state == ND_REACHABLE)
203 		nce->nce_last = TICK_TO_MSEC(lbolt64);
204 	else
205 		nce->nce_last = 0;
206 	nce->nce_qd_mp = NULL;
207 	nce->nce_mp = mp;
208 	if (hw_addr != NULL)
209 		nce_set_ll(nce, hw_addr);
210 	/* This one is for nce getting created */
211 	nce->nce_refcnt = 1;
212 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
213 	if (nce->nce_flags & NCE_F_MAPPING) {
214 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
215 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
216 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
217 		ncep = &ipst->ips_ndp6->nce_mask_entries;
218 	} else {
219 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
220 	}
221 
222 	nce->nce_trace_disable = B_FALSE;
223 
224 	/*
225 	 * Atomically ensure that the ill is not CONDEMNED, before
226 	 * adding the NCE.
227 	 */
228 	mutex_enter(&ill->ill_lock);
229 	if (ill->ill_state_flags & ILL_CONDEMNED) {
230 		mutex_exit(&ill->ill_lock);
231 		freeb(mp);
232 		freeb(template);
233 		return (EINVAL);
234 	}
235 	if ((nce->nce_next = *ncep) != NULL)
236 		nce->nce_next->nce_ptpn = &nce->nce_next;
237 	*ncep = nce;
238 	nce->nce_ptpn = ncep;
239 	*newnce = nce;
240 	/* This one is for nce being used by an active thread */
241 	NCE_REFHOLD(*newnce);
242 
243 	/* Bump up the number of nce's referencing this ill */
244 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
245 	    (char *), "nce", (void *), nce);
246 	ill->ill_nce_cnt++;
247 	mutex_exit(&ill->ill_lock);
248 
249 	err = 0;
250 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
251 		mutex_enter(&nce->nce_lock);
252 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
253 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
254 		mutex_exit(&nce->nce_lock);
255 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
256 		if (dropped) {
257 			mutex_enter(&nce->nce_lock);
258 			nce->nce_pcnt++;
259 			mutex_exit(&nce->nce_lock);
260 		}
261 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
262 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
263 		err = EINPROGRESS;
264 	} else if (flags & NCE_F_UNSOL_ADV) {
265 		/*
266 		 * We account for the transmit below by assigning one
267 		 * less than the ndd variable. Subsequent decrements
268 		 * are done in ndp_timer.
269 		 */
270 		mutex_enter(&nce->nce_lock);
271 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
272 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
273 		mutex_exit(&nce->nce_lock);
274 		dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast,
275 		    0);
276 		mutex_enter(&nce->nce_lock);
277 		if (dropped)
278 			nce->nce_unsolicit_count++;
279 		if (nce->nce_unsolicit_count != 0) {
280 			ASSERT(nce->nce_timeout_id == 0);
281 			nce->nce_timeout_id = timeout(ndp_timer, nce,
282 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
283 		}
284 		mutex_exit(&nce->nce_lock);
285 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
286 	}
287 
288 	/*
289 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
290 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
291 	 * We call nce_fastpath from nce_update if the link layer address of
292 	 * the peer changes from nce_update
293 	 */
294 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
295 		nce_fastpath(nce);
296 	return (err);
297 }
298 
299 int
300 ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
301     const in6_addr_t *addr, const in6_addr_t *mask,
302     const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags,
303     uint16_t state, nce_t **newnce)
304 {
305 	int	err = 0;
306 	nce_t	*nce;
307 	ip_stack_t	*ipst = ill->ill_ipst;
308 
309 	ASSERT(ill->ill_isv6);
310 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
311 
312 	/* Get head of v6 hash table */
313 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
314 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
315 	if (nce == NULL) {
316 		err = ndp_add_v6(ill,
317 		    hw_addr,
318 		    addr,
319 		    mask,
320 		    extract_mask,
321 		    hw_extract_start,
322 		    flags,
323 		    state,
324 		    newnce);
325 	} else {
326 		*newnce = nce;
327 		err = EEXIST;
328 	}
329 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
330 	return (err);
331 }
332 
333 /*
334  * Remove all the CONDEMNED nces from the appropriate hash table.
335  * We create a private list of NCEs, these may have ires pointing
336  * to them, so the list will be passed through to clean up dependent
337  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
338  */
339 static void
340 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
341 {
342 	nce_t *nce1;
343 	nce_t **ptpn;
344 
345 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
346 	ASSERT(ndp->ndp_g_walker == 0);
347 	for (; nce; nce = nce1) {
348 		nce1 = nce->nce_next;
349 		mutex_enter(&nce->nce_lock);
350 		if (nce->nce_flags & NCE_F_CONDEMNED) {
351 			ptpn = nce->nce_ptpn;
352 			nce1 = nce->nce_next;
353 			if (nce1 != NULL)
354 				nce1->nce_ptpn = ptpn;
355 			*ptpn = nce1;
356 			nce->nce_ptpn = NULL;
357 			nce->nce_next = NULL;
358 			nce->nce_next = *free_nce_list;
359 			*free_nce_list = nce;
360 		}
361 		mutex_exit(&nce->nce_lock);
362 	}
363 }
364 
365 /*
366  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
367  *    will return this NCE. Also no new IREs will be created that
368  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
369  *    be started (See NDP_RESTART_TIMER).
370  * 2. Cancel any currently running timeouts.
371  * 3. If there is an ndp walker, return. The walker will do the cleanup.
372  *    This ensures that walkers see a consistent list of NCEs while walking.
373  * 4. Otherwise remove the NCE from the list of NCEs
374  * 5. Delete all IREs pointing to this NCE.
375  */
376 void
377 ndp_delete(nce_t *nce)
378 {
379 	nce_t	**ptpn;
380 	nce_t	*nce1;
381 	int	ipversion = nce->nce_ipversion;
382 	ndp_g_t *ndp;
383 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
384 
385 	if (ipversion == IPV4_VERSION)
386 		ndp = ipst->ips_ndp4;
387 	else
388 		ndp = ipst->ips_ndp6;
389 
390 	/* Serialize deletes */
391 	mutex_enter(&nce->nce_lock);
392 	if (nce->nce_flags & NCE_F_CONDEMNED) {
393 		/* Some other thread is doing the delete */
394 		mutex_exit(&nce->nce_lock);
395 		return;
396 	}
397 	/*
398 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
399 	 * refcnt has to be >= 2
400 	 */
401 	ASSERT(nce->nce_refcnt >= 2);
402 	nce->nce_flags |= NCE_F_CONDEMNED;
403 	mutex_exit(&nce->nce_lock);
404 
405 	nce_fastpath_list_delete(nce);
406 
407 	/*
408 	 * Cancel any running timer. Timeout can't be restarted
409 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
410 	 * Passing invalid timeout id is fine.
411 	 */
412 	if (nce->nce_timeout_id != 0) {
413 		(void) untimeout(nce->nce_timeout_id);
414 		nce->nce_timeout_id = 0;
415 	}
416 
417 	mutex_enter(&ndp->ndp_g_lock);
418 	if (nce->nce_ptpn == NULL) {
419 		/*
420 		 * The last ndp walker has already removed this nce from
421 		 * the list after we marked the nce CONDEMNED and before
422 		 * we grabbed the global lock.
423 		 */
424 		mutex_exit(&ndp->ndp_g_lock);
425 		return;
426 	}
427 	if (ndp->ndp_g_walker > 0) {
428 		/*
429 		 * Can't unlink. The walker will clean up
430 		 */
431 		ndp->ndp_g_walker_cleanup = B_TRUE;
432 		mutex_exit(&ndp->ndp_g_lock);
433 		return;
434 	}
435 
436 	/*
437 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
438 	 * the timer since it is marked CONDEMNED.
439 	 */
440 	ptpn = nce->nce_ptpn;
441 	nce1 = nce->nce_next;
442 	if (nce1 != NULL)
443 		nce1->nce_ptpn = ptpn;
444 	*ptpn = nce1;
445 	nce->nce_ptpn = NULL;
446 	nce->nce_next = NULL;
447 	mutex_exit(&ndp->ndp_g_lock);
448 
449 	nce_ire_delete(nce);
450 }
451 
452 void
453 ndp_inactive(nce_t *nce)
454 {
455 	mblk_t		**mpp;
456 	ill_t		*ill;
457 
458 	ASSERT(nce->nce_refcnt == 0);
459 	ASSERT(MUTEX_HELD(&nce->nce_lock));
460 	ASSERT(nce->nce_fastpath == NULL);
461 
462 	/* Free all nce allocated messages */
463 	mpp = &nce->nce_first_mp_to_free;
464 	do {
465 		while (*mpp != NULL) {
466 			mblk_t  *mp;
467 
468 			mp = *mpp;
469 			*mpp = mp->b_next;
470 
471 			inet_freemsg(mp);
472 		}
473 	} while (mpp++ != &nce->nce_last_mp_to_free);
474 
475 #ifdef DEBUG
476 	nce_trace_cleanup(nce);
477 #endif
478 
479 	ill = nce->nce_ill;
480 	mutex_enter(&ill->ill_lock);
481 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
482 	    (char *), "nce", (void *), nce);
483 	ill->ill_nce_cnt--;
484 	/*
485 	 * If the number of nce's associated with this ill have dropped
486 	 * to zero, check whether we need to restart any operation that
487 	 * is waiting for this to happen.
488 	 */
489 	if (ILL_DOWN_OK(ill)) {
490 		/* ipif_ill_refrele_tail drops the ill_lock */
491 		ipif_ill_refrele_tail(ill);
492 	} else {
493 		mutex_exit(&ill->ill_lock);
494 	}
495 	mutex_destroy(&nce->nce_lock);
496 	if (nce->nce_mp != NULL)
497 		inet_freemsg(nce->nce_mp);
498 }
499 
500 /*
501  * ndp_walk routine.  Delete the nce if it is associated with the ill
502  * that is going away.  Always called as a writer.
503  */
504 void
505 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
506 {
507 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
508 		ndp_delete(nce);
509 	}
510 }
511 
512 /*
513  * Walk a list of to be inactive NCEs and blow away all the ires.
514  */
515 static void
516 nce_ire_delete_list(nce_t *nce)
517 {
518 	nce_t *nce_next;
519 
520 	ASSERT(nce != NULL);
521 	while (nce != NULL) {
522 		nce_next = nce->nce_next;
523 		nce->nce_next = NULL;
524 
525 		/*
526 		 * It is possible for the last ndp walker (this thread)
527 		 * to come here after ndp_delete has marked the nce CONDEMNED
528 		 * and before it has removed the nce from the fastpath list
529 		 * or called untimeout. So we need to do it here. It is safe
530 		 * for both ndp_delete and this thread to do it twice or
531 		 * even simultaneously since each of the threads has a
532 		 * reference on the nce.
533 		 */
534 		nce_fastpath_list_delete(nce);
535 		/*
536 		 * Cancel any running timer. Timeout can't be restarted
537 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
538 		 * Passing invalid timeout id is fine.
539 		 */
540 		if (nce->nce_timeout_id != 0) {
541 			(void) untimeout(nce->nce_timeout_id);
542 			nce->nce_timeout_id = 0;
543 		}
544 		/*
545 		 * We might hit this func thus in the v4 case:
546 		 * ipif_down->ipif_ndp_down->ndp_walk
547 		 */
548 
549 		if (nce->nce_ipversion == IPV4_VERSION) {
550 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
551 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
552 		} else {
553 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
554 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
555 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
556 		}
557 		NCE_REFRELE_NOTR(nce);
558 		nce = nce_next;
559 	}
560 }
561 
562 /*
563  * Delete an ire when the nce goes away.
564  */
565 /* ARGSUSED */
566 static void
567 nce_ire_delete(nce_t *nce)
568 {
569 	if (nce->nce_ipversion == IPV6_VERSION) {
570 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
571 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
572 		NCE_REFRELE_NOTR(nce);
573 	} else {
574 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
575 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
576 		NCE_REFRELE_NOTR(nce);
577 	}
578 }
579 
580 /*
581  * ire_walk routine used to delete every IRE that shares this nce
582  */
583 static void
584 nce_ire_delete1(ire_t *ire, char *nce_arg)
585 {
586 	nce_t	*nce = (nce_t *)nce_arg;
587 
588 	ASSERT(ire->ire_type == IRE_CACHE);
589 
590 	if (ire->ire_nce == nce) {
591 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
592 		ire_delete(ire);
593 	}
594 }
595 
596 /*
597  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
598  */
599 boolean_t
600 ndp_restart_dad(nce_t *nce)
601 {
602 	boolean_t started;
603 	boolean_t dropped;
604 
605 	if (nce == NULL)
606 		return (B_FALSE);
607 	mutex_enter(&nce->nce_lock);
608 	if (nce->nce_state == ND_PROBE) {
609 		mutex_exit(&nce->nce_lock);
610 		started = B_TRUE;
611 	} else if (nce->nce_state == ND_REACHABLE) {
612 		nce->nce_state = ND_PROBE;
613 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
614 		mutex_exit(&nce->nce_lock);
615 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
616 		if (dropped) {
617 			mutex_enter(&nce->nce_lock);
618 			nce->nce_pcnt++;
619 			mutex_exit(&nce->nce_lock);
620 		}
621 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
622 		started = B_TRUE;
623 	} else {
624 		mutex_exit(&nce->nce_lock);
625 		started = B_FALSE;
626 	}
627 	return (started);
628 }
629 
630 /*
631  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
632  * If one is found, the refcnt on the nce will be incremented.
633  */
634 nce_t *
635 ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
636     boolean_t caller_holds_lock)
637 {
638 	nce_t	*nce;
639 	ip_stack_t *ipst = ill->ill_ipst;
640 
641 	ASSERT(ill->ill_isv6);
642 	if (!caller_holds_lock)
643 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
644 
645 	/* Get head of v6 hash table */
646 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
647 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
648 	if (nce == NULL)
649 		nce = nce_lookup_mapping(ill, addr);
650 	if (!caller_holds_lock)
651 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
652 	return (nce);
653 }
654 /*
655  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
656  * If one is found, the refcnt on the nce will be incremented.
657  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
658  * so we skip the nce_lookup_mapping call.
659  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
660  */
661 nce_t *
662 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
663 {
664 	nce_t	*nce;
665 	in6_addr_t addr6;
666 	ip_stack_t *ipst = ill->ill_ipst;
667 
668 	if (!caller_holds_lock)
669 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
670 
671 	/* Get head of v4 hash table */
672 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
673 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
674 	/*
675 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
676 	 * looking up have fastpath headers that are inherently per-ill.
677 	 */
678 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
679 	if (!caller_holds_lock)
680 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
681 	return (nce);
682 }
683 
684 /*
685  * Cache entry lookup.  Try to find an nce matching the parameters passed.
686  * Look only for exact entries (no mappings).  If an nce is found, increment
687  * the hold count on that nce. The caller passes in the start of the
688  * appropriate hash table, and must be holding the appropriate global
689  * lock (ndp_g_lock).
690  */
691 static nce_t *
692 nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
693     nce_t *nce)
694 {
695 	ndp_g_t		*ndp;
696 	ip_stack_t	*ipst = ill->ill_ipst;
697 
698 	if (ill->ill_isv6)
699 		ndp = ipst->ips_ndp6;
700 	else
701 		ndp = ipst->ips_ndp4;
702 
703 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
704 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
705 		return (NULL);
706 	for (; nce != NULL; nce = nce->nce_next) {
707 		if (nce->nce_ill == ill ||
708 		    match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) {
709 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
710 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
711 			    &ipv6_all_ones)) {
712 				mutex_enter(&nce->nce_lock);
713 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
714 					NCE_REFHOLD_LOCKED(nce);
715 					mutex_exit(&nce->nce_lock);
716 					break;
717 				}
718 				mutex_exit(&nce->nce_lock);
719 			}
720 		}
721 	}
722 	return (nce);
723 }
724 
725 /*
726  * Cache entry lookup.  Try to find an nce matching the parameters passed.
727  * Look only for mappings.
728  */
729 static nce_t *
730 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
731 {
732 	nce_t	*nce;
733 	ip_stack_t	*ipst = ill->ill_ipst;
734 
735 	ASSERT(ill != NULL && ill->ill_isv6);
736 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
737 	if (!IN6_IS_ADDR_MULTICAST(addr))
738 		return (NULL);
739 	nce = ipst->ips_ndp6->nce_mask_entries;
740 	for (; nce != NULL; nce = nce->nce_next)
741 		if (nce->nce_ill == ill &&
742 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
743 			mutex_enter(&nce->nce_lock);
744 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
745 				NCE_REFHOLD_LOCKED(nce);
746 				mutex_exit(&nce->nce_lock);
747 				break;
748 			}
749 			mutex_exit(&nce->nce_lock);
750 		}
751 	return (nce);
752 }
753 
754 /*
755  * Process passed in parameters either from an incoming packet or via
756  * user ioctl.
757  */
758 static void
759 nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
760 {
761 	ill_t	*ill = nce->nce_ill;
762 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
763 	mblk_t	*mp;
764 	boolean_t ll_updated = B_FALSE;
765 	boolean_t ll_changed;
766 	ip_stack_t	*ipst = ill->ill_ipst;
767 
768 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
769 	/*
770 	 * No updates of link layer address or the neighbor state is
771 	 * allowed, when the cache is in NONUD state.  This still
772 	 * allows for responding to reachability solicitation.
773 	 */
774 	mutex_enter(&nce->nce_lock);
775 	if (nce->nce_state == ND_INCOMPLETE) {
776 		if (hw_addr == NULL) {
777 			mutex_exit(&nce->nce_lock);
778 			return;
779 		}
780 		nce_set_ll(nce, hw_addr);
781 		/*
782 		 * Update nce state and send the queued packets
783 		 * back to ip this time ire will be added.
784 		 */
785 		if (flag & ND_NA_FLAG_SOLICITED) {
786 			nce_update(nce, ND_REACHABLE, NULL);
787 		} else {
788 			nce_update(nce, ND_STALE, NULL);
789 		}
790 		mutex_exit(&nce->nce_lock);
791 		nce_fastpath(nce);
792 		mutex_enter(&nce->nce_lock);
793 		mp = nce->nce_qd_mp;
794 		nce->nce_qd_mp = NULL;
795 		mutex_exit(&nce->nce_lock);
796 		while (mp != NULL) {
797 			mblk_t *nxt_mp, *data_mp;
798 
799 			nxt_mp = mp->b_next;
800 			mp->b_next = NULL;
801 
802 			if (mp->b_datap->db_type == M_CTL)
803 				data_mp = mp->b_cont;
804 			else
805 				data_mp = mp;
806 			if (data_mp->b_prev != NULL) {
807 				ill_t   *inbound_ill;
808 				queue_t *fwdq = NULL;
809 				uint_t ifindex;
810 
811 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
812 				inbound_ill = ill_lookup_on_ifindex(ifindex,
813 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
814 				if (inbound_ill == NULL) {
815 					data_mp->b_prev = NULL;
816 					freemsg(mp);
817 					return;
818 				} else {
819 					fwdq = inbound_ill->ill_rq;
820 				}
821 				data_mp->b_prev = NULL;
822 				/*
823 				 * Send a forwarded packet back into ip_rput_v6
824 				 * just as in ire_send_v6().
825 				 * Extract the queue from b_prev (set in
826 				 * ip_rput_data_v6).
827 				 */
828 				if (fwdq != NULL) {
829 					/*
830 					 * Forwarded packets hop count will
831 					 * get decremented in ip_rput_data_v6
832 					 */
833 					if (data_mp != mp)
834 						freeb(mp);
835 					put(fwdq, data_mp);
836 				} else {
837 					/*
838 					 * Send locally originated packets back
839 					 * into ip_wput_v6.
840 					 */
841 					put(ill->ill_wq, mp);
842 				}
843 				ill_refrele(inbound_ill);
844 			} else {
845 				put(ill->ill_wq, mp);
846 			}
847 			mp = nxt_mp;
848 		}
849 		return;
850 	}
851 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
852 	if (!is_adv) {
853 		/* If this is a SOLICITATION request only */
854 		if (ll_changed)
855 			nce_update(nce, ND_STALE, hw_addr);
856 		mutex_exit(&nce->nce_lock);
857 		return;
858 	}
859 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
860 		/* If in any other state than REACHABLE, ignore */
861 		if (nce->nce_state == ND_REACHABLE) {
862 			nce_update(nce, ND_STALE, NULL);
863 		}
864 		mutex_exit(&nce->nce_lock);
865 		return;
866 	} else {
867 		if (ll_changed) {
868 			nce_update(nce, ND_UNCHANGED, hw_addr);
869 			ll_updated = B_TRUE;
870 		}
871 		if (flag & ND_NA_FLAG_SOLICITED) {
872 			nce_update(nce, ND_REACHABLE, NULL);
873 		} else {
874 			if (ll_updated) {
875 				nce_update(nce, ND_STALE, NULL);
876 			}
877 		}
878 		mutex_exit(&nce->nce_lock);
879 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
880 		    NCE_F_ISROUTER)) {
881 			ire_t *ire;
882 
883 			/*
884 			 * Router turned to host.  We need to remove the
885 			 * entry as well as any default route that may be
886 			 * using this as a next hop.  This is required by
887 			 * section 7.2.5 of RFC 2461.
888 			 */
889 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
890 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
891 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
892 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
893 			    MATCH_IRE_DEFAULT, ipst);
894 			if (ire != NULL) {
895 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
896 				ire_delete(ire);
897 				ire_refrele(ire);
898 			}
899 			ndp_delete(nce);
900 		}
901 	}
902 }
903 
904 /*
905  * Walker state structure used by ndp_process() / ndp_process_entry().
906  */
907 typedef struct ndp_process_data {
908 	ill_t		*np_ill; 	/* ill/illgrp to match against */
909 	const in6_addr_t *np_addr; 	/* IPv6 address to match */
910 	uchar_t		*np_hw_addr; 	/* passed to nce_process() */
911 	uint32_t	np_flag;	/* passed to nce_process() */
912 	boolean_t	np_is_adv;	/* passed to nce_process() */
913 } ndp_process_data_t;
914 
915 /*
916  * Walker callback used by ndp_process() for IPMP groups: calls nce_process()
917  * for each NCE with a matching address that's in the same IPMP group.
918  */
919 static void
920 ndp_process_entry(nce_t *nce, void *arg)
921 {
922 	ndp_process_data_t *npp = arg;
923 
924 	if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) &&
925 	    IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) &&
926 	    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
927 		nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv);
928 	}
929 }
930 
931 /*
932  * Wrapper around nce_process() that handles IPMP.  In particular, for IPMP,
933  * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have
934  * more than one NCE for a given IPv6 address to tend to.  In that case, we
935  * need to walk all NCEs and callback nce_process() for each one.  Since this
936  * is expensive, in the non-IPMP case we just directly call nce_process().
937  * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP
938  * interfaces in an IPMP group share the same NCEs -- at which point this
939  * function can be removed entirely.
940  */
941 void
942 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
943 {
944 	ill_t *ill = nce->nce_ill;
945 	struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6;
946 	ndp_process_data_t np;
947 
948 	if (ill->ill_grp == NULL) {
949 		nce_process(nce, hw_addr, flag, is_adv);
950 		return;
951 	}
952 
953 	/* IPMP case: walk all NCEs */
954 	np.np_ill = ill;
955 	np.np_addr = &nce->nce_addr;
956 	np.np_flag = flag;
957 	np.np_is_adv = is_adv;
958 	np.np_hw_addr = hw_addr;
959 
960 	ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES);
961 }
962 
963 /*
964  * Pass arg1 to the pfi supplied, along with each nce in existence.
965  * ndp_walk() places a REFHOLD on the nce and drops the lock when
966  * walking the hash list.
967  */
968 void
969 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
970     boolean_t trace)
971 {
972 	nce_t	*nce;
973 	nce_t	*nce1;
974 	nce_t	**ncep;
975 	nce_t	*free_nce_list = NULL;
976 
977 	mutex_enter(&ndp->ndp_g_lock);
978 	/* Prevent ndp_delete from unlink and free of NCE */
979 	ndp->ndp_g_walker++;
980 	mutex_exit(&ndp->ndp_g_lock);
981 	for (ncep = ndp->nce_hash_tbl;
982 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
983 		for (nce = *ncep; nce != NULL; nce = nce1) {
984 			nce1 = nce->nce_next;
985 			if (ill == NULL || nce->nce_ill == ill) {
986 				if (trace) {
987 					NCE_REFHOLD(nce);
988 					(*pfi)(nce, arg1);
989 					NCE_REFRELE(nce);
990 				} else {
991 					NCE_REFHOLD_NOTR(nce);
992 					(*pfi)(nce, arg1);
993 					NCE_REFRELE_NOTR(nce);
994 				}
995 			}
996 		}
997 	}
998 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
999 		nce1 = nce->nce_next;
1000 		if (ill == NULL || nce->nce_ill == ill) {
1001 			if (trace) {
1002 				NCE_REFHOLD(nce);
1003 				(*pfi)(nce, arg1);
1004 				NCE_REFRELE(nce);
1005 			} else {
1006 				NCE_REFHOLD_NOTR(nce);
1007 				(*pfi)(nce, arg1);
1008 				NCE_REFRELE_NOTR(nce);
1009 			}
1010 		}
1011 	}
1012 	mutex_enter(&ndp->ndp_g_lock);
1013 	ndp->ndp_g_walker--;
1014 	/*
1015 	 * While NCE's are removed from global list they are placed
1016 	 * in a private list, to be passed to nce_ire_delete_list().
1017 	 * The reason is, there may be ires pointing to this nce
1018 	 * which needs to cleaned up.
1019 	 */
1020 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
1021 		/* Time to delete condemned entries */
1022 		for (ncep = ndp->nce_hash_tbl;
1023 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1024 			nce = *ncep;
1025 			if (nce != NULL) {
1026 				nce_remove(ndp, nce, &free_nce_list);
1027 			}
1028 		}
1029 		nce = ndp->nce_mask_entries;
1030 		if (nce != NULL) {
1031 			nce_remove(ndp, nce, &free_nce_list);
1032 		}
1033 		ndp->ndp_g_walker_cleanup = B_FALSE;
1034 	}
1035 
1036 	mutex_exit(&ndp->ndp_g_lock);
1037 
1038 	if (free_nce_list != NULL) {
1039 		nce_ire_delete_list(free_nce_list);
1040 	}
1041 }
1042 
1043 /*
1044  * Walk everything.
1045  * Note that ill can be NULL hence can't derive the ipst from it.
1046  */
1047 void
1048 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1049 {
1050 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1051 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1052 }
1053 
/*
 * Process resolve requests.  Handles both mapped entries
 * as well as cases that needs to be send out on the wire.
 * Lookup a NCE for a given IRE.  Regardless of whether one exists
 * or one is created, we defer making ire point to nce until the
 * ire is actually added at which point the nce_refcnt on the nce is
 * incremented.  This is done primarily to have symmetry between ire_add()
 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
 *
 * Return values:
 *	0		an nce for `dst' exists and is not ND_INCOMPLETE, or
 *			`ill' is an XRESOLV interface and the nce was just
 *			created (no solicitation is sent in that case).
 *	EINPROGRESS	resolution is under way; `mp' (with the zoneid
 *			prepended by ip_prepend_zoneid()) has been queued on
 *			the nce.
 *	ENOMEM		ip_prepend_zoneid() returned NULL; the caller frees
 *			`mp'.
 *	EBUSY		nce_solicit() returned 0 for the newly created nce;
 *			the nce is deleted and the caller frees `mp'.
 *	other		whatever ndp_lookup_then_add_v6() returned, or, for a
 *			multicast `dst', whatever nce_set_multicast() returned.
 */
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
	nce_t		*nce, *hw_nce = NULL;
	int		err;
	ill_t		*ipmp_ill;
	uint16_t	nce_flags;
	uint32_t	ms;
	mblk_t		*mp_nce = NULL;
	ip_stack_t	*ipst = ill->ill_ipst;
	uchar_t		*hwaddr = NULL;

	ASSERT(ill->ill_isv6);

	/* Multicast destinations are mapped, never solicited. */
	if (IN6_IS_ADDR_MULTICAST(dst))
		return (nce_set_multicast(ill, dst));

	nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;

	/*
	 * If `ill' is under IPMP, then first check to see if there's an NCE
	 * for `dst' on the IPMP meta-interface (e.g., because an application
	 * explicitly did an SIOCLIFSETND to tie a hardware address to `dst').
	 * If so, we use that hardware address when creating the NCE below.
	 * Note that we don't yet have a mechanism to remove these NCEs if the
	 * NCE for `dst' on the IPMP meta-interface is subsequently removed --
	 * but rather than build such a beast, we should fix NCEs so that they
	 * can be properly shared across an IPMP group.
	 */
	if (IS_UNDER_IPMP(ill)) {
		if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
			hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE);
			if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) {
				hwaddr = hw_nce->nce_res_mp->b_rptr +
				    NCE_LL_ADDR_OFFSET(ipmp_ill);
				nce_flags |= hw_nce->nce_flags;
			}
			ill_refrele(ipmp_ill);
		}
	}

	/*
	 * With a hardware address borrowed from the IPMP ill the entry is
	 * created directly in ND_REACHABLE; otherwise it starts out
	 * ND_INCOMPLETE.
	 */
	err = ndp_lookup_then_add_v6(ill,
	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
	    hwaddr,
	    dst,
	    &ipv6_all_ones,
	    &ipv6_all_zeros,
	    0,
	    nce_flags,
	    hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE,
	    &nce);

	/* hwaddr points into hw_nce's nce_res_mp; only now can it be let go */
	if (hw_nce != NULL)
		NCE_REFRELE(hw_nce);

	switch (err) {
	case 0:
		/*
		 * New cache entry was created. Make sure that the state
		 * is not ND_INCOMPLETE. It can be in some other state
		 * even before we send out the solicitation as we could
		 * get un-solicited advertisements.
		 *
		 * If this is an XRESOLV interface, simply return 0,
		 * since we don't want to solicit just yet.
		 */
		if (ill->ill_flags & ILLF_XRESOLV) {
			NCE_REFRELE(nce);
			return (0);
		}

		mutex_enter(&nce->nce_lock);
		if (nce->nce_state != ND_INCOMPLETE) {
			mutex_exit(&nce->nce_lock);
			NCE_REFRELE(nce);
			return (0);
		}
		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
		if (mp_nce == NULL) {
			/* The caller will free mp */
			mutex_exit(&nce->nce_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (ENOMEM);
		}
		if ((ms = nce_solicit(nce, mp_nce)) == 0) {
			/* The caller will free mp */
			/* Free only the block added by ip_prepend_zoneid() */
			if (mp_nce != mp)
				freeb(mp_nce);
			mutex_exit(&nce->nce_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (EBUSY);
		}
		mutex_exit(&nce->nce_lock);
		/* `ms' is the retransmit interval returned by nce_solicit() */
		NDP_RESTART_TIMER(nce, (clock_t)ms);
		NCE_REFRELE(nce);
		return (EINPROGRESS);
	case EEXIST:
		/* Resolution in progress just queue the packet */
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state == ND_INCOMPLETE) {
			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
			if (mp_nce == NULL) {
				err = ENOMEM;
			} else {
				nce_queue_mp(nce, mp_nce);
				err = EINPROGRESS;
			}
		} else {
			/*
			 * Any other state implies we have
			 * a nce but IRE needs to be added ...
			 * ire_add_v6() will take care of
			 * the case when the nce becomes CONDEMNED
			 * before the ire is added to the table.
			 */
			err = 0;
		}
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	default:
		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
		break;
	}
	return (err);
}
1191 
1192 /*
1193  * When there is no resolver, the link layer template is passed in
1194  * the IRE.
1195  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1196  * or one is created, we defer making ire point to nce until the
1197  * ire is actually added at which point the nce_refcnt on the nce is
1198  * incremented.  This is done primarily to have symmetry between ire_add()
1199  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1200  */
1201 int
1202 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1203 {
1204 	nce_t		*nce;
1205 	int		err = 0;
1206 
1207 	ASSERT(ill != NULL);
1208 	ASSERT(ill->ill_isv6);
1209 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1210 		err = nce_set_multicast(ill, dst);
1211 		return (err);
1212 	}
1213 
1214 	err = ndp_lookup_then_add_v6(ill,
1215 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1216 	    NULL,	/* hardware address */
1217 	    dst,
1218 	    &ipv6_all_ones,
1219 	    &ipv6_all_zeros,
1220 	    0,
1221 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1222 	    ND_REACHABLE,
1223 	    &nce);
1224 
1225 	switch (err) {
1226 	case 0:
1227 		/*
1228 		 * Cache entry with a proper resolver cookie was
1229 		 * created.
1230 		 */
1231 		NCE_REFRELE(nce);
1232 		break;
1233 	case EEXIST:
1234 		err = 0;
1235 		NCE_REFRELE(nce);
1236 		break;
1237 	default:
1238 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1239 		break;
1240 	}
1241 	return (err);
1242 }
1243 
1244 /*
1245  * For each interface an entry is added for the unspecified multicast group.
1246  * Here that mapping is used to form the multicast cache entry for a particular
1247  * multicast destination.
1248  */
1249 static int
1250 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1251 {
1252 	nce_t		*mnce;	/* Multicast mapping entry */
1253 	nce_t		*nce;
1254 	uchar_t		*hw_addr = NULL;
1255 	int		err = 0;
1256 	ip_stack_t	*ipst = ill->ill_ipst;
1257 
1258 	ASSERT(ill != NULL);
1259 	ASSERT(ill->ill_isv6);
1260 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1261 
1262 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1263 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1264 	nce = nce_lookup_addr(ill, B_FALSE, dst, nce);
1265 	if (nce != NULL) {
1266 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1267 		NCE_REFRELE(nce);
1268 		return (0);
1269 	}
1270 	/* No entry, now lookup for a mapping this should never fail */
1271 	mnce = nce_lookup_mapping(ill, dst);
1272 	if (mnce == NULL) {
1273 		/* Something broken for the interface. */
1274 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1275 		return (ESRCH);
1276 	}
1277 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1278 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1279 		/*
1280 		 * For IRE_IF_RESOLVER a hardware mapping can be
1281 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1282 		 * in the ill is copied in ndp_add_v6().
1283 		 */
1284 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1285 		if (hw_addr == NULL) {
1286 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1287 			NCE_REFRELE(mnce);
1288 			return (ENOMEM);
1289 		}
1290 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1291 	}
1292 	NCE_REFRELE(mnce);
1293 	/*
1294 	 * IRE_IF_NORESOLVER type simply copies the resolution
1295 	 * cookie passed in.  So no hw_addr is needed.
1296 	 */
1297 	err = ndp_add_v6(ill,
1298 	    hw_addr,
1299 	    dst,
1300 	    &ipv6_all_ones,
1301 	    &ipv6_all_zeros,
1302 	    0,
1303 	    NCE_F_NONUD,
1304 	    ND_REACHABLE,
1305 	    &nce);
1306 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1307 	if (hw_addr != NULL)
1308 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1309 	if (err != 0) {
1310 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1311 		return (err);
1312 	}
1313 	NCE_REFRELE(nce);
1314 	return (0);
1315 }
1316 
1317 /*
1318  * Return the link layer address, and any flags of a nce.
1319  */
1320 int
1321 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1322 {
1323 	nce_t		*nce;
1324 	in6_addr_t	*addr;
1325 	sin6_t		*sin6;
1326 	dl_unitdata_req_t	*dl;
1327 
1328 	ASSERT(ill != NULL && ill->ill_isv6);
1329 	sin6 = (sin6_t *)&lnr->lnr_addr;
1330 	addr =  &sin6->sin6_addr;
1331 
1332 	/*
1333 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1334 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1335 	 * addresses for the data addresses on an IPMP interface even though
1336 	 * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill.
1337 	 */
1338 	nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE);
1339 	if (nce == NULL)
1340 		return (ESRCH);
1341 	/* If in INCOMPLETE state, no link layer address is available yet */
1342 	if (nce->nce_state == ND_INCOMPLETE)
1343 		goto done;
1344 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1345 	if (ill->ill_flags & ILLF_XRESOLV)
1346 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1347 	else
1348 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1349 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1350 	    sizeof (lnr->lnr_hdw_addr));
1351 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1352 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1353 	if (nce->nce_flags & NCE_F_ISROUTER)
1354 		lnr->lnr_flags = NDF_ISROUTER_ON;
1355 	if (nce->nce_flags & NCE_F_ANYCAST)
1356 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1357 done:
1358 	NCE_REFRELE(nce);
1359 	return (0);
1360 }
1361 
1362 /*
1363  * Send Enable/Disable multicast reqs to driver.
1364  */
1365 int
1366 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1367     uint32_t hw_addr_offset, mblk_t *mp)
1368 {
1369 	nce_t		*nce;
1370 	uchar_t		*hw_addr;
1371 	ip_stack_t	*ipst = ill->ill_ipst;
1372 
1373 	ASSERT(ill != NULL && ill->ill_isv6);
1374 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1375 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1376 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1377 		freemsg(mp);
1378 		return (EINVAL);
1379 	}
1380 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1381 	nce = nce_lookup_mapping(ill, addr);
1382 	if (nce == NULL) {
1383 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1384 		freemsg(mp);
1385 		return (ESRCH);
1386 	}
1387 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1388 	/*
1389 	 * Update dl_addr_length and dl_addr_offset for primitives that
1390 	 * have physical addresses as opposed to full saps
1391 	 */
1392 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1393 	case DL_ENABMULTI_REQ:
1394 		/* Track the state if this is the first enabmulti */
1395 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1396 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1397 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1398 		break;
1399 	case DL_DISABMULTI_REQ:
1400 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1401 		break;
1402 	default:
1403 		NCE_REFRELE(nce);
1404 		ip1dbg(("ndp_mcastreq: default\n"));
1405 		return (EINVAL);
1406 	}
1407 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1408 	NCE_REFRELE(nce);
1409 	ill_dlpi_send(ill, mp);
1410 	return (0);
1411 }
1412 
1413 /*
1414  * Send a neighbor solicitation.
1415  * Returns number of milliseconds after which we should either rexmit or abort.
1416  * Return of zero means we should abort.
1417  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1418  *
1419  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1420  * the packet.
1421  * NOTE: This routine does not consume mp.
1422  */
1423 uint32_t
1424 nce_solicit(nce_t *nce, mblk_t *mp)
1425 {
1426 	ip6_t		*ip6h;
1427 	in6_addr_t	sender;
1428 	boolean_t	dropped;
1429 
1430 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1431 
1432 	if (nce->nce_rcnt == 0)
1433 		return (0);
1434 
1435 	if (mp == NULL) {
1436 		ASSERT(nce->nce_qd_mp != NULL);
1437 		mp = nce->nce_qd_mp;
1438 	} else {
1439 		nce_queue_mp(nce, mp);
1440 	}
1441 
1442 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1443 	if (mp->b_datap->db_type == M_CTL)
1444 		mp = mp->b_cont;
1445 
1446 	ip6h = (ip6_t *)mp->b_rptr;
1447 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1448 		/*
1449 		 * This message should have been pulled up already in
1450 		 * ip_wput_v6. We can't do pullups here because the message
1451 		 * could be from the nce_qd_mp which could have b_next/b_prev
1452 		 * non-NULL.
1453 		 */
1454 		ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
1455 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1456 	}
1457 
1458 	/*
1459 	 * Need to copy the sender address into a local since `mp' can
1460 	 * go away once we drop nce_lock.
1461 	 */
1462 	sender = ip6h->ip6_src;
1463 	nce->nce_rcnt--;
1464 	mutex_exit(&nce->nce_lock);
1465 	dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0);
1466 	mutex_enter(&nce->nce_lock);
1467 	if (dropped)
1468 		nce->nce_rcnt++;
1469 	return (nce->nce_ill->ill_reachable_retrans_time);
1470 }
1471 
1472 /*
1473  * Attempt to recover an address on an interface that's been marked as a
1474  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1475  * no easy way to just probe the address and have the right thing happen if
1476  * it's no longer in use.  Instead, we just bring it up normally and allow the
1477  * regular interface start-up logic to probe for a remaining duplicate and take
1478  * us back down if necessary.
1479  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1480  * ip_ndp_excl.
1481  */
1482 /* ARGSUSED */
1483 static void
1484 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1485 {
1486 	ill_t	*ill = rq->q_ptr;
1487 	ipif_t	*ipif;
1488 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1489 
1490 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1491 		/*
1492 		 * We do not support recovery of proxy ARP'd interfaces,
1493 		 * because the system lacks a complete proxy ARP mechanism.
1494 		 */
1495 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1496 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1497 			continue;
1498 		}
1499 
1500 		/*
1501 		 * If we have already recovered or if the interface is going
1502 		 * away, then ignore.
1503 		 */
1504 		mutex_enter(&ill->ill_lock);
1505 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1506 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1507 			mutex_exit(&ill->ill_lock);
1508 			continue;
1509 		}
1510 
1511 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1512 		ill->ill_ipif_dup_count--;
1513 		mutex_exit(&ill->ill_lock);
1514 		ipif->ipif_was_dup = B_TRUE;
1515 
1516 		VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1517 		(void) ipif_up_done_v6(ipif);
1518 	}
1519 	freeb(mp);
1520 }
1521 
1522 /*
1523  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1524  * As long as someone else holds the address, the interface will stay down.
1525  * When that conflict goes away, the interface is brought back up.  This is
1526  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1527  * server will recover from a failure.
1528  *
1529  * For DHCP and temporary addresses, recovery is not done in the kernel.
1530  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1531  *
1532  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1533  */
1534 static void
1535 ipif6_dup_recovery(void *arg)
1536 {
1537 	ipif_t *ipif = arg;
1538 
1539 	ipif->ipif_recovery_id = 0;
1540 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1541 		return;
1542 
1543 	/*
1544 	 * No lock, because this is just an optimization.
1545 	 */
1546 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1547 		return;
1548 
1549 	/* If the link is down, we'll retry this later */
1550 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1551 		return;
1552 
1553 	ndp_do_recovery(ipif);
1554 }
1555 
1556 /*
1557  * Perform interface recovery by forcing the duplicate interfaces up and
1558  * allowing the system to determine which ones should stay up.
1559  *
1560  * Called both by recovery timer expiry and link-up notification.
1561  */
1562 void
1563 ndp_do_recovery(ipif_t *ipif)
1564 {
1565 	ill_t *ill = ipif->ipif_ill;
1566 	mblk_t *mp;
1567 	ip_stack_t *ipst = ill->ill_ipst;
1568 
1569 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1570 	if (mp == NULL) {
1571 		mutex_enter(&ill->ill_lock);
1572 		if (ipif->ipif_recovery_id == 0 &&
1573 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1574 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1575 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1576 		}
1577 		mutex_exit(&ill->ill_lock);
1578 	} else {
1579 		/*
1580 		 * A recovery timer may still be running if we got here from
1581 		 * ill_restart_dad(); cancel that timer.
1582 		 */
1583 		if (ipif->ipif_recovery_id != 0)
1584 			(void) untimeout(ipif->ipif_recovery_id);
1585 		ipif->ipif_recovery_id = 0;
1586 
1587 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1588 		    sizeof (ipif->ipif_v6lcl_addr));
1589 		ill_refhold(ill);
1590 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1591 		    B_FALSE);
1592 	}
1593 }
1594 
1595 /*
1596  * Find the MAC and IP addresses in an NA/NS message.
1597  */
1598 static void
1599 ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp,
1600     uchar_t **haddr, uint_t *haddrlenp)
1601 {
1602 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1603 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1604 	nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
1605 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1606 	uchar_t *addr;
1607 	int alen = 0;
1608 
1609 	if (dl_mp == NULL) {
1610 		nd_opt_hdr_t *opt = NULL;
1611 		int len;
1612 
1613 		/*
1614 		 * If it's from the fast-path, then it can't be a probe
1615 		 * message, and thus must include a linkaddr option.
1616 		 * Extract that here.
1617 		 */
1618 		switch (icmp6->icmp6_type) {
1619 		case ND_NEIGHBOR_SOLICIT:
1620 			len = mp->b_wptr - (uchar_t *)ns;
1621 			if ((len -= sizeof (*ns)) > 0) {
1622 				opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1),
1623 				    len, ND_OPT_SOURCE_LINKADDR);
1624 			}
1625 			break;
1626 		case ND_NEIGHBOR_ADVERT:
1627 			len = mp->b_wptr - (uchar_t *)na;
1628 			if ((len -= sizeof (*na)) > 0) {
1629 				opt = ndp_get_option((nd_opt_hdr_t *)(na + 1),
1630 				    len, ND_OPT_TARGET_LINKADDR);
1631 			}
1632 			break;
1633 		}
1634 
1635 		if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >=
1636 		    ill->ill_nd_lla_len) {
1637 			addr = (uchar_t *)(opt + 1);
1638 			alen = ill->ill_nd_lla_len;
1639 		}
1640 
1641 		/*
1642 		 * We cheat a bit here for the sake of printing usable log
1643 		 * messages in the rare case where the reply we got was unicast
1644 		 * without a source linkaddr option, and the interface is in
1645 		 * fastpath mode.  (Sigh.)
1646 		 */
1647 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1648 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1649 			struct ether_header *pether;
1650 
1651 			pether = (struct ether_header *)((char *)ip6h -
1652 			    sizeof (*pether));
1653 			addr = pether->ether_shost.ether_addr_octet;
1654 			alen = ETHERADDRL;
1655 		}
1656 	} else {
1657 		dl_unitdata_ind_t *dlu;
1658 
1659 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1660 		alen = dlu->dl_src_addr_length;
1661 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1662 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1663 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1664 			if (ill->ill_sap_length < 0) {
1665 				alen += ill->ill_sap_length;
1666 			} else {
1667 				addr += ill->ill_sap_length;
1668 				alen -= ill->ill_sap_length;
1669 			}
1670 		}
1671 	}
1672 
1673 	if (alen > 0) {
1674 		*haddr = addr;
1675 		*haddrlenp = alen;
1676 	} else {
1677 		*haddr = NULL;
1678 		*haddrlenp = 0;
1679 	}
1680 
1681 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1682 	*targp = ns->nd_ns_target;
1683 }
1684 
/*
 * This is for exclusive changes due to NDP duplicate address detection
 * failure.
 *
 * Scheduled through qwriter_ip() by ip_ndp_failure().  `mp' is either the
 * copied NS/NA packet itself (M_DATA) or a copied DLPI header block with
 * the packet chained on b_cont.  Unless the conflict is ignored, the ipif
 * owning the target address is marked IPIF_DUPLICATE and brought down, and
 * a recovery timer is started for it when the address is eligible for
 * in-kernel recovery.  The message is always freed before returning.
 */
/* ARGSUSED */
static void
ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	ipif_t	*ipif;
	mblk_t	*dl_mp = NULL;
	uchar_t	*haddr;
	uint_t	haddrlen;
	ip_stack_t *ipst = ill->ill_ipst;
	in6_addr_t targ;

	/* Split a leading DLPI header block from the packet proper. */
	if (DB_TYPE(mp) != M_DATA) {
		dl_mp = mp;
		mp = mp->b_cont;
	}

	ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
		/*
		 * Ignore conflicts generated by misbehaving switches that
		 * just reflect our own messages back to us.  For IPMP, we may
		 * see reflections across any ill in the illgrp.
		 */
		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
		    IS_UNDER_IPMP(ill) &&
		    ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL)
			goto ignore_conflict;
	}

	/*
	 * Look up the appropriate ipif.
	 */
	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL,
	    NULL, ipst);
	if (ipif == NULL)
		goto ignore_conflict;

	/* Reload the ill to match the ipif */
	ill = ipif->ipif_ill;

	/* If it's already duplicate or ineligible, then don't do anything. */
	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
		ipif_refrele(ipif);
		goto ignore_conflict;
	}

	/*
	 * If this is a failure during duplicate recovery, then don't
	 * complain.  It may take a long time to recover.
	 */
	if (!ipif->ipif_was_dup) {
		char ibuf[LIFNAMSIZ];
		char hbuf[MAC_STR_LEN];
		char sbuf[INET6_ADDRSTRLEN];

		ipif_get_name(ipif, ibuf, sizeof (ibuf));
		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
		    " disabled", ibuf,
		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
	}
	/* Mark the ipif as a duplicate and take it down. */
	mutex_enter(&ill->ill_lock);
	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
	ipif->ipif_flags |= IPIF_DUPLICATE;
	ill->ill_ipif_dup_count++;
	mutex_exit(&ill->ill_lock);
	(void) ipif_down(ipif, NULL, NULL);
	ipif_down_tail(ipif);
	/*
	 * Arm the recovery timer, but only for addresses that are neither
	 * DHCP-managed nor temporary, on a resolver-type ill, for an ipif
	 * that is not condemned, and only if the ip_dup_recovery interval
	 * is non-zero.
	 */
	mutex_enter(&ill->ill_lock);
	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
	    ill->ill_net_type == IRE_IF_RESOLVER &&
	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
	    ipst->ips_ip_dup_recovery > 0) {
		ASSERT(ipif->ipif_recovery_id == 0);
		ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
	}
	mutex_exit(&ill->ill_lock);
	ipif_refrele(ipif);
ignore_conflict:
	/* dl_mp (if any) is a single block; the packet may be a chain. */
	if (dl_mp != NULL)
		freeb(dl_mp);
	freemsg(mp);
}
1774 
1775 /*
1776  * Handle failure by tearing down the ipifs with the specified address.  Note
1777  * that tearing down the ipif also means deleting the nce through ipif_down, so
1778  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1779  * we start a timer on the ipif.
1780  */
1781 static void
1782 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1783 {
1784 	if ((mp = copymsg(mp)) != NULL) {
1785 		if (dl_mp == NULL)
1786 			dl_mp = mp;
1787 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1788 			dl_mp->b_cont = mp;
1789 		if (dl_mp == NULL) {
1790 			freemsg(mp);
1791 		} else {
1792 			ill_refhold(ill);
1793 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1794 			    B_FALSE);
1795 		}
1796 	}
1797 }
1798 
1799 /*
1800  * Handle a discovered conflict: some other system is advertising that it owns
1801  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1802  * interface.
1803  */
1804 static void
1805 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1806 {
1807 	ipif_t *ipif;
1808 	uint32_t now;
1809 	uint_t maxdefense;
1810 	uint_t defs;
1811 	ip_stack_t *ipst = ill->ill_ipst;
1812 
1813 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1814 	    NULL, NULL, ipst);
1815 	if (ipif == NULL)
1816 		return;
1817 
1818 	/*
1819 	 * First, figure out if this address is disposable.
1820 	 */
1821 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1822 		maxdefense = ipst->ips_ip_max_temp_defend;
1823 	else
1824 		maxdefense = ipst->ips_ip_max_defend;
1825 
1826 	/*
1827 	 * Now figure out how many times we've defended ourselves.  Ignore
1828 	 * defenses that happened long in the past.
1829 	 */
1830 	now = gethrestime_sec();
1831 	mutex_enter(&nce->nce_lock);
1832 	if ((defs = nce->nce_defense_count) > 0 &&
1833 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1834 		nce->nce_defense_count = defs = 0;
1835 	}
1836 	nce->nce_defense_count++;
1837 	nce->nce_defense_time = now;
1838 	mutex_exit(&nce->nce_lock);
1839 	ipif_refrele(ipif);
1840 
1841 	/*
1842 	 * If we've defended ourselves too many times already, then give up and
1843 	 * tear down the interface(s) using this address.  Otherwise, defend by
1844 	 * sending out an unsolicited Neighbor Advertisement.
1845 	 */
1846 	if (defs >= maxdefense) {
1847 		ip_ndp_failure(ill, mp, dl_mp);
1848 	} else {
1849 		char hbuf[MAC_STR_LEN];
1850 		char sbuf[INET6_ADDRSTRLEN];
1851 		uchar_t *haddr;
1852 		uint_t haddrlen;
1853 		in6_addr_t targ;
1854 
1855 		ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1856 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1857 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)),
1858 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1859 		    ill->ill_name);
1860 
1861 		(void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0);
1862 	}
1863 }
1864 
/*
 * Process a Neighbor Solicitation received on `ill'.
 *
 * mp		the packet, starting at the IPv6 header, with the ICMPv6
 *		solicitation immediately following it.
 * dl_mp	the DLPI header the packet arrived with, or NULL (internally
 *		looped-back probes have none).
 *
 * The solicitation is validated (target not multicast, option lengths
 * sane, a permanent nce exists for the target, source link-layer address
 * option present/absent as appropriate).  Invalid solicitations bump
 * ipv6IfIcmpInBadNeighborSolicitations and are otherwise ignored.  For a
 * valid one the sender's nce is created or updated when its hardware
 * address is known, and a Neighbor Advertisement is sent in response --
 * except while our own address is still in ND_PROBE state, where a probe
 * from someone else is treated as a DAD failure instead.
 *
 * Neither `mp' nor `dl_mp' is freed here.
 */
static void
ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
{
	nd_neighbor_solicit_t *ns;
	uint32_t	hlen = ill->ill_nd_lla_len;
	uchar_t		*haddr = NULL;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	nce_t		*our_nce = NULL;
	in6_addr_t	target;
	in6_addr_t	src;
	int		len;
	int		flag = 0;
	nd_opt_hdr_t	*opt = NULL;
	boolean_t	bad_solicit = B_FALSE;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;

	ip6h = (ip6_t *)mp->b_rptr;
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	/* Length of the ICMPv6 message, i.e. everything past the IPv6 header */
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	src = ip6h->ip6_src;
	ns = (nd_neighbor_solicit_t *)icmp_nd;
	target = ns->nd_ns_target;
	if (IN6_IS_ADDR_MULTICAST(&target)) {
		if (ip_debug > 2) {
			/* ip1dbg */
			pr_addr_dbg("ndp_input_solicit: Target is"
			    " multicast! %s\n", AF_INET6, &target);
		}
		bad_solicit = B_TRUE;
		goto done;
	}
	if (len > sizeof (nd_neighbor_solicit_t)) {
		/* Options present */
		opt = (nd_opt_hdr_t *)&ns[1];
		len -= sizeof (nd_neighbor_solicit_t);
		if (!ndp_verify_optlen(opt, len)) {
			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
			bad_solicit = B_TRUE;
			goto done;
		}

	}
	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* Check to see if this is a valid DAD solicitation */
		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("ndp_input_solicit: IPv6 "
				    "Destination is not solicited node "
				    "multicast %s\n", AF_INET6,
				    &ip6h->ip6_dst);
			}
			bad_solicit = B_TRUE;
			goto done;
		}
	}

	/*
	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
	 * received this packet if it's multicast) is not the ill tied to
	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
	 * to ensure we find the associated NCE.
	 */
	our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE);
	/*
	 * If this is a valid Solicitation, a permanent
	 * entry should exist in the cache
	 */
	if (our_nce == NULL ||
	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
		    "ifname=%s ", ill->ill_name));
		if (ip_debug > 2) {
			/* ip1dbg */
			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
		}
		bad_solicit = B_TRUE;
		goto done;
	}

	/* At this point we should have a verified NS per spec */
	if (opt != NULL) {
		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
		if (opt != NULL) {
			haddr = (uchar_t *)&opt[1];
			/* The option must be able to hold a full address */
			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
			    hlen == 0) {
				ip1dbg(("ndp_input_solicit: bad SLLA\n"));
				bad_solicit = B_TRUE;
				goto done;
			}
		}
	}

	/* If sending directly to peer, set the unicast flag */
	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
		flag |= NDP_UNICAST;

	/*
	 * Create/update the entry for the soliciting node.
	 * or respond to outstanding queries, don't if
	 * the source is unspecified address.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		int	err;
		nce_t	*nnce;

		ASSERT(ill->ill_isv6);
		/*
		 * Regular solicitations *must* include the Source Link-Layer
		 * Address option.  Ignore messages that do not.
		 */
		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
			ip1dbg(("ndp_input_solicit: source link-layer address "
			    "option missing with a specified source.\n"));
			bad_solicit = B_TRUE;
			goto done;
		}

		/*
		 * This is a regular solicitation.  If we're still in the
		 * process of verifying the address, then don't respond at all
		 * and don't keep track of the sender.
		 */
		if (our_nce->nce_state == ND_PROBE)
			goto done;

		/*
		 * If the solicitation doesn't have sender hardware address
		 * (legal for unicast solicitation), then process without
		 * installing the return NCE.  Either we already know it, or
		 * we'll be forced to look it up when (and if) we reply to the
		 * packet.
		 */
		if (haddr == NULL)
			goto no_source;

		err = ndp_lookup_then_add_v6(ill,
		    B_FALSE,
		    haddr,
		    &src,	/* Soliciting nodes address */
		    &ipv6_all_ones,
		    &ipv6_all_zeros,
		    0,
		    0,
		    ND_STALE,
		    &nnce);
		switch (err) {
		case 0:
			/* done with this entry */
			NCE_REFRELE(nnce);
			break;
		case EEXIST:
			/*
			 * B_FALSE indicates this is not an advertisement.
			 */
			ndp_process(nnce, haddr, 0, B_FALSE);
			NCE_REFRELE(nnce);
			break;
		default:
			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
			    err));
			goto done;
		}
no_source:
		flag |= NDP_SOLICITED;
	} else {
		/*
		 * No source link layer address option should be present in a
		 * valid DAD request.
		 */
		if (haddr != NULL) {
			ip1dbg(("ndp_input_solicit: source link-layer address "
			    "option present with an unspecified source.\n"));
			bad_solicit = B_TRUE;
			goto done;
		}
		if (our_nce->nce_state == ND_PROBE) {
			/*
			 * Internally looped-back probes won't have DLPI
			 * attached to them.  External ones (which are sent by
			 * multicast) always will.  Just ignore our own
			 * transmissions.
			 */
			if (dl_mp != NULL) {
				/*
				 * If someone else is probing our address, then
				 * we've crossed wires.  Declare failure.
				 */
				ip_ndp_failure(ill, mp, dl_mp);
			}
			goto done;
		}
		/*
		 * This is a DAD probe.  Multicast the advertisement to the
		 * all-nodes address.
		 */
		src = ipv6_all_hosts_mcast;
	}
	/* Response to a solicitation */
	(void) nce_xmit_advert(our_nce, B_TRUE, &src, flag);
done:
	/* Common exit: account for bad solicitations, drop our reference. */
	if (bad_solicit)
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
	if (our_nce != NULL)
		NCE_REFRELE(our_nce);
}
2073 
2074 void
2075 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2076 {
2077 	nd_neighbor_advert_t *na;
2078 	uint32_t	hlen = ill->ill_nd_lla_len;
2079 	uchar_t		*haddr = NULL;
2080 	icmp6_t		*icmp_nd;
2081 	ip6_t		*ip6h;
2082 	nce_t		*dst_nce = NULL;
2083 	in6_addr_t	target;
2084 	nd_opt_hdr_t	*opt = NULL;
2085 	int		len;
2086 	ip_stack_t	*ipst = ill->ill_ipst;
2087 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2088 
2089 	ip6h = (ip6_t *)mp->b_rptr;
2090 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2091 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2092 	na = (nd_neighbor_advert_t *)icmp_nd;
2093 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2094 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2095 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2096 		    "solicited flag is not zero\n"));
2097 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2098 		return;
2099 	}
2100 	target = na->nd_na_target;
2101 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2102 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2103 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2104 		return;
2105 	}
2106 	if (len > sizeof (nd_neighbor_advert_t)) {
2107 		opt = (nd_opt_hdr_t *)&na[1];
2108 		if (!ndp_verify_optlen(opt,
2109 		    len - sizeof (nd_neighbor_advert_t))) {
2110 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2111 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2112 			return;
2113 		}
2114 		/* At this point we have a verified NA per spec */
2115 		len -= sizeof (nd_neighbor_advert_t);
2116 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2117 		if (opt != NULL) {
2118 			haddr = (uchar_t *)&opt[1];
2119 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2120 			    hlen == 0) {
2121 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2122 				BUMP_MIB(mib,
2123 				    ipv6IfIcmpInBadNeighborAdvertisements);
2124 				return;
2125 			}
2126 		}
2127 	}
2128 
2129 	/*
2130 	 * NOTE: we match across the illgrp since we need to do DAD for all of
2131 	 * our local addresses, and those are spread across all the active
2132 	 * ills in the group.
2133 	 */
2134 	if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL)
2135 		return;
2136 
2137 	if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2138 		/*
2139 		 * Someone just advertised one of our local addresses.	First,
2140 		 * check it it was us -- if so, we can safely ignore it.
2141 		 */
2142 		if (haddr != NULL) {
2143 			if (!nce_cmp_ll_addr(dst_nce, haddr, hlen))
2144 				goto out;	/* from us -- no conflict */
2145 
2146 			/*
2147 			 * If we're in an IPMP group, check if this is an echo
2148 			 * from another ill in the group.  Use the double-
2149 			 * checked locking pattern to avoid grabbing
2150 			 * ill_g_lock in the non-IPMP case.
2151 			 */
2152 			if (IS_UNDER_IPMP(ill)) {
2153 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2154 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2155 				    ill->ill_grp, haddr, hlen) != NULL) {
2156 					rw_exit(&ipst->ips_ill_g_lock);
2157 					goto out;
2158 				}
2159 				rw_exit(&ipst->ips_ill_g_lock);
2160 			}
2161 		}
2162 
2163 		/*
2164 		 * Our own (looped-back) unsolicited neighbor advertisements
2165 		 * will get here with dl_mp == NULL.  (These will usually be
2166 		 * filtered by the `haddr' checks above, but point-to-point
2167 		 * links have no hardware address and thus make it here.)
2168 		 */
2169 		if (dl_mp == NULL && dst_nce->nce_state != ND_PROBE)
2170 			goto out;
2171 
2172 		/*
2173 		 * This appears to be a real conflict.  If we're trying to
2174 		 * configure this NCE (ND_PROBE), then shut it down.
2175 		 * Otherwise, handle the discovered conflict.
2176 		 *
2177 		 * In the ND_PROBE case, dl_mp might be NULL if we're getting
2178 		 * a unicast reply.  This isn't typically done (multicast is
2179 		 * the norm in response to a probe), but we can handle it.
2180 		 */
2181 		if (dst_nce->nce_state == ND_PROBE)
2182 			ip_ndp_failure(ill, mp, dl_mp);
2183 		else
2184 			ip_ndp_conflict(ill, mp, dl_mp, dst_nce);
2185 	} else {
2186 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2187 			dst_nce->nce_flags |= NCE_F_ISROUTER;
2188 
2189 		/* B_TRUE indicates this an advertisement */
2190 		ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE);
2191 	}
2192 out:
2193 	NCE_REFRELE(dst_nce);
2194 }
2195 
2196 /*
2197  * Process NDP neighbor solicitation/advertisement messages.
2198  * The checksum has already checked o.k before reaching here.
2199  */
2200 void
2201 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2202 {
2203 	icmp6_t		*icmp_nd;
2204 	ip6_t		*ip6h;
2205 	int		len;
2206 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2207 
2208 
2209 	if (!pullupmsg(mp, -1)) {
2210 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2211 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2212 		goto done;
2213 	}
2214 	ip6h = (ip6_t *)mp->b_rptr;
2215 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2216 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2217 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2218 		goto done;
2219 	}
2220 	/*
2221 	 * NDP does not accept any extension headers between the
2222 	 * IP header and the ICMP header since e.g. a routing
2223 	 * header could be dangerous.
2224 	 * This assumes that any AH or ESP headers are removed
2225 	 * by ip prior to passing the packet to ndp_input.
2226 	 */
2227 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2228 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2229 		    ip6h->ip6_nxt));
2230 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2231 		goto done;
2232 	}
2233 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2234 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2235 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2236 	if (icmp_nd->icmp6_code != 0) {
2237 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2238 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2239 		goto done;
2240 	}
2241 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2242 	/*
2243 	 * Make sure packet length is large enough for either
2244 	 * a NS or a NA icmp packet.
2245 	 */
2246 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2247 		ip1dbg(("ndp_input: packet too short\n"));
2248 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2249 		goto done;
2250 	}
2251 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2252 		ndp_input_solicit(ill, mp, dl_mp);
2253 	} else {
2254 		ndp_input_advert(ill, mp, dl_mp);
2255 	}
2256 done:
2257 	freemsg(mp);
2258 }
2259 
2260 /*
2261  * Utility routine to send an advertisement.  Assumes that the NCE cannot
2262  * go away (e.g., because it's refheld).
2263  */
2264 static boolean_t
2265 nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target,
2266     uint_t flags)
2267 {
2268 	ASSERT((flags & NDP_PROBE) == 0);
2269 
2270 	if (nce->nce_flags & NCE_F_ISROUTER)
2271 		flags |= NDP_ISROUTER;
2272 	if (!(nce->nce_flags & NCE_F_ANYCAST))
2273 		flags |= NDP_ORIDE;
2274 
2275 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla,
2276 	    &nce->nce_addr, target, flags));
2277 }
2278 
2279 /*
2280  * Utility routine to send a solicitation.  Assumes that the NCE cannot
2281  * go away (e.g., because it's refheld).
2282  */
2283 static boolean_t
2284 nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
2285     uint_t flags)
2286 {
2287 	if (flags & NDP_PROBE)
2288 		sender = &ipv6_all_zeros;
2289 
2290 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla,
2291 	    sender, &nce->nce_addr, flags));
2292 }
2293 
2294 /*
2295  * nce_xmit is called to form and transmit a ND solicitation or
2296  * advertisement ICMP packet.
2297  *
2298  * If the source address is unspecified and this isn't a probe (used for
2299  * duplicate address detection), an appropriate source address and link layer
2300  * address will be chosen here.  The link layer address option is included if
2301  * the source is specified (i.e., all non-probe packets), and omitted (per the
2302  * specification) otherwise.
2303  *
2304  * It returns B_FALSE only if it does a successful put() to the
2305  * corresponding ill's ill_wq otherwise returns B_TRUE.
2306  */
2307 static boolean_t
2308 nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
2309     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2310 {
2311 	ill_t		*hwaddr_ill;
2312 	uint32_t	len;
2313 	icmp6_t 	*icmp6;
2314 	mblk_t		*mp;
2315 	ip6_t		*ip6h;
2316 	nd_opt_hdr_t	*opt;
2317 	uint_t		plen, maxplen;
2318 	ip6i_t		*ip6i;
2319 	ipif_t		*src_ipif = NULL;
2320 	uint8_t		*hw_addr;
2321 	zoneid_t	zoneid = GLOBAL_ZONEID;
2322 	char		buf[INET6_ADDRSTRLEN];
2323 
2324 	ASSERT(!IS_IPMP(ill));
2325 
2326 	/*
2327 	 * Check that the sender is actually a usable address on `ill', and if
2328 	 * so, track that as the src_ipif.  If not, for solicitations, set the
2329 	 * sender to :: so that a new one will be picked below; for adverts,
2330 	 * drop the packet since we expect nce_xmit_advert() to always provide
2331 	 * a valid sender.
2332 	 */
2333 	if (!IN6_IS_ADDR_UNSPECIFIED(sender)) {
2334 		if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL ||
2335 		    !src_ipif->ipif_addr_ready) {
2336 			if (src_ipif != NULL) {
2337 				ipif_refrele(src_ipif);
2338 				src_ipif = NULL;
2339 			}
2340 			if (type == ND_NEIGHBOR_ADVERT) {
2341 				ip1dbg(("nce_xmit: No source ipif for src %s\n",
2342 				    inet_ntop(AF_INET6, sender, buf,
2343 				    sizeof (buf))));
2344 				return (B_TRUE);
2345 			}
2346 			sender = &ipv6_all_zeros;
2347 		}
2348 	}
2349 
2350 	/*
2351 	 * If we still have an unspecified source (sender) address and this
2352 	 * isn't a probe, select a source address from `ill'.
2353 	 */
2354 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2355 		ASSERT(type != ND_NEIGHBOR_ADVERT);
2356 		/*
2357 		 * Pick a source address for this solicitation, but restrict
2358 		 * the selection to addresses assigned to the output
2359 		 * interface.  We do this because the destination will create
2360 		 * a neighbor cache entry for the source address of this
2361 		 * packet, so the source address needs to be a valid neighbor.
2362 		 */
2363 		src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
2364 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2365 		if (src_ipif == NULL) {
2366 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2367 			    inet_ntop(AF_INET6, target, buf, sizeof (buf))));
2368 			return (B_TRUE);
2369 		}
2370 		sender = &src_ipif->ipif_v6src_addr;
2371 	}
2372 
2373 	/*
2374 	 * We're either sending a probe or we have a source address.
2375 	 */
2376 	ASSERT((flag & NDP_PROBE) || src_ipif != NULL);
2377 
2378 	maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8);
2379 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2380 	    maxplen;
2381 	mp = allocb(len,  BPRI_LO);
2382 	if (mp == NULL) {
2383 		if (src_ipif != NULL)
2384 			ipif_refrele(src_ipif);
2385 		return (B_TRUE);
2386 	}
2387 	bzero((char *)mp->b_rptr, len);
2388 	mp->b_wptr = mp->b_rptr + len;
2389 
2390 	ip6i = (ip6i_t *)mp->b_rptr;
2391 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2392 	ip6i->ip6i_nxt = IPPROTO_RAW;
2393 	ip6i->ip6i_flags = IP6I_HOPLIMIT;
2394 	if (flag & NDP_PROBE)
2395 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2396 
2397 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2398 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2399 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2400 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2401 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2402 	ip6h->ip6_src = *sender;
2403 	ip6h->ip6_dst = *target;
2404 	icmp6 = (icmp6_t *)&ip6h[1];
2405 
2406 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2407 	    sizeof (nd_neighbor_advert_t));
2408 
2409 	if (type == ND_NEIGHBOR_SOLICIT) {
2410 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2411 
2412 		if (!(flag & NDP_PROBE))
2413 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2414 		ns->nd_ns_target = *target;
2415 		if (!(flag & NDP_UNICAST)) {
2416 			/* Form multicast address of the target */
2417 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2418 			ip6h->ip6_dst.s6_addr32[3] |=
2419 			    ns->nd_ns_target.s6_addr32[3];
2420 		}
2421 	} else {
2422 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2423 
2424 		ASSERT(!(flag & NDP_PROBE));
2425 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2426 		na->nd_na_target = *sender;
2427 		if (flag & NDP_ISROUTER)
2428 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2429 		if (flag & NDP_SOLICITED)
2430 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2431 		if (flag & NDP_ORIDE)
2432 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2433 	}
2434 
2435 	hw_addr = NULL;
2436 	if (!(flag & NDP_PROBE)) {
2437 		/*
2438 		 * Use our source address to find the hardware address to put
2439 		 * in the packet, so that the hardware address and IP address
2440 		 * will match up -- even if that hardware address doesn't
2441 		 * match the ill we actually transmit the packet through.
2442 		 */
2443 		if (IS_IPMP(src_ipif->ipif_ill)) {
2444 			hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif);
2445 			if (hwaddr_ill == NULL) {
2446 				ip1dbg(("nce_xmit: no bound ill!\n"));
2447 				ipif_refrele(src_ipif);
2448 				freemsg(mp);
2449 				return (B_TRUE);
2450 			}
2451 		} else {
2452 			hwaddr_ill = src_ipif->ipif_ill;
2453 			ill_refhold(hwaddr_ill);	/* for symmetry */
2454 		}
2455 
2456 		plen = roundup(sizeof (nd_opt_hdr_t) +
2457 		    hwaddr_ill->ill_nd_lla_len, 8);
2458 
2459 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2460 		    hwaddr_ill->ill_phys_addr;
2461 		if (hw_addr != NULL) {
2462 			/* Fill in link layer address and option len */
2463 			opt->nd_opt_len = (uint8_t)(plen / 8);
2464 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2465 		}
2466 
2467 		ill_refrele(hwaddr_ill);
2468 	}
2469 
2470 	if (hw_addr == NULL)
2471 		plen = 0;
2472 
2473 	/* Fix up the length of the packet now that plen is known */
2474 	len -= (maxplen - plen);
2475 	mp->b_wptr = mp->b_rptr + len;
2476 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2477 
2478 	icmp6->icmp6_type = type;
2479 	icmp6->icmp6_code = 0;
2480 	/*
2481 	 * Prepare for checksum by putting icmp length in the icmp
2482 	 * checksum field. The checksum is calculated in ip_wput_v6.
2483 	 */
2484 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2485 
2486 	/*
2487 	 * Before we toss the src_ipif, look up the zoneid to pass to
2488 	 * ip_output_v6().  This is to ensure unicast ND_NEIGHBOR_ADVERT
2489 	 * packets to be routed correctly by IP (we cannot guarantee that the
2490 	 * global zone has an interface route to the destination).
2491 	 */
2492 	if (src_ipif != NULL) {
2493 		if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES)
2494 			zoneid = GLOBAL_ZONEID;
2495 		ipif_refrele(src_ipif);
2496 	}
2497 
2498 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2499 	return (B_FALSE);
2500 }
2501 
2502 /*
2503  * Make a link layer address (does not include the SAP) from an nce.
2504  * To form the link layer address, use the last four bytes of ipv6
2505  * address passed in and the fixed offset stored in nce.
2506  */
2507 static void
2508 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2509 {
2510 	uchar_t *mask, *to;
2511 	ill_t	*ill = nce->nce_ill;
2512 	int 	len;
2513 
2514 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2515 		return;
2516 	ASSERT(nce->nce_res_mp != NULL);
2517 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2518 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2519 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2520 	ASSERT(addr != NULL);
2521 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2522 	    addrpos, ill->ill_nd_lla_len);
2523 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2524 	    IPV6_ADDR_LEN);
2525 	mask = (uchar_t *)&nce->nce_extract_mask;
2526 	mask += (IPV6_ADDR_LEN - len);
2527 	addr += (IPV6_ADDR_LEN - len);
2528 	to = addrpos + nce->nce_ll_extract_start;
2529 	while (len-- > 0)
2530 		*to++ |= *mask++ & *addr++;
2531 }
2532 
2533 mblk_t *
2534 nce_udreq_alloc(ill_t *ill)
2535 {
2536 	mblk_t	*template_mp = NULL;
2537 	dl_unitdata_req_t *dlur;
2538 	int	sap_length;
2539 
2540 	ASSERT(ill->ill_isv6);
2541 
2542 	sap_length = ill->ill_sap_length;
2543 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2544 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2545 	if (template_mp == NULL)
2546 		return (NULL);
2547 
2548 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2549 	dlur->dl_priority.dl_min = 0;
2550 	dlur->dl_priority.dl_max = 0;
2551 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2552 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2553 
2554 	/* Copy in the SAP value. */
2555 	NCE_LL_SAP_COPY(ill, template_mp);
2556 
2557 	return (template_mp);
2558 }
2559 
2560 /*
2561  * NDP retransmit timer.
2562  * This timer goes off when:
2563  * a. It is time to retransmit NS for resolver.
2564  * b. It is time to send reachability probes.
2565  */
2566 void
2567 ndp_timer(void *arg)
2568 {
2569 	nce_t		*nce = arg;
2570 	ill_t		*ill = nce->nce_ill;
2571 	uint32_t	ms;
2572 	char		addrbuf[INET6_ADDRSTRLEN];
2573 	boolean_t	dropped = B_FALSE;
2574 	ip_stack_t	*ipst = ill->ill_ipst;
2575 
2576 	/*
2577 	 * The timer has to be cancelled by ndp_delete before doing the final
2578 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2579 	 * until it clears the timeout_id. Before clearing the timeout_id
2580 	 * bump up the refcnt so that we can continue to use the nce
2581 	 */
2582 	ASSERT(nce != NULL);
2583 
2584 	mutex_enter(&nce->nce_lock);
2585 	NCE_REFHOLD_LOCKED(nce);
2586 	nce->nce_timeout_id = 0;
2587 
2588 	/*
2589 	 * Check the reachability state first.
2590 	 */
2591 	switch (nce->nce_state) {
2592 	case ND_DELAY:
2593 		nce->nce_state = ND_PROBE;
2594 		mutex_exit(&nce->nce_lock);
2595 		(void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros,
2596 		    NDP_UNICAST);
2597 		if (ip_debug > 3) {
2598 			/* ip2dbg */
2599 			pr_addr_dbg("ndp_timer: state for %s changed "
2600 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2601 		}
2602 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2603 		NCE_REFRELE(nce);
2604 		return;
2605 	case ND_PROBE:
2606 		/* must be retransmit timer */
2607 		nce->nce_pcnt--;
2608 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2609 		    nce->nce_pcnt >= -1);
2610 		if (nce->nce_pcnt > 0) {
2611 			/*
2612 			 * As per RFC2461, the nce gets deleted after
2613 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2614 			 * Note that the first unicast solicitation is sent
2615 			 * during the DELAY state.
2616 			 */
2617 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2618 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2619 			    addrbuf, sizeof (addrbuf))));
2620 			mutex_exit(&nce->nce_lock);
2621 			dropped = nce_xmit_solicit(nce, B_FALSE,
2622 			    &ipv6_all_zeros,
2623 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2624 			    NDP_UNICAST);
2625 			if (dropped) {
2626 				mutex_enter(&nce->nce_lock);
2627 				nce->nce_pcnt++;
2628 				mutex_exit(&nce->nce_lock);
2629 			}
2630 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2631 		} else if (nce->nce_pcnt < 0) {
2632 			/* No hope, delete the nce */
2633 			nce->nce_state = ND_UNREACHABLE;
2634 			mutex_exit(&nce->nce_lock);
2635 			if (ip_debug > 2) {
2636 				/* ip1dbg */
2637 				pr_addr_dbg("ndp_timer: Delete IRE for"
2638 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2639 			}
2640 			ndp_delete(nce);
2641 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2642 			/* Wait RetransTimer, before deleting the entry */
2643 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2644 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2645 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2646 			mutex_exit(&nce->nce_lock);
2647 			/* Wait one interval before killing */
2648 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2649 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2650 			ipif_t *ipif;
2651 
2652 			/*
2653 			 * We're done probing, and we can now declare this
2654 			 * address to be usable.  Let IP know that it's ok to
2655 			 * use.
2656 			 */
2657 			nce->nce_state = ND_REACHABLE;
2658 			mutex_exit(&nce->nce_lock);
2659 			ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr,
2660 			    nce->nce_ill);
2661 			if (ipif != NULL) {
2662 				if (ipif->ipif_was_dup) {
2663 					char ibuf[LIFNAMSIZ + 10];
2664 					char sbuf[INET6_ADDRSTRLEN];
2665 
2666 					ipif->ipif_was_dup = B_FALSE;
2667 					(void) inet_ntop(AF_INET6,
2668 					    &ipif->ipif_v6lcl_addr,
2669 					    sbuf, sizeof (sbuf));
2670 					ipif_get_name(ipif, ibuf,
2671 					    sizeof (ibuf));
2672 					cmn_err(CE_NOTE, "recovered address "
2673 					    "%s on %s", sbuf, ibuf);
2674 				}
2675 				if ((ipif->ipif_flags & IPIF_UP) &&
2676 				    !ipif->ipif_addr_ready)
2677 					ipif_up_notify(ipif);
2678 				ipif->ipif_addr_ready = 1;
2679 				ipif_refrele(ipif);
2680 			}
2681 			/* Begin defending our new address */
2682 			nce->nce_unsolicit_count = 0;
2683 			dropped = nce_xmit_advert(nce, B_FALSE,
2684 			    &ipv6_all_hosts_mcast, 0);
2685 			if (dropped) {
2686 				nce->nce_unsolicit_count = 1;
2687 				NDP_RESTART_TIMER(nce,
2688 				    ipst->ips_ip_ndp_unsolicit_interval);
2689 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2690 				NDP_RESTART_TIMER(nce,
2691 				    ipst->ips_ip_ndp_defense_interval);
2692 			}
2693 		} else {
2694 			/*
2695 			 * This is an address we're probing to be our own, but
2696 			 * the ill is down.  Wait until it comes back before
2697 			 * doing anything, but switch to reachable state so
2698 			 * that the restart will work.
2699 			 */
2700 			nce->nce_state = ND_REACHABLE;
2701 			mutex_exit(&nce->nce_lock);
2702 		}
2703 		NCE_REFRELE(nce);
2704 		return;
2705 	case ND_INCOMPLETE: {
2706 		ip6_t	*ip6h;
2707 		ip6i_t	*ip6i;
2708 		mblk_t	*mp, *datamp, *nextmp, **prevmpp;
2709 
2710 		/*
2711 		 * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp
2712 		 * for any IPMP probe packets, and toss 'em.  IPMP probe
2713 		 * packets will always be at the head of nce_qd_mp and always
2714 		 * have an ip6i_t header, so we can stop at the first queued
2715 		 * ND packet without an ip6i_t.
2716 		 */
2717 		prevmpp = &nce->nce_qd_mp;
2718 		for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) {
2719 			nextmp = mp->b_next;
2720 			datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp;
2721 			ip6h = (ip6_t *)datamp->b_rptr;
2722 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2723 				break;
2724 
2725 			ip6i = (ip6i_t *)ip6h;
2726 			if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) {
2727 				inet_freemsg(mp);
2728 				*prevmpp = nextmp;
2729 			} else {
2730 				prevmpp = &mp->b_next;
2731 			}
2732 		}
2733 
2734 		/*
2735 		 * Must be resolver's retransmit timer.
2736 		 */
2737 		if (nce->nce_qd_mp != NULL) {
2738 			if ((ms = nce_solicit(nce, NULL)) == 0) {
2739 				if (nce->nce_state != ND_REACHABLE) {
2740 					mutex_exit(&nce->nce_lock);
2741 					nce_resolv_failed(nce);
2742 					ndp_delete(nce);
2743 				} else {
2744 					mutex_exit(&nce->nce_lock);
2745 				}
2746 			} else {
2747 				mutex_exit(&nce->nce_lock);
2748 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2749 			}
2750 			NCE_REFRELE(nce);
2751 			return;
2752 		}
2753 		mutex_exit(&nce->nce_lock);
2754 		NCE_REFRELE(nce);
2755 		break;
2756 	}
2757 	case ND_REACHABLE:
2758 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2759 		    nce->nce_unsolicit_count != 0) ||
2760 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2761 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2762 			if (nce->nce_unsolicit_count > 0)
2763 				nce->nce_unsolicit_count--;
2764 			mutex_exit(&nce->nce_lock);
2765 			dropped = nce_xmit_advert(nce, B_FALSE,
2766 			    &ipv6_all_hosts_mcast, 0);
2767 			if (dropped) {
2768 				mutex_enter(&nce->nce_lock);
2769 				nce->nce_unsolicit_count++;
2770 				mutex_exit(&nce->nce_lock);
2771 			}
2772 			if (nce->nce_unsolicit_count != 0) {
2773 				NDP_RESTART_TIMER(nce,
2774 				    ipst->ips_ip_ndp_unsolicit_interval);
2775 			} else {
2776 				NDP_RESTART_TIMER(nce,
2777 				    ipst->ips_ip_ndp_defense_interval);
2778 			}
2779 		} else {
2780 			mutex_exit(&nce->nce_lock);
2781 		}
2782 		NCE_REFRELE(nce);
2783 		break;
2784 	default:
2785 		mutex_exit(&nce->nce_lock);
2786 		NCE_REFRELE(nce);
2787 		break;
2788 	}
2789 }
2790 
2791 /*
2792  * Set a link layer address from the ll_addr passed in.
2793  * Copy SAP from ill.
2794  */
2795 static void
2796 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2797 {
2798 	ill_t	*ill = nce->nce_ill;
2799 	uchar_t	*woffset;
2800 
2801 	ASSERT(ll_addr != NULL);
2802 	/* Always called before fast_path_probe */
2803 	ASSERT(nce->nce_fp_mp == NULL);
2804 	if (ill->ill_sap_length != 0) {
2805 		/*
2806 		 * Copy the SAP type specified in the
2807 		 * request into the xmit template.
2808 		 */
2809 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2810 	}
2811 	if (ill->ill_phys_addr_length > 0) {
2812 		/*
2813 		 * The bcopy() below used to be called for the physical address
2814 		 * length rather than the link layer address length. For
2815 		 * ethernet and many other media, the phys_addr and lla are
2816 		 * identical.
2817 		 * However, with xresolv interfaces being introduced, the
2818 		 * phys_addr and lla are no longer the same, and the physical
2819 		 * address may not have any useful meaning, so we use the lla
2820 		 * for IPv6 address resolution and destination addressing.
2821 		 *
2822 		 * For PPP or other interfaces with a zero length
2823 		 * physical address, don't do anything here.
2824 		 * The bcopy() with a zero phys_addr length was previously
2825 		 * a no-op for interfaces with a zero-length physical address.
2826 		 * Using the lla for them would change the way they operate.
2827 		 * Doing nothing in such cases preserves expected behavior.
2828 		 */
2829 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2830 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2831 	}
2832 }
2833 
2834 static boolean_t
2835 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2836 {
2837 	ill_t	*ill = nce->nce_ill;
2838 	uchar_t	*ll_offset;
2839 
2840 	ASSERT(nce->nce_res_mp != NULL);
2841 	if (ll_addr == NULL)
2842 		return (B_FALSE);
2843 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2844 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2845 		return (B_TRUE);
2846 	return (B_FALSE);
2847 }
2848 
2849 /*
2850  * Updates the link layer address or the reachability state of
2851  * a cache entry.  Reset probe counter if needed.
2852  */
2853 static void
2854 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2855 {
2856 	ill_t	*ill = nce->nce_ill;
2857 	boolean_t need_stop_timer = B_FALSE;
2858 	boolean_t need_fastpath_update = B_FALSE;
2859 
2860 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2861 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2862 	/*
2863 	 * If this interface does not do NUD, there is no point
2864 	 * in allowing an update to the cache entry.  Although
2865 	 * we will respond to NS.
2866 	 * The only time we accept an update for a resolver when
2867 	 * NUD is turned off is when it has just been created.
2868 	 * Non-Resolvers will always be created as REACHABLE.
2869 	 */
2870 	if (new_state != ND_UNCHANGED) {
2871 		if ((nce->nce_flags & NCE_F_NONUD) &&
2872 		    (nce->nce_state != ND_INCOMPLETE))
2873 			return;
2874 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2875 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2876 		need_stop_timer = B_TRUE;
2877 		if (new_state == ND_REACHABLE)
2878 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2879 		else {
2880 			/* We force NUD in this case */
2881 			nce->nce_last = 0;
2882 		}
2883 		nce->nce_state = new_state;
2884 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2885 	}
2886 	/*
2887 	 * In case of fast path we need to free the the fastpath
2888 	 * M_DATA and do another probe.  Otherwise we can just
2889 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2890 	 * whatever packets that happens to be transmitting at the time.
2891 	 */
2892 	if (new_ll_addr != NULL) {
2893 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2894 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2895 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2896 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2897 		if (nce->nce_fp_mp != NULL) {
2898 			freemsg(nce->nce_fp_mp);
2899 			nce->nce_fp_mp = NULL;
2900 		}
2901 		need_fastpath_update = B_TRUE;
2902 	}
2903 	mutex_exit(&nce->nce_lock);
2904 	if (need_stop_timer) {
2905 		(void) untimeout(nce->nce_timeout_id);
2906 		nce->nce_timeout_id = 0;
2907 	}
2908 	if (need_fastpath_update)
2909 		nce_fastpath(nce);
2910 	mutex_enter(&nce->nce_lock);
2911 }
2912 
2913 void
2914 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2915 {
2916 	uint_t	count = 0;
2917 	mblk_t  **mpp, *tmp;
2918 
2919 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2920 
2921 	for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2922 		if (++count > nce->nce_ill->ill_max_buf) {
2923 			tmp = nce->nce_qd_mp->b_next;
2924 			nce->nce_qd_mp->b_next = NULL;
2925 			nce->nce_qd_mp->b_prev = NULL;
2926 			freemsg(nce->nce_qd_mp);
2927 			nce->nce_qd_mp = tmp;
2928 		}
2929 	}
2930 
2931 	if (head_insert) {
2932 		mp->b_next = nce->nce_qd_mp;
2933 		nce->nce_qd_mp = mp;
2934 	} else {
2935 		*mpp = mp;
2936 	}
2937 }
2938 
2939 static void
2940 nce_queue_mp(nce_t *nce, mblk_t *mp)
2941 {
2942 	boolean_t head_insert = B_FALSE;
2943 	ip6_t	*ip6h;
2944 	ip6i_t  *ip6i;
2945 	mblk_t	*data_mp;
2946 
2947 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2948 
2949 	if (mp->b_datap->db_type == M_CTL)
2950 		data_mp = mp->b_cont;
2951 	else
2952 		data_mp = mp;
2953 	ip6h = (ip6_t *)data_mp->b_rptr;
2954 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2955 		/*
2956 		 * This message should have been pulled up already in
2957 		 * ip_wput_v6. We can't do pullups here because the message
2958 		 * could be from the nce_qd_mp which could have b_next/b_prev
2959 		 * non-NULL.
2960 		 */
2961 		ip6i = (ip6i_t *)ip6h;
2962 		ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
2963 
2964 		/*
2965 		 * If this packet is marked IP6I_IPMP_PROBE, then we need to:
2966 		 *
2967 		 *   1. Insert it at the head of the nce_qd_mp list.  Consider
2968 		 *	the normal (non-probe) load-speading case where the
2969 		 *	source address of the ND packet is not tied to nce_ill.
2970 		 *	If the ill bound to the source address cannot receive,
2971 		 *	the response to the ND packet will not be received.
2972 		 *	However, if ND packets for nce_ill's probes are queued
2973 		 *	behind that ND packet, those probes will also fail to
2974 		 *	be sent, and thus in.mpathd will erroneously conclude
2975 		 *	that nce_ill has also failed.
2976 		 *
2977 		 *   2. Drop the probe packet in ndp_timer() if the ND did
2978 		 *	not succeed on the first attempt.  This ensures that
2979 		 *	ND problems do not manifest as probe RTT spikes.
2980 		 */
2981 		if (ip6i->ip6i_flags & IP6I_IPMP_PROBE)
2982 			head_insert = B_TRUE;
2983 	}
2984 	nce_queue_mp_common(nce, mp, head_insert);
2985 }
2986 
2987 /*
2988  * Called when address resolution failed due to a timeout.
2989  * Send an ICMP unreachable in response to all queued packets.
2990  */
2991 void
2992 nce_resolv_failed(nce_t *nce)
2993 {
2994 	mblk_t	*mp, *nxt_mp, *first_mp;
2995 	char	buf[INET6_ADDRSTRLEN];
2996 	ip6_t *ip6h;
2997 	zoneid_t zoneid = GLOBAL_ZONEID;
2998 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
2999 
3000 	ip1dbg(("nce_resolv_failed: dst %s\n",
3001 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3002 	mutex_enter(&nce->nce_lock);
3003 	mp = nce->nce_qd_mp;
3004 	nce->nce_qd_mp = NULL;
3005 	mutex_exit(&nce->nce_lock);
3006 	while (mp != NULL) {
3007 		nxt_mp = mp->b_next;
3008 		mp->b_next = NULL;
3009 		mp->b_prev = NULL;
3010 
3011 		first_mp = mp;
3012 		if (mp->b_datap->db_type == M_CTL) {
3013 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3014 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3015 			zoneid = io->ipsec_out_zoneid;
3016 			ASSERT(zoneid != ALL_ZONES);
3017 			mp = mp->b_cont;
3018 			mp->b_next = NULL;
3019 			mp->b_prev = NULL;
3020 		}
3021 
3022 		ip6h = (ip6_t *)mp->b_rptr;
3023 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3024 			ip6i_t *ip6i;
3025 			/*
3026 			 * This message should have been pulled up already
3027 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3028 			 * the header is pulled up.
3029 			 */
3030 			ip6i = (ip6i_t *)ip6h;
3031 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3032 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3033 			mp->b_rptr += sizeof (ip6i_t);
3034 		}
3035 		/*
3036 		 * Ignore failure since icmp_unreachable_v6 will silently
3037 		 * drop packets with an unspecified source address.
3038 		 */
3039 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
3040 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3041 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
3042 		mp = nxt_mp;
3043 	}
3044 }
3045 
3046 /*
3047  * Called by SIOCSNDP* ioctl to add/change an nce entry
3048  * and the corresponding attributes.
3049  * Disallow states other than ND_REACHABLE or ND_STALE.
3050  */
3051 int
3052 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3053 {
3054 	sin6_t		*sin6;
3055 	in6_addr_t	*addr;
3056 	nce_t		*nce;
3057 	int		err;
3058 	uint16_t	new_flags = 0;
3059 	uint16_t	old_flags = 0;
3060 	int		inflags = lnr->lnr_flags;
3061 	ip_stack_t	*ipst = ill->ill_ipst;
3062 
3063 	ASSERT(ill->ill_isv6);
3064 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3065 	    (lnr->lnr_state_create != ND_STALE))
3066 		return (EINVAL);
3067 
3068 	if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN)
3069 		return (EINVAL);
3070 
3071 	sin6 = (sin6_t *)&lnr->lnr_addr;
3072 	addr = &sin6->sin6_addr;
3073 
3074 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3075 	/* We know it can not be mapping so just look in the hash table */
3076 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
3077 	/* See comment in ndp_query() regarding IS_IPMP(ill) usage */
3078 	nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce);
3079 	if (nce != NULL)
3080 		new_flags = nce->nce_flags;
3081 
3082 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3083 	case NDF_ISROUTER_ON:
3084 		new_flags |= NCE_F_ISROUTER;
3085 		break;
3086 	case NDF_ISROUTER_OFF:
3087 		new_flags &= ~NCE_F_ISROUTER;
3088 		break;
3089 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3090 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3091 		if (nce != NULL)
3092 			NCE_REFRELE(nce);
3093 		return (EINVAL);
3094 	}
3095 
3096 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3097 	case NDF_ANYCAST_ON:
3098 		new_flags |= NCE_F_ANYCAST;
3099 		break;
3100 	case NDF_ANYCAST_OFF:
3101 		new_flags &= ~NCE_F_ANYCAST;
3102 		break;
3103 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3104 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3105 		if (nce != NULL)
3106 			NCE_REFRELE(nce);
3107 		return (EINVAL);
3108 	}
3109 
3110 	if (nce == NULL) {
3111 		err = ndp_add_v6(ill,
3112 		    (uchar_t *)lnr->lnr_hdw_addr,
3113 		    addr,
3114 		    &ipv6_all_ones,
3115 		    &ipv6_all_zeros,
3116 		    0,
3117 		    new_flags,
3118 		    lnr->lnr_state_create,
3119 		    &nce);
3120 		if (err != 0) {
3121 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3122 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3123 			return (err);
3124 		}
3125 	}
3126 	old_flags = nce->nce_flags;
3127 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3128 		/*
3129 		 * Router turned to host, delete all ires.
3130 		 * XXX Just delete the entry, but we need to add too.
3131 		 */
3132 		nce->nce_flags &= ~NCE_F_ISROUTER;
3133 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3134 		ndp_delete(nce);
3135 		NCE_REFRELE(nce);
3136 		return (0);
3137 	}
3138 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3139 
3140 	mutex_enter(&nce->nce_lock);
3141 	nce->nce_flags = new_flags;
3142 	mutex_exit(&nce->nce_lock);
3143 	/*
3144 	 * Note that we ignore the state at this point, which
3145 	 * should be either STALE or REACHABLE.  Instead we let
3146 	 * the link layer address passed in to determine the state
3147 	 * much like incoming packets.
3148 	 */
3149 	nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3150 	NCE_REFRELE(nce);
3151 	return (0);
3152 }
3153 
3154 /*
3155  * If the device driver supports it, we make nce_fp_mp to have
3156  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3157  * The caller ensures there is hold on nce for this function.
3158  * Note that since ill_fastpath_probe() copies the mblk there is
3159  * no need for the hold beyond this function.
3160  */
3161 void
3162 nce_fastpath(nce_t *nce)
3163 {
3164 	ill_t	*ill = nce->nce_ill;
3165 	int res;
3166 
3167 	ASSERT(ill != NULL);
3168 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3169 
3170 	if (nce->nce_fp_mp != NULL) {
3171 		/* Already contains fastpath info */
3172 		return;
3173 	}
3174 	if (nce->nce_res_mp != NULL) {
3175 		nce_fastpath_list_add(nce);
3176 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3177 		/*
3178 		 * EAGAIN is an indication of a transient error
3179 		 * i.e. allocation failure etc. leave the nce in the list it
3180 		 * will be updated when another probe happens for another ire
3181 		 * if not it will be taken out of the list when the ire is
3182 		 * deleted.
3183 		 */
3184 
3185 		if (res != 0 && res != EAGAIN)
3186 			nce_fastpath_list_delete(nce);
3187 	}
3188 }
3189 
3190 /*
3191  * Drain the list of nce's waiting for fastpath response.
3192  */
3193 void
3194 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3195     void *arg)
3196 {
3197 
3198 	nce_t *next_nce;
3199 	nce_t *current_nce;
3200 	nce_t *first_nce;
3201 	nce_t *prev_nce = NULL;
3202 
3203 	mutex_enter(&ill->ill_lock);
3204 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3205 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3206 		next_nce = current_nce->nce_fastpath;
3207 		/*
3208 		 * Take it off the list if we're flushing, or if the callback
3209 		 * routine tells us to do so.  Otherwise, leave the nce in the
3210 		 * fastpath list to handle any pending response from the lower
3211 		 * layer.  We can't drain the list when the callback routine
3212 		 * comparison failed, because the response is asynchronous in
3213 		 * nature, and may not arrive in the same order as the list
3214 		 * insertion.
3215 		 */
3216 		if (func == NULL || func(current_nce, arg)) {
3217 			current_nce->nce_fastpath = NULL;
3218 			if (current_nce == first_nce)
3219 				ill->ill_fastpath_list = first_nce = next_nce;
3220 			else
3221 				prev_nce->nce_fastpath = next_nce;
3222 		} else {
3223 			/* previous element that is still in the list */
3224 			prev_nce = current_nce;
3225 		}
3226 		current_nce = next_nce;
3227 	}
3228 	mutex_exit(&ill->ill_lock);
3229 }
3230 
3231 /*
3232  * Add nce to the nce fastpath list.
3233  */
3234 void
3235 nce_fastpath_list_add(nce_t *nce)
3236 {
3237 	ill_t *ill;
3238 
3239 	ill = nce->nce_ill;
3240 
3241 	mutex_enter(&ill->ill_lock);
3242 	mutex_enter(&nce->nce_lock);
3243 
3244 	/*
3245 	 * if nce has not been deleted and
3246 	 * is not already in the list add it.
3247 	 */
3248 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3249 	    (nce->nce_fastpath == NULL)) {
3250 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3251 		ill->ill_fastpath_list = nce;
3252 	}
3253 
3254 	mutex_exit(&nce->nce_lock);
3255 	mutex_exit(&ill->ill_lock);
3256 }
3257 
3258 /*
3259  * remove nce from the nce fastpath list.
3260  */
3261 void
3262 nce_fastpath_list_delete(nce_t *nce)
3263 {
3264 	nce_t *nce_ptr;
3265 
3266 	ill_t *ill;
3267 
3268 	ill = nce->nce_ill;
3269 	ASSERT(ill != NULL);
3270 
3271 	mutex_enter(&ill->ill_lock);
3272 	if (nce->nce_fastpath == NULL)
3273 		goto done;
3274 
3275 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3276 
3277 	if (ill->ill_fastpath_list == nce) {
3278 		ill->ill_fastpath_list = nce->nce_fastpath;
3279 	} else {
3280 		nce_ptr = ill->ill_fastpath_list;
3281 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3282 			if (nce_ptr->nce_fastpath == nce) {
3283 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3284 				break;
3285 			}
3286 			nce_ptr = nce_ptr->nce_fastpath;
3287 		}
3288 	}
3289 
3290 	nce->nce_fastpath = NULL;
3291 done:
3292 	mutex_exit(&ill->ill_lock);
3293 }
3294 
3295 /*
3296  * Update all NCE's that are not in fastpath mode and
3297  * have an nce_fp_mp that matches mp. mp->b_cont contains
3298  * the fastpath header.
3299  *
3300  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3301  */
3302 boolean_t
3303 ndp_fastpath_update(nce_t *nce, void *arg)
3304 {
3305 	mblk_t 	*mp, *fp_mp;
3306 	uchar_t	*mp_rptr, *ud_mp_rptr;
3307 	mblk_t	*ud_mp = nce->nce_res_mp;
3308 	ptrdiff_t	cmplen;
3309 
3310 	if (nce->nce_flags & NCE_F_MAPPING)
3311 		return (B_TRUE);
3312 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3313 		return (B_TRUE);
3314 
3315 	ip2dbg(("ndp_fastpath_update: trying\n"));
3316 	mp = (mblk_t *)arg;
3317 	mp_rptr = mp->b_rptr;
3318 	cmplen = mp->b_wptr - mp_rptr;
3319 	ASSERT(cmplen >= 0);
3320 	ud_mp_rptr = ud_mp->b_rptr;
3321 	/*
3322 	 * The nce is locked here to prevent any other threads
3323 	 * from accessing and changing nce_res_mp when the IPv6 address
3324 	 * becomes resolved to an lla while we're in the middle
3325 	 * of looking at and comparing the hardware address (lla).
3326 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3327 	 * from examining nce_res_mp atthe same time.
3328 	 */
3329 	mutex_enter(&nce->nce_lock);
3330 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3331 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3332 		mutex_exit(&nce->nce_lock);
3333 		/*
3334 		 * Don't take the ire off the fastpath list yet,
3335 		 * since the response may come later.
3336 		 */
3337 		return (B_FALSE);
3338 	}
3339 	/* Matched - install mp as the fastpath mp */
3340 	ip1dbg(("ndp_fastpath_update: match\n"));
3341 	fp_mp = dupb(mp->b_cont);
3342 	if (fp_mp != NULL) {
3343 		nce->nce_fp_mp = fp_mp;
3344 	}
3345 	mutex_exit(&nce->nce_lock);
3346 	return (B_TRUE);
3347 }
3348 
3349 /*
3350  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3351  * driver.  Note that it assumes IP is exclusive...
3352  */
3353 /* ARGSUSED */
3354 void
3355 ndp_fastpath_flush(nce_t *nce, char *arg)
3356 {
3357 	if (nce->nce_flags & NCE_F_MAPPING)
3358 		return;
3359 	/* No fastpath info? */
3360 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3361 		return;
3362 
3363 	if (nce->nce_ipversion == IPV4_VERSION &&
3364 	    nce->nce_flags & NCE_F_BCAST) {
3365 		/*
3366 		 * IPv4 BROADCAST entries:
3367 		 * We can't delete the nce since it is difficult to
3368 		 * recreate these without going through the
3369 		 * ipif down/up dance.
3370 		 *
3371 		 * All access to nce->nce_fp_mp in the case of these
3372 		 * is protected by nce_lock.
3373 		 */
3374 		mutex_enter(&nce->nce_lock);
3375 		if (nce->nce_fp_mp != NULL) {
3376 			freeb(nce->nce_fp_mp);
3377 			nce->nce_fp_mp = NULL;
3378 			mutex_exit(&nce->nce_lock);
3379 			nce_fastpath(nce);
3380 		} else {
3381 			mutex_exit(&nce->nce_lock);
3382 		}
3383 	} else {
3384 		/* Just delete the NCE... */
3385 		ndp_delete(nce);
3386 	}
3387 }
3388 
3389 /*
3390  * Return a pointer to a given option in the packet.
3391  * Assumes that option part of the packet have already been validated.
3392  */
3393 nd_opt_hdr_t *
3394 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3395 {
3396 	while (optlen > 0) {
3397 		if (opt->nd_opt_type == opt_type)
3398 			return (opt);
3399 		optlen -= 8 * opt->nd_opt_len;
3400 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3401 	}
3402 	return (NULL);
3403 }
3404 
3405 /*
3406  * Verify all option lengths present are > 0, also check to see
3407  * if the option lengths and packet length are consistent.
3408  */
3409 boolean_t
3410 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3411 {
3412 	ASSERT(opt != NULL);
3413 	while (optlen > 0) {
3414 		if (opt->nd_opt_len == 0)
3415 			return (B_FALSE);
3416 		optlen -= 8 * opt->nd_opt_len;
3417 		if (optlen < 0)
3418 			return (B_FALSE);
3419 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3420 	}
3421 	return (B_TRUE);
3422 }
3423 
3424 /*
3425  * ndp_walk function.
3426  * Free a fraction of the NCE cache entries.
3427  * A fraction of zero means to not free any in that category.
3428  */
3429 void
3430 ndp_cache_reclaim(nce_t *nce, char *arg)
3431 {
3432 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3433 	uint_t	rand;
3434 
3435 	if (nce->nce_flags & NCE_F_PERMANENT)
3436 		return;
3437 
3438 	rand = (uint_t)lbolt +
3439 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3440 	if (ncr->ncr_host != 0 &&
3441 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3442 		ndp_delete(nce);
3443 		return;
3444 	}
3445 }
3446 
3447 /*
3448  * ndp_walk function.
3449  * Count the number of NCEs that can be deleted.
3450  * These would be hosts but not routers.
3451  */
3452 void
3453 ndp_cache_count(nce_t *nce, char *arg)
3454 {
3455 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3456 
3457 	if (nce->nce_flags & NCE_F_PERMANENT)
3458 		return;
3459 
3460 	ncc->ncc_total++;
3461 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3462 		ncc->ncc_host++;
3463 }
3464 
3465 #ifdef DEBUG
3466 void
3467 nce_trace_ref(nce_t *nce)
3468 {
3469 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3470 
3471 	if (nce->nce_trace_disable)
3472 		return;
3473 
3474 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3475 		nce->nce_trace_disable = B_TRUE;
3476 		nce_trace_cleanup(nce);
3477 	}
3478 }
3479 
3480 void
3481 nce_untrace_ref(nce_t *nce)
3482 {
3483 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3484 
3485 	if (!nce->nce_trace_disable)
3486 		th_trace_unref(nce);
3487 }
3488 
3489 static void
3490 nce_trace_cleanup(const nce_t *nce)
3491 {
3492 	th_trace_cleanup(nce, nce->nce_trace_disable);
3493 }
3494 #endif
3495 
3496 /*
3497  * Called when address resolution fails due to a timeout.
3498  * Send an ICMP unreachable in response to all queued packets.
3499  */
3500 void
3501 arp_resolv_failed(nce_t *nce)
3502 {
3503 	mblk_t	*mp, *nxt_mp, *first_mp;
3504 	char	buf[INET6_ADDRSTRLEN];
3505 	zoneid_t zoneid = GLOBAL_ZONEID;
3506 	struct in_addr ipv4addr;
3507 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3508 
3509 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3510 	ip3dbg(("arp_resolv_failed: dst %s\n",
3511 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3512 	mutex_enter(&nce->nce_lock);
3513 	mp = nce->nce_qd_mp;
3514 	nce->nce_qd_mp = NULL;
3515 	mutex_exit(&nce->nce_lock);
3516 
3517 	while (mp != NULL) {
3518 		nxt_mp = mp->b_next;
3519 		mp->b_next = NULL;
3520 		mp->b_prev = NULL;
3521 
3522 		first_mp = mp;
3523 		/*
3524 		 * Send icmp unreachable messages
3525 		 * to the hosts.
3526 		 */
3527 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3528 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3529 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3530 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3531 		mp = nxt_mp;
3532 	}
3533 }
3534 
3535 int
3536 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3537     nce_t **newnce, nce_t *src_nce)
3538 {
3539 	int	err;
3540 	nce_t	*nce;
3541 	in6_addr_t addr6;
3542 	ip_stack_t *ipst = ill->ill_ipst;
3543 
3544 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3545 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3546 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3547 	/*
3548 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
3549 	 * looking up have fastpath headers that are inherently per-ill.
3550 	 */
3551 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
3552 	if (nce == NULL) {
3553 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3554 	} else {
3555 		*newnce = nce;
3556 		err = EEXIST;
3557 	}
3558 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3559 	return (err);
3560 }
3561 
3562 /*
3563  * NDP Cache Entry creation routine for IPv4.
3564  * Mapped entries are handled in arp.
3565  * This routine must always be called with ndp4->ndp_g_lock held.
3566  * Prior to return, nce_refcnt is incremented.
3567  */
3568 static int
3569 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3570     nce_t **newnce, nce_t *src_nce)
3571 {
3572 	static	nce_t		nce_nil;
3573 	nce_t		*nce;
3574 	mblk_t		*mp;
3575 	mblk_t		*template = NULL;
3576 	nce_t		**ncep;
3577 	ip_stack_t	*ipst = ill->ill_ipst;
3578 	uint16_t	state = ND_INITIAL;
3579 	int		err;
3580 
3581 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3582 	ASSERT(!ill->ill_isv6);
3583 	ASSERT((flags & NCE_F_MAPPING) == 0);
3584 
3585 	if (ill->ill_resolver_mp == NULL)
3586 		return (EINVAL);
3587 	/*
3588 	 * Allocate the mblk to hold the nce.
3589 	 */
3590 	mp = allocb(sizeof (nce_t), BPRI_MED);
3591 	if (mp == NULL)
3592 		return (ENOMEM);
3593 
3594 	nce = (nce_t *)mp->b_rptr;
3595 	mp->b_wptr = (uchar_t *)&nce[1];
3596 	*nce = nce_nil;
3597 	nce->nce_ill = ill;
3598 	nce->nce_ipversion = IPV4_VERSION;
3599 	nce->nce_flags = flags;
3600 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3601 	nce->nce_rcnt = ill->ill_xmit_count;
3602 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3603 	nce->nce_mask = ipv6_all_ones;
3604 	nce->nce_extract_mask = ipv6_all_zeros;
3605 	nce->nce_ll_extract_start = 0;
3606 	nce->nce_qd_mp = NULL;
3607 	nce->nce_mp = mp;
3608 	/* This one is for nce getting created */
3609 	nce->nce_refcnt = 1;
3610 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3611 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3612 
3613 	nce->nce_trace_disable = B_FALSE;
3614 
3615 	if (src_nce != NULL) {
3616 		/*
3617 		 * src_nce has been provided by the caller. The only
3618 		 * caller who provides a non-null, non-broadcast
3619 		 * src_nce is from ip_newroute() which must pass in
3620 		 * a ND_REACHABLE src_nce (this condition is verified
3621 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3622 		 */
3623 		mutex_enter(&src_nce->nce_lock);
3624 		state = src_nce->nce_state;
3625 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3626 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3627 			/*
3628 			 * src_nce has been deleted, or
3629 			 * ip_arp_news is in the middle of
3630 			 * flushing entries in the the nce.
3631 			 * Fail the add, since we don't know
3632 			 * if it is safe to copy the contents of
3633 			 * src_nce
3634 			 */
3635 			DTRACE_PROBE2(nce__bad__src__nce,
3636 			    nce_t *, src_nce, ill_t *, ill);
3637 			mutex_exit(&src_nce->nce_lock);
3638 			err = EINVAL;
3639 			goto err_ret;
3640 		}
3641 		template = copyb(src_nce->nce_res_mp);
3642 		mutex_exit(&src_nce->nce_lock);
3643 		if (template == NULL) {
3644 			err = ENOMEM;
3645 			goto err_ret;
3646 		}
3647 	} else if (flags & NCE_F_BCAST) {
3648 		/*
3649 		 * broadcast nce.
3650 		 */
3651 		template = copyb(ill->ill_bcast_mp);
3652 		if (template == NULL) {
3653 			err = ENOMEM;
3654 			goto err_ret;
3655 		}
3656 		state = ND_REACHABLE;
3657 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3658 		/*
3659 		 * NORESOLVER entries are always created in the REACHABLE
3660 		 * state. We create a nce_res_mp with the IP nexthop address
3661 		 * in the destination address in the DLPI hdr if the
3662 		 * physical length is exactly 4 bytes.
3663 		 *
3664 		 * XXX not clear which drivers set ill_phys_addr_length to
3665 		 * IP_ADDR_LEN.
3666 		 */
3667 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3668 			template = ill_dlur_gen((uchar_t *)addr,
3669 			    ill->ill_phys_addr_length,
3670 			    ill->ill_sap, ill->ill_sap_length);
3671 		} else {
3672 			template = copyb(ill->ill_resolver_mp);
3673 		}
3674 		if (template == NULL) {
3675 			err = ENOMEM;
3676 			goto err_ret;
3677 		}
3678 		state = ND_REACHABLE;
3679 	}
3680 	nce->nce_fp_mp = NULL;
3681 	nce->nce_res_mp = template;
3682 	nce->nce_state = state;
3683 	if (state == ND_REACHABLE) {
3684 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3685 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3686 	} else {
3687 		nce->nce_last = 0;
3688 		if (state == ND_INITIAL)
3689 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3690 	}
3691 
3692 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3693 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3694 	/*
3695 	 * Atomically ensure that the ill is not CONDEMNED, before
3696 	 * adding the NCE.
3697 	 */
3698 	mutex_enter(&ill->ill_lock);
3699 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3700 		mutex_exit(&ill->ill_lock);
3701 		err = EINVAL;
3702 		goto err_ret;
3703 	}
3704 	if ((nce->nce_next = *ncep) != NULL)
3705 		nce->nce_next->nce_ptpn = &nce->nce_next;
3706 	*ncep = nce;
3707 	nce->nce_ptpn = ncep;
3708 	*newnce = nce;
3709 	/* This one is for nce being used by an active thread */
3710 	NCE_REFHOLD(*newnce);
3711 
3712 	/* Bump up the number of nce's referencing this ill */
3713 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
3714 	    (char *), "nce", (void *), nce);
3715 	ill->ill_nce_cnt++;
3716 	mutex_exit(&ill->ill_lock);
3717 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3718 	return (0);
3719 err_ret:
3720 	freeb(mp);
3721 	freemsg(template);
3722 	return (err);
3723 }
3724 
3725 /*
3726  * ndp_walk routine to delete all entries that have a given destination or
3727  * gateway address and cached link layer (MAC) address.  This is used when ARP
3728  * informs us that a network-to-link-layer mapping may have changed.
3729  */
3730 void
3731 nce_delete_hw_changed(nce_t *nce, void *arg)
3732 {
3733 	nce_hw_map_t *hwm = arg;
3734 	mblk_t *mp;
3735 	dl_unitdata_req_t *dlu;
3736 	uchar_t *macaddr;
3737 	ill_t *ill;
3738 	int saplen;
3739 	ipaddr_t nce_addr;
3740 
3741 	if (nce->nce_state != ND_REACHABLE)
3742 		return;
3743 
3744 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3745 	if (nce_addr != hwm->hwm_addr)
3746 		return;
3747 
3748 	mutex_enter(&nce->nce_lock);
3749 	if ((mp = nce->nce_res_mp) == NULL) {
3750 		mutex_exit(&nce->nce_lock);
3751 		return;
3752 	}
3753 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3754 	macaddr = (uchar_t *)(dlu + 1);
3755 	ill = nce->nce_ill;
3756 	if ((saplen = ill->ill_sap_length) > 0)
3757 		macaddr += saplen;
3758 	else
3759 		saplen = -saplen;
3760 
3761 	/*
3762 	 * If the hardware address is unchanged, then leave this one alone.
3763 	 * Note that saplen == abs(saplen) now.
3764 	 */
3765 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3766 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3767 		mutex_exit(&nce->nce_lock);
3768 		return;
3769 	}
3770 	mutex_exit(&nce->nce_lock);
3771 
3772 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3773 	ndp_delete(nce);
3774 }
3775 
3776 /*
3777  * This function verifies whether a given IPv4 address is potentially known to
3778  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3779  * so that it can continue to look for hardware changes on that address.
3780  */
3781 boolean_t
3782 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3783 {
3784 	nce_t		*nce;
3785 	struct in_addr	nceaddr;
3786 	ip_stack_t	*ipst = ns->netstack_ip;
3787 
3788 	if (addr == INADDR_ANY)
3789 		return (B_FALSE);
3790 
3791 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3792 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3793 	for (; nce != NULL; nce = nce->nce_next) {
3794 		/* Note that only v4 mapped entries are in the table. */
3795 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3796 		if (addr == nceaddr.s_addr &&
3797 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3798 			/* Single flag check; no lock needed */
3799 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3800 				break;
3801 		}
3802 	}
3803 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3804 	return (nce != NULL);
3805 }
3806 
3807 /*
3808  * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly
3809  * with IPMP.  Specifically, since neighbor discovery is always done on
3810  * underlying interfaces (even for addresses owned by an IPMP interface), we
3811  * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface
3812  * associated with `ill' (if it exists).
3813  */
3814 static ipif_t *
3815 ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill)
3816 {
3817 	ipif_t *ipif;
3818 	ip_stack_t *ipst = ill->ill_ipst;
3819 
3820 	ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3821 	if (ipif == NULL && IS_UNDER_IPMP(ill)) {
3822 		if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
3823 			ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3824 			ill_refrele(ill);
3825 		}
3826 	}
3827 	return (ipif);
3828 }
3829