xref: /titanic_50/usr/src/uts/common/inet/ip/ip_ndp.c (revision c77a61a72b5ecdc507d6cf104142edd371a16c84)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_rts.h>
63 #include <inet/ip6.h>
64 #include <inet/ip_ndp.h>
65 #include <inet/ipsec_impl.h>
66 #include <inet/ipsec_info.h>
67 #include <inet/sctp_ip.h>
68 
69 /*
70  * Function names with nce_ prefix are static while function
71  * names with ndp_ prefix are used by rest of the IP.
72  *
73  * Lock ordering:
74  *
75  *	ndp_g_lock -> ill_lock -> nce_lock
76  *
77  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
78  * nce_next.  Nce_lock protects the contents of the NCE (particularly
79  * nce_refcnt).
80  */
81 
82 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
83     uint32_t ll_addr_len);
84 static	void	nce_fastpath(nce_t *nce);
85 static	void	nce_ire_delete(nce_t *nce);
86 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
87 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
88 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
89 static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
90 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
91     uchar_t *addr);
92 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
93 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
94 static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
95 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
96 static	void	nce_update(nce_t *nce, uint16_t new_state,
97     uchar_t *new_ll_addr);
98 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
99 static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
100     ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
101     const in6_addr_t *target, int flag);
102 extern void	th_trace_rrecord(th_trace_t *);
103 static	int	ndp_lookup_then_add_v6(ill_t *, uchar_t *,
104     const in6_addr_t *, const in6_addr_t *, const in6_addr_t *,
105     uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
106 static	int	ndp_lookup_then_add_v4(ill_t *, uchar_t *,
107     const in_addr_t *, const in_addr_t *, const in_addr_t *,
108     uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
109 static	int	ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
110     const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
111     nce_t **);
112 static	int	ndp_add_v4(ill_t *, uchar_t *, const in_addr_t *,
113     const in_addr_t *, const in_addr_t *, uint32_t, uint16_t, uint16_t,
114     nce_t **, mblk_t *, mblk_t *);
115 
116 
117 #ifdef NCE_DEBUG
118 void	nce_trace_inactive(nce_t *);
119 #endif
120 
121 ndp_g_t ndp4, ndp6;
122 
123 #define	NCE_HASH_PTR_V4(addr) \
124 	(&(ndp4.nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
125 
126 #define	NCE_HASH_PTR_V6(addr) \
127 	(&(ndp6.nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
128 
129 /*
130  * Compute default flags to use for an advertisement of this nce's address.
131  */
132 static int
133 nce_advert_flags(const nce_t *nce)
134 {
135 	int flag = 0;
136 
137 	if (nce->nce_flags & NCE_F_ISROUTER)
138 		flag |= NDP_ISROUTER;
139 	if (!(nce->nce_flags & NCE_F_PROXY))
140 		flag |= NDP_ORIDE;
141 	return (flag);
142 }
143 
144 int
145 ndp_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
146     const void *mask, const void *extract_mask,
147     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
148     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
149 {
150 	int status;
151 
152 	if (ill->ill_isv6)
153 		status = ndp_add_v6(ill, hw_addr, (in6_addr_t *)addr,
154 		    (in6_addr_t *)mask, (in6_addr_t *)extract_mask,
155 		    hw_extract_start, flags, state, newnce);
156 	else
157 		status = ndp_add_v4(ill, hw_addr, (in_addr_t *)addr,
158 		    (in_addr_t *)mask, (in_addr_t *)extract_mask,
159 		    hw_extract_start, flags, state, newnce, fp_mp, res_mp);
160 	return (status);
161 }
162 
163 /* Non-tunable probe interval, based on link capabilities */
164 #define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
165 
166 /*
167  * NDP Cache Entry creation routine.
168  * Mapped entries will never do NUD .
169  * This routine must always be called with ndp6.ndp_g_lock held.
170  * Prior to return, nce_refcnt is incremented.
171  */
172 static int
173 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
174     const in6_addr_t *mask, const in6_addr_t *extract_mask,
175     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
176     nce_t **newnce)
177 {
178 	static	nce_t		nce_nil;
179 	nce_t		*nce;
180 	mblk_t		*mp;
181 	mblk_t		*template;
182 	nce_t		**ncep;
183 	int		err;
184 	boolean_t	dropped = B_FALSE;
185 
186 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
187 	ASSERT(ill != NULL && ill->ill_isv6);
188 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
189 		ip0dbg(("ndp_add: no addr\n"));
190 		return (EINVAL);
191 	}
192 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
193 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
194 		return (EINVAL);
195 	}
196 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
197 	    (flags & NCE_F_MAPPING)) {
198 		ip0dbg(("ndp_add: extract mask zero for mapping"));
199 		return (EINVAL);
200 	}
201 	/*
202 	 * Allocate the mblk to hold the nce.
203 	 *
204 	 * XXX This can come out of a separate cache - nce_cache.
205 	 * We don't need the mp anymore as there are no more
206 	 * "qwriter"s
207 	 */
208 	mp = allocb(sizeof (nce_t), BPRI_MED);
209 	if (mp == NULL)
210 		return (ENOMEM);
211 
212 	nce = (nce_t *)mp->b_rptr;
213 	mp->b_wptr = (uchar_t *)&nce[1];
214 	*nce = nce_nil;
215 
216 	/*
217 	 * This one holds link layer address
218 	 */
219 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
220 		template = nce_udreq_alloc(ill);
221 	} else {
222 		if (ill->ill_resolver_mp == NULL) {
223 			freeb(mp);
224 			return (EINVAL);
225 		}
226 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
227 		template = copyb(ill->ill_resolver_mp);
228 	}
229 	if (template == NULL) {
230 		freeb(mp);
231 		return (ENOMEM);
232 	}
233 	nce->nce_ill = ill;
234 	nce->nce_ipversion = IPV6_VERSION;
235 	nce->nce_flags = flags;
236 	nce->nce_state = state;
237 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
238 	nce->nce_rcnt = ill->ill_xmit_count;
239 	nce->nce_addr = *addr;
240 	nce->nce_mask = *mask;
241 	nce->nce_extract_mask = *extract_mask;
242 	nce->nce_ll_extract_start = hw_extract_start;
243 	nce->nce_fp_mp = NULL;
244 	nce->nce_res_mp = template;
245 	if (state == ND_REACHABLE)
246 		nce->nce_last = TICK_TO_MSEC(lbolt64);
247 	else
248 		nce->nce_last = 0;
249 	nce->nce_qd_mp = NULL;
250 	nce->nce_mp = mp;
251 	if (hw_addr != NULL)
252 		nce_set_ll(nce, hw_addr);
253 	/* This one is for nce getting created */
254 	nce->nce_refcnt = 1;
255 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
256 	if (nce->nce_flags & NCE_F_MAPPING) {
257 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
258 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
259 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
260 		ncep = &ndp6.nce_mask_entries;
261 	} else {
262 		ncep = ((nce_t **)NCE_HASH_PTR_V6(*addr));
263 	}
264 
265 #ifdef NCE_DEBUG
266 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
267 #endif
268 	/*
269 	 * Atomically ensure that the ill is not CONDEMNED, before
270 	 * adding the NCE.
271 	 */
272 	mutex_enter(&ill->ill_lock);
273 	if (ill->ill_state_flags & ILL_CONDEMNED) {
274 		mutex_exit(&ill->ill_lock);
275 		freeb(mp);
276 		freeb(template);
277 		return (EINVAL);
278 	}
279 	if ((nce->nce_next = *ncep) != NULL)
280 		nce->nce_next->nce_ptpn = &nce->nce_next;
281 	*ncep = nce;
282 	nce->nce_ptpn = ncep;
283 	*newnce = nce;
284 	/* This one is for nce being used by an active thread */
285 	NCE_REFHOLD(*newnce);
286 
287 	/* Bump up the number of nce's referencing this ill */
288 	ill->ill_nce_cnt++;
289 	mutex_exit(&ill->ill_lock);
290 
291 	err = 0;
292 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
293 		mutex_enter(&nce->nce_lock);
294 		mutex_exit(&ndp6.ndp_g_lock);
295 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
296 		mutex_exit(&nce->nce_lock);
297 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
298 		    &ipv6_all_zeros, addr, NDP_PROBE);
299 		if (dropped) {
300 			mutex_enter(&nce->nce_lock);
301 			nce->nce_pcnt++;
302 			mutex_exit(&nce->nce_lock);
303 		}
304 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
305 		mutex_enter(&ndp6.ndp_g_lock);
306 		err = EINPROGRESS;
307 	} else if (flags & NCE_F_UNSOL_ADV) {
308 		/*
309 		 * We account for the transmit below by assigning one
310 		 * less than the ndd variable. Subsequent decrements
311 		 * are done in ndp_timer.
312 		 */
313 		mutex_enter(&nce->nce_lock);
314 		mutex_exit(&ndp6.ndp_g_lock);
315 		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
316 		mutex_exit(&nce->nce_lock);
317 		dropped = nce_xmit(ill,
318 		    ND_NEIGHBOR_ADVERT,
319 		    ill,	/* ill to be used for extracting ill_nd_lla */
320 		    B_TRUE,	/* use ill_nd_lla */
321 		    addr,	/* Source and target of the advertisement pkt */
322 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
323 		    nce_advert_flags(nce));
324 		mutex_enter(&nce->nce_lock);
325 		if (dropped)
326 			nce->nce_unsolicit_count++;
327 		if (nce->nce_unsolicit_count != 0) {
328 			nce->nce_timeout_id = timeout(ndp_timer, nce,
329 			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
330 		}
331 		mutex_exit(&nce->nce_lock);
332 		mutex_enter(&ndp6.ndp_g_lock);
333 	}
334 	/*
335 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
336 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
337 	 * We call nce_fastpath from nce_update if the link layer address of
338 	 * the peer changes from nce_update
339 	 */
340 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
341 		nce_fastpath(nce);
342 	return (err);
343 }
344 
345 int
346 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
347     const void *mask, const void *extract_mask,
348     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
349     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
350 {
351 	int status;
352 
353 	if (ill->ill_isv6) {
354 		status = ndp_lookup_then_add_v6(ill, hw_addr,
355 		    (in6_addr_t *)addr, (in6_addr_t *)mask,
356 		    (in6_addr_t *)extract_mask, hw_extract_start, flags,
357 		    state, newnce, fp_mp, res_mp);
358 	} else  {
359 		status = ndp_lookup_then_add_v4(ill, hw_addr,
360 		    (in_addr_t *)addr, (in_addr_t *)mask,
361 		    (in_addr_t *)extract_mask, hw_extract_start, flags,
362 		    state, newnce, fp_mp, res_mp);
363 	}
364 
365 	return (status);
366 }
367 
368 static int
369 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
370     const in6_addr_t *mask, const in6_addr_t *extract_mask,
371     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
372     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
373 {
374 	int	err = 0;
375 	nce_t	*nce;
376 
377 	ASSERT(ill != NULL && ill->ill_isv6);
378 	mutex_enter(&ndp6.ndp_g_lock);
379 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
380 	nce = nce_lookup_addr(ill, addr, nce);
381 	if (nce == NULL) {
382 		err = ndp_add(ill,
383 		    hw_addr,
384 		    addr,
385 		    mask,
386 		    extract_mask,
387 		    hw_extract_start,
388 		    flags,
389 		    state,
390 		    newnce,
391 		    fp_mp,
392 		    res_mp);
393 	} else {
394 		*newnce = nce;
395 		err = EEXIST;
396 	}
397 	mutex_exit(&ndp6.ndp_g_lock);
398 	return (err);
399 }
400 
401 /*
402  * Remove all the CONDEMNED nces from the appropriate hash table.
403  * We create a private list of NCEs, these may have ires pointing
404  * to them, so the list will be passed through to clean up dependent
405  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
406  */
407 static void
408 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
409 {
410 	nce_t *nce1;
411 	nce_t **ptpn;
412 
413 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
414 	ASSERT(ndp->ndp_g_walker == 0);
415 	for (; nce; nce = nce1) {
416 		nce1 = nce->nce_next;
417 		mutex_enter(&nce->nce_lock);
418 		if (nce->nce_flags & NCE_F_CONDEMNED) {
419 			ptpn = nce->nce_ptpn;
420 			nce1 = nce->nce_next;
421 			if (nce1 != NULL)
422 				nce1->nce_ptpn = ptpn;
423 			*ptpn = nce1;
424 			nce->nce_ptpn = NULL;
425 			nce->nce_next = NULL;
426 			nce->nce_next = *free_nce_list;
427 			*free_nce_list = nce;
428 		}
429 		mutex_exit(&nce->nce_lock);
430 	}
431 }
432 
433 /*
434  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
435  *    will return this NCE. Also no new IREs will be created that
436  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
437  *    be started (See NDP_RESTART_TIMER).
438  * 2. Cancel any currently running timeouts.
439  * 3. If there is an ndp walker, return. The walker will do the cleanup.
440  *    This ensures that walkers see a consistent list of NCEs while walking.
441  * 4. Otherwise remove the NCE from the list of NCEs
442  * 5. Delete all IREs pointing to this NCE.
443  */
444 void
445 ndp_delete(nce_t *nce)
446 {
447 	nce_t	**ptpn;
448 	nce_t	*nce1;
449 	int	ipversion = nce->nce_ipversion;
450 	ndp_g_t *ndp = (ipversion == IPV4_VERSION ? &ndp4 : &ndp6);
451 
452 	/* Serialize deletes */
453 	mutex_enter(&nce->nce_lock);
454 	if (nce->nce_flags & NCE_F_CONDEMNED) {
455 		/* Some other thread is doing the delete */
456 		mutex_exit(&nce->nce_lock);
457 		return;
458 	}
459 	/*
460 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
461 	 * refcnt has to be >= 2
462 	 */
463 	ASSERT(nce->nce_refcnt >= 2);
464 	nce->nce_flags |= NCE_F_CONDEMNED;
465 	mutex_exit(&nce->nce_lock);
466 
467 	nce_fastpath_list_delete(nce);
468 
469 	/*
470 	 * Cancel any running timer. Timeout can't be restarted
471 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
472 	 * Passing invalid timeout id is fine.
473 	 */
474 	if (nce->nce_timeout_id != 0) {
475 		(void) untimeout(nce->nce_timeout_id);
476 		nce->nce_timeout_id = 0;
477 	}
478 
479 	mutex_enter(&ndp->ndp_g_lock);
480 	if (nce->nce_ptpn == NULL) {
481 		/*
482 		 * The last ndp walker has already removed this nce from
483 		 * the list after we marked the nce CONDEMNED and before
484 		 * we grabbed the global lock.
485 		 */
486 		mutex_exit(&ndp->ndp_g_lock);
487 		return;
488 	}
489 	if (ndp->ndp_g_walker > 0) {
490 		/*
491 		 * Can't unlink. The walker will clean up
492 		 */
493 		ndp->ndp_g_walker_cleanup = B_TRUE;
494 		mutex_exit(&ndp->ndp_g_lock);
495 		return;
496 	}
497 
498 	/*
499 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
500 	 * the timer since it is marked CONDEMNED.
501 	 */
502 	ptpn = nce->nce_ptpn;
503 	nce1 = nce->nce_next;
504 	if (nce1 != NULL)
505 		nce1->nce_ptpn = ptpn;
506 	*ptpn = nce1;
507 	nce->nce_ptpn = NULL;
508 	nce->nce_next = NULL;
509 	mutex_exit(&ndp->ndp_g_lock);
510 
511 	nce_ire_delete(nce);
512 }
513 
514 void
515 ndp_inactive(nce_t *nce)
516 {
517 	mblk_t		**mpp;
518 	ill_t		*ill;
519 
520 	ASSERT(nce->nce_refcnt == 0);
521 	ASSERT(MUTEX_HELD(&nce->nce_lock));
522 	ASSERT(nce->nce_fastpath == NULL);
523 
524 	/* Free all nce allocated messages */
525 	mpp = &nce->nce_first_mp_to_free;
526 	do {
527 		while (*mpp != NULL) {
528 			mblk_t  *mp;
529 
530 			mp = *mpp;
531 			*mpp = mp->b_next;
532 
533 			inet_freemsg(mp);
534 		}
535 	} while (mpp++ != &nce->nce_last_mp_to_free);
536 
537 #ifdef NCE_DEBUG
538 	nce_trace_inactive(nce);
539 #endif
540 
541 	ill = nce->nce_ill;
542 	mutex_enter(&ill->ill_lock);
543 	ill->ill_nce_cnt--;
544 	/*
545 	 * If the number of nce's associated with this ill have dropped
546 	 * to zero, check whether we need to restart any operation that
547 	 * is waiting for this to happen.
548 	 */
549 	if (ill->ill_nce_cnt == 0) {
550 		/* ipif_ill_refrele_tail drops the ill_lock */
551 		ipif_ill_refrele_tail(ill);
552 	} else {
553 		mutex_exit(&ill->ill_lock);
554 	}
555 	mutex_destroy(&nce->nce_lock);
556 	if (nce->nce_mp != NULL)
557 		inet_freemsg(nce->nce_mp);
558 }
559 
560 /*
561  * ndp_walk routine.  Delete the nce if it is associated with the ill
562  * that is going away.  Always called as a writer.
563  */
564 void
565 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
566 {
567 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
568 		ndp_delete(nce);
569 	}
570 }
571 
572 /*
573  * Walk a list of to be inactive NCEs and blow away all the ires.
574  */
575 static void
576 nce_ire_delete_list(nce_t *nce)
577 {
578 	nce_t *nce_next;
579 
580 	ASSERT(nce != NULL);
581 	while (nce != NULL) {
582 		nce_next = nce->nce_next;
583 		nce->nce_next = NULL;
584 
585 		/*
586 		 * It is possible for the last ndp walker (this thread)
587 		 * to come here after ndp_delete has marked the nce CONDEMNED
588 		 * and before it has removed the nce from the fastpath list
589 		 * or called untimeout. So we need to do it here. It is safe
590 		 * for both ndp_delete and this thread to do it twice or
591 		 * even simultaneously since each of the threads has a
592 		 * reference on the nce.
593 		 */
594 		nce_fastpath_list_delete(nce);
595 		/*
596 		 * Cancel any running timer. Timeout can't be restarted
597 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
598 		 * Passing invalid timeout id is fine.
599 		 */
600 		if (nce->nce_timeout_id != 0) {
601 			(void) untimeout(nce->nce_timeout_id);
602 			nce->nce_timeout_id = 0;
603 		}
604 		/*
605 		 * We might hit this func thus in the v4 case:
606 		 * ipif_down->ipif_ndp_down->ndp_walk
607 		 */
608 
609 		if (nce->nce_ipversion == IPV4_VERSION) {
610 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
611 			    IRE_CACHE, nce_ire_delete1,
612 			    (char *)nce, nce->nce_ill);
613 		} else {
614 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
615 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
616 			    IRE_CACHE, nce_ire_delete1,
617 			    (char *)nce, nce->nce_ill);
618 		}
619 		NCE_REFRELE_NOTR(nce);
620 		nce = nce_next;
621 	}
622 }
623 
624 /*
625  * Delete an ire when the nce goes away.
626  */
627 /* ARGSUSED */
628 static void
629 nce_ire_delete(nce_t *nce)
630 {
631 	if (nce->nce_ipversion == IPV6_VERSION) {
632 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
633 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
634 		NCE_REFRELE_NOTR(nce);
635 	} else {
636 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
637 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
638 		NCE_REFRELE_NOTR(nce);
639 	}
640 }
641 
642 /*
643  * ire_walk routine used to delete every IRE that shares this nce
644  */
645 static void
646 nce_ire_delete1(ire_t *ire, char *nce_arg)
647 {
648 	nce_t	*nce = (nce_t *)nce_arg;
649 
650 	ASSERT(ire->ire_type == IRE_CACHE);
651 
652 	if (ire->ire_nce == nce) {
653 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
654 		ire_delete(ire);
655 	}
656 }
657 
658 /*
659  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
660  */
661 boolean_t
662 ndp_restart_dad(nce_t *nce)
663 {
664 	boolean_t started;
665 	boolean_t dropped;
666 
667 	if (nce == NULL)
668 		return (B_FALSE);
669 	mutex_enter(&nce->nce_lock);
670 	if (nce->nce_state == ND_PROBE) {
671 		mutex_exit(&nce->nce_lock);
672 		started = B_TRUE;
673 	} else if (nce->nce_state == ND_REACHABLE) {
674 		nce->nce_state = ND_PROBE;
675 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
676 		mutex_exit(&nce->nce_lock);
677 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
678 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
679 		if (dropped) {
680 			mutex_enter(&nce->nce_lock);
681 			nce->nce_pcnt++;
682 			mutex_exit(&nce->nce_lock);
683 		}
684 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
685 		started = B_TRUE;
686 	} else {
687 		mutex_exit(&nce->nce_lock);
688 		started = B_FALSE;
689 	}
690 	return (started);
691 }
692 
693 /*
694  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
695  * If one is found, the refcnt on the nce will be incremented.
696  */
697 nce_t *
698 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
699 {
700 	nce_t	*nce;
701 
702 	ASSERT(ill != NULL && ill->ill_isv6);
703 	if (!caller_holds_lock) {
704 		mutex_enter(&ndp6.ndp_g_lock);
705 	}
706 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
707 	nce = nce_lookup_addr(ill, addr, nce);
708 	if (nce == NULL)
709 		nce = nce_lookup_mapping(ill, addr);
710 	if (!caller_holds_lock)
711 		mutex_exit(&ndp6.ndp_g_lock);
712 	return (nce);
713 }
714 /*
715  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
716  * If one is found, the refcnt on the nce will be incremented.
717  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
718  * so we skip the nce_lookup_mapping call.
719  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
720  */
721 nce_t *
722 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
723 {
724 	nce_t	*nce;
725 	in6_addr_t addr6;
726 
727 	if (!caller_holds_lock) {
728 		mutex_enter(&ndp4.ndp_g_lock);
729 	}
730 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr)); /* head of v6 hash table */
731 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
732 	nce = nce_lookup_addr(ill, &addr6, nce);
733 	if (!caller_holds_lock)
734 		mutex_exit(&ndp4.ndp_g_lock);
735 	return (nce);
736 }
737 
738 /*
739  * Cache entry lookup.  Try to find an nce matching the parameters passed.
740  * Look only for exact entries (no mappings).  If an nce is found, increment
741  * the hold count on that nce. The caller passes in the start of the
742  * appropriate hash table, and must be holding the appropriate global
743  * lock (ndp_g_lock).
744  */
745 static nce_t *
746 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
747 {
748 	ndp_g_t *ndp = (ill->ill_isv6 ? &ndp6 : &ndp4);
749 
750 	ASSERT(ill != NULL);
751 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
752 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
753 		return (NULL);
754 	for (; nce != NULL; nce = nce->nce_next) {
755 		if (nce->nce_ill == ill) {
756 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
757 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
758 			    &ipv6_all_ones)) {
759 				mutex_enter(&nce->nce_lock);
760 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
761 					NCE_REFHOLD_LOCKED(nce);
762 					mutex_exit(&nce->nce_lock);
763 					break;
764 				}
765 				mutex_exit(&nce->nce_lock);
766 			}
767 		}
768 	}
769 	return (nce);
770 }
771 
772 /*
773  * Cache entry lookup.  Try to find an nce matching the parameters passed.
774  * Look only for mappings.
775  */
776 static nce_t *
777 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
778 {
779 	nce_t	*nce;
780 
781 	ASSERT(ill != NULL && ill->ill_isv6);
782 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
783 	if (!IN6_IS_ADDR_MULTICAST(addr))
784 		return (NULL);
785 	nce = ndp6.nce_mask_entries;
786 	for (; nce != NULL; nce = nce->nce_next)
787 		if (nce->nce_ill == ill &&
788 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
789 			mutex_enter(&nce->nce_lock);
790 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
791 				NCE_REFHOLD_LOCKED(nce);
792 				mutex_exit(&nce->nce_lock);
793 				break;
794 			}
795 			mutex_exit(&nce->nce_lock);
796 		}
797 	return (nce);
798 }
799 
800 /*
801  * Process passed in parameters either from an incoming packet or via
802  * user ioctl.
803  */
804 void
805 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
806 {
807 	ill_t	*ill = nce->nce_ill;
808 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
809 	mblk_t	*mp;
810 	boolean_t ll_updated = B_FALSE;
811 	boolean_t ll_changed;
812 
813 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
814 	/*
815 	 * No updates of link layer address or the neighbor state is
816 	 * allowed, when the cache is in NONUD state.  This still
817 	 * allows for responding to reachability solicitation.
818 	 */
819 	mutex_enter(&nce->nce_lock);
820 	if (nce->nce_state == ND_INCOMPLETE) {
821 		if (hw_addr == NULL) {
822 			mutex_exit(&nce->nce_lock);
823 			return;
824 		}
825 		nce_set_ll(nce, hw_addr);
826 		/*
827 		 * Update nce state and send the queued packets
828 		 * back to ip this time ire will be added.
829 		 */
830 		if (flag & ND_NA_FLAG_SOLICITED) {
831 			nce_update(nce, ND_REACHABLE, NULL);
832 		} else {
833 			nce_update(nce, ND_STALE, NULL);
834 		}
835 		mutex_exit(&nce->nce_lock);
836 		nce_fastpath(nce);
837 		mutex_enter(&nce->nce_lock);
838 		mp = nce->nce_qd_mp;
839 		nce->nce_qd_mp = NULL;
840 		mutex_exit(&nce->nce_lock);
841 		while (mp != NULL) {
842 			mblk_t *nxt_mp, *data_mp;
843 
844 			nxt_mp = mp->b_next;
845 			mp->b_next = NULL;
846 
847 			if (mp->b_datap->db_type == M_CTL)
848 				data_mp = mp->b_cont;
849 			else
850 				data_mp = mp;
851 			if (data_mp->b_prev != NULL) {
852 				ill_t   *inbound_ill;
853 				queue_t *fwdq = NULL;
854 				uint_t ifindex;
855 
856 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
857 				inbound_ill = ill_lookup_on_ifindex(ifindex,
858 				    B_TRUE, NULL, NULL, NULL, NULL);
859 				if (inbound_ill == NULL) {
860 					data_mp->b_prev = NULL;
861 					freemsg(mp);
862 					return;
863 				} else {
864 					fwdq = inbound_ill->ill_rq;
865 				}
866 				data_mp->b_prev = NULL;
867 				/*
868 				 * Send a forwarded packet back into ip_rput_v6
869 				 * just as in ire_send_v6().
870 				 * Extract the queue from b_prev (set in
871 				 * ip_rput_data_v6).
872 				 */
873 				if (fwdq != NULL) {
874 					/*
875 					 * Forwarded packets hop count will
876 					 * get decremented in ip_rput_data_v6
877 					 */
878 					if (data_mp != mp)
879 						freeb(mp);
880 					put(fwdq, data_mp);
881 				} else {
882 					/*
883 					 * Send locally originated packets back
884 					 * into * ip_wput_v6.
885 					 */
886 					put(ill->ill_wq, mp);
887 				}
888 				ill_refrele(inbound_ill);
889 			} else {
890 				put(ill->ill_wq, mp);
891 			}
892 			mp = nxt_mp;
893 		}
894 		return;
895 	}
896 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
897 	if (!is_adv) {
898 		/* If this is a SOLICITATION request only */
899 		if (ll_changed)
900 			nce_update(nce, ND_STALE, hw_addr);
901 		mutex_exit(&nce->nce_lock);
902 		return;
903 	}
904 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
905 		/* If in any other state than REACHABLE, ignore */
906 		if (nce->nce_state == ND_REACHABLE) {
907 			nce_update(nce, ND_STALE, NULL);
908 		}
909 		mutex_exit(&nce->nce_lock);
910 		return;
911 	} else {
912 		if (ll_changed) {
913 			nce_update(nce, ND_UNCHANGED, hw_addr);
914 			ll_updated = B_TRUE;
915 		}
916 		if (flag & ND_NA_FLAG_SOLICITED) {
917 			nce_update(nce, ND_REACHABLE, NULL);
918 		} else {
919 			if (ll_updated) {
920 				nce_update(nce, ND_STALE, NULL);
921 			}
922 		}
923 		mutex_exit(&nce->nce_lock);
924 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
925 		    NCE_F_ISROUTER)) {
926 			ire_t *ire;
927 
928 			/*
929 			 * Router turned to host.  We need to remove the
930 			 * entry as well as any default route that may be
931 			 * using this as a next hop.  This is required by
932 			 * section 7.2.5 of RFC 2461.
933 			 */
934 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
935 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
936 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
937 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
938 			    MATCH_IRE_DEFAULT);
939 			if (ire != NULL) {
940 				ip_rts_rtmsg(RTM_DELETE, ire, 0);
941 				ire_delete(ire);
942 				ire_refrele(ire);
943 			}
944 			ndp_delete(nce);
945 		}
946 	}
947 }
948 
949 /*
950  * Pass arg1 to the pfi supplied, along with each nce in existence.
951  * ndp_walk() places a REFHOLD on the nce and drops the lock when
952  * walking the hash list.
953  */
954 void
955 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
956     boolean_t trace)
957 {
958 
959 	nce_t	*nce;
960 	nce_t	*nce1;
961 	nce_t	**ncep;
962 	nce_t	*free_nce_list = NULL;
963 
964 	mutex_enter(&ndp->ndp_g_lock);
965 	/* Prevent ndp_delete from unlink and free of NCE */
966 	ndp->ndp_g_walker++;
967 	mutex_exit(&ndp->ndp_g_lock);
968 	for (ncep = ndp->nce_hash_tbl;
969 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
970 		for (nce = *ncep; nce != NULL; nce = nce1) {
971 			nce1 = nce->nce_next;
972 			if (ill == NULL || nce->nce_ill == ill) {
973 				if (trace) {
974 					NCE_REFHOLD(nce);
975 					(*pfi)(nce, arg1);
976 					NCE_REFRELE(nce);
977 				} else {
978 					NCE_REFHOLD_NOTR(nce);
979 					(*pfi)(nce, arg1);
980 					NCE_REFRELE_NOTR(nce);
981 				}
982 			}
983 		}
984 	}
985 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
986 		nce1 = nce->nce_next;
987 		if (ill == NULL || nce->nce_ill == ill) {
988 			if (trace) {
989 				NCE_REFHOLD(nce);
990 				(*pfi)(nce, arg1);
991 				NCE_REFRELE(nce);
992 			} else {
993 				NCE_REFHOLD_NOTR(nce);
994 				(*pfi)(nce, arg1);
995 				NCE_REFRELE_NOTR(nce);
996 			}
997 		}
998 	}
999 	mutex_enter(&ndp->ndp_g_lock);
1000 	ndp->ndp_g_walker--;
1001 	/*
1002 	 * While NCE's are removed from global list they are placed
1003 	 * in a private list, to be passed to nce_ire_delete_list().
1004 	 * The reason is, there may be ires pointing to this nce
1005 	 * which needs to cleaned up.
1006 	 */
1007 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
1008 		/* Time to delete condemned entries */
1009 		for (ncep = ndp->nce_hash_tbl;
1010 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1011 			nce = *ncep;
1012 			if (nce != NULL) {
1013 				nce_remove(ndp, nce, &free_nce_list);
1014 			}
1015 		}
1016 		nce = ndp->nce_mask_entries;
1017 		if (nce != NULL) {
1018 			nce_remove(ndp, nce, &free_nce_list);
1019 		}
1020 		ndp->ndp_g_walker_cleanup = B_FALSE;
1021 	}
1022 	mutex_exit(&ndp->ndp_g_lock);
1023 
1024 	if (free_nce_list != NULL) {
1025 		nce_ire_delete_list(free_nce_list);
1026 	}
1027 }
1028 
1029 void
1030 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1)
1031 {
1032 	ndp_walk_common(&ndp4, ill, pfi, arg1, B_TRUE);
1033 	ndp_walk_common(&ndp6, ill, pfi, arg1, B_TRUE);
1034 }
1035 
1036 /*
1037  * Process resolve requests.  Handles both mapped entries
1038  * as well as cases that needs to be send out on the wire.
1039  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1040  * or one is created, we defer making ire point to nce until the
1041  * ire is actually added at which point the nce_refcnt on the nce is
1042  * incremented.  This is done primarily to have symmetry between ire_add()
1043  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1044  */
1045 int
1046 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1047 {
1048 	nce_t		*nce;
1049 	int		err = 0;
1050 	uint32_t	ms;
1051 	mblk_t		*mp_nce = NULL;
1052 
1053 	ASSERT(ill != NULL);
1054 	ASSERT(ill->ill_isv6);
1055 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1056 		err = nce_set_multicast(ill, dst);
1057 		return (err);
1058 	}
1059 	err = ndp_lookup_then_add(ill,
1060 	    NULL,	/* No hardware address */
1061 	    dst,
1062 	    &ipv6_all_ones,
1063 	    &ipv6_all_zeros,
1064 	    0,
1065 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1066 	    ND_INCOMPLETE,
1067 	    &nce,
1068 	    NULL, /* let ndp_add figure out fastpath mp and dlureq_mp for v6 */
1069 	    NULL);
1070 
1071 	switch (err) {
1072 	case 0:
1073 		/*
1074 		 * New cache entry was created. Make sure that the state
1075 		 * is not ND_INCOMPLETE. It can be in some other state
1076 		 * even before we send out the solicitation as we could
1077 		 * get un-solicited advertisements.
1078 		 *
1079 		 * If this is an XRESOLV interface, simply return 0,
1080 		 * since we don't want to solicit just yet.
1081 		 */
1082 		if (ill->ill_flags & ILLF_XRESOLV) {
1083 			NCE_REFRELE(nce);
1084 			return (0);
1085 		}
1086 		rw_enter(&ill_g_lock, RW_READER);
1087 		mutex_enter(&nce->nce_lock);
1088 		if (nce->nce_state != ND_INCOMPLETE) {
1089 			mutex_exit(&nce->nce_lock);
1090 			rw_exit(&ill_g_lock);
1091 			NCE_REFRELE(nce);
1092 			return (0);
1093 		}
1094 		mp_nce = ip_prepend_zoneid(mp, zoneid);
1095 		if (mp_nce == NULL) {
1096 			/* The caller will free mp */
1097 			mutex_exit(&nce->nce_lock);
1098 			rw_exit(&ill_g_lock);
1099 			ndp_delete(nce);
1100 			NCE_REFRELE(nce);
1101 			return (ENOMEM);
1102 		}
1103 		ms = nce_solicit(nce, mp_nce);
1104 		rw_exit(&ill_g_lock);
1105 		if (ms == 0) {
1106 			/* The caller will free mp */
1107 			if (mp_nce != mp)
1108 				freeb(mp_nce);
1109 			mutex_exit(&nce->nce_lock);
1110 			ndp_delete(nce);
1111 			NCE_REFRELE(nce);
1112 			return (EBUSY);
1113 		}
1114 		mutex_exit(&nce->nce_lock);
1115 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1116 		NCE_REFRELE(nce);
1117 		return (EINPROGRESS);
1118 	case EEXIST:
1119 		/* Resolution in progress just queue the packet */
1120 		mutex_enter(&nce->nce_lock);
1121 		if (nce->nce_state == ND_INCOMPLETE) {
1122 			mp_nce = ip_prepend_zoneid(mp, zoneid);
1123 			if (mp_nce == NULL) {
1124 				err = ENOMEM;
1125 			} else {
1126 				nce_queue_mp(nce, mp_nce);
1127 				err = EINPROGRESS;
1128 			}
1129 		} else {
1130 			/*
1131 			 * Any other state implies we have
1132 			 * a nce but IRE needs to be added ...
1133 			 * ire_add_v6() will take care of the
1134 			 * the case when the nce becomes CONDEMNED
1135 			 * before the ire is added to the table.
1136 			 */
1137 			err = 0;
1138 		}
1139 		mutex_exit(&nce->nce_lock);
1140 		NCE_REFRELE(nce);
1141 		break;
1142 	default:
1143 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1144 		break;
1145 	}
1146 	return (err);
1147 }
1148 
1149 /*
1150  * When there is no resolver, the link layer template is passed in
1151  * the IRE.
1152  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1153  * or one is created, we defer making ire point to nce until the
1154  * ire is actually added at which point the nce_refcnt on the nce is
1155  * incremented.  This is done primarily to have symmetry between ire_add()
1156  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1157  */
1158 int
1159 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1160 {
1161 	nce_t		*nce;
1162 	int		err = 0;
1163 
1164 	ASSERT(ill != NULL);
1165 	ASSERT(ill->ill_isv6);
1166 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1167 		err = nce_set_multicast(ill, dst);
1168 		return (err);
1169 	}
1170 
1171 	err = ndp_lookup_then_add(ill,
1172 	    NULL,	/* hardware address */
1173 	    dst,
1174 	    &ipv6_all_ones,
1175 	    &ipv6_all_zeros,
1176 	    0,
1177 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1178 	    ND_REACHABLE,
1179 	    &nce,
1180 	    NULL, /* let ndp_add figure out fp_mp/dlureq_mp for v6 */
1181 	    NULL);
1182 
1183 	switch (err) {
1184 	case 0:
1185 		/*
1186 		 * Cache entry with a proper resolver cookie was
1187 		 * created.
1188 		 */
1189 		NCE_REFRELE(nce);
1190 		break;
1191 	case EEXIST:
1192 		err = 0;
1193 		NCE_REFRELE(nce);
1194 		break;
1195 	default:
1196 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1197 		break;
1198 	}
1199 	return (err);
1200 }
1201 
1202 /*
1203  * For each interface an entry is added for the unspecified multicast group.
1204  * Here that mapping is used to form the multicast cache entry for a particular
1205  * multicast destination.
1206  */
1207 static int
1208 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1209 {
1210 	nce_t		*mnce;	/* Multicast mapping entry */
1211 	nce_t		*nce;
1212 	uchar_t		*hw_addr = NULL;
1213 	int		err = 0;
1214 
1215 	ASSERT(ill != NULL);
1216 	ASSERT(ill->ill_isv6);
1217 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1218 
1219 	mutex_enter(&ndp6.ndp_g_lock);
1220 	nce = *((nce_t **)NCE_HASH_PTR_V6(*dst));
1221 	nce = nce_lookup_addr(ill, dst, nce);
1222 	if (nce != NULL) {
1223 		mutex_exit(&ndp6.ndp_g_lock);
1224 		NCE_REFRELE(nce);
1225 		return (0);
1226 	}
1227 	/* No entry, now lookup for a mapping this should never fail */
1228 	mnce = nce_lookup_mapping(ill, dst);
1229 	if (mnce == NULL) {
1230 		/* Something broken for the interface. */
1231 		mutex_exit(&ndp6.ndp_g_lock);
1232 		return (ESRCH);
1233 	}
1234 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1235 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1236 		/*
1237 		 * For IRE_IF_RESOLVER a hardware mapping can be
1238 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1239 		 * in the ill is copied in ndp_add().
1240 		 */
1241 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1242 		if (hw_addr == NULL) {
1243 			mutex_exit(&ndp6.ndp_g_lock);
1244 			NCE_REFRELE(mnce);
1245 			return (ENOMEM);
1246 		}
1247 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1248 	}
1249 	NCE_REFRELE(mnce);
1250 	/*
1251 	 * IRE_IF_NORESOLVER type simply copies the resolution
1252 	 * cookie passed in.  So no hw_addr is needed.
1253 	 */
1254 	err = ndp_add(ill,
1255 	    hw_addr,
1256 	    dst,
1257 	    &ipv6_all_ones,
1258 	    &ipv6_all_zeros,
1259 	    0,
1260 	    NCE_F_NONUD,
1261 	    ND_REACHABLE,
1262 	    &nce,
1263 	    NULL,
1264 	    NULL);
1265 	mutex_exit(&ndp6.ndp_g_lock);
1266 	if (hw_addr != NULL)
1267 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1268 	if (err != 0) {
1269 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1270 		return (err);
1271 	}
1272 	NCE_REFRELE(nce);
1273 	return (0);
1274 }
1275 
1276 /*
1277  * Return the link layer address, and any flags of a nce.
1278  */
1279 int
1280 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1281 {
1282 	nce_t		*nce;
1283 	in6_addr_t	*addr;
1284 	sin6_t		*sin6;
1285 	dl_unitdata_req_t	*dl;
1286 
1287 	ASSERT(ill != NULL && ill->ill_isv6);
1288 	sin6 = (sin6_t *)&lnr->lnr_addr;
1289 	addr =  &sin6->sin6_addr;
1290 
1291 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1292 	if (nce == NULL)
1293 		return (ESRCH);
1294 	/* If in INCOMPLETE state, no link layer address is available yet */
1295 	if (nce->nce_state == ND_INCOMPLETE)
1296 		goto done;
1297 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1298 	if (ill->ill_flags & ILLF_XRESOLV)
1299 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1300 	else
1301 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1302 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1303 	    sizeof (lnr->lnr_hdw_addr));
1304 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1305 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1306 	if (nce->nce_flags & NCE_F_ISROUTER)
1307 		lnr->lnr_flags = NDF_ISROUTER_ON;
1308 	if (nce->nce_flags & NCE_F_PROXY)
1309 		lnr->lnr_flags |= NDF_PROXY_ON;
1310 	if (nce->nce_flags & NCE_F_ANYCAST)
1311 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1312 done:
1313 	NCE_REFRELE(nce);
1314 	return (0);
1315 }
1316 
1317 /*
1318  * Send Enable/Disable multicast reqs to driver.
1319  */
1320 int
1321 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1322     uint32_t hw_addr_offset, mblk_t *mp)
1323 {
1324 	nce_t		*nce;
1325 	uchar_t		*hw_addr;
1326 
1327 	ASSERT(ill != NULL && ill->ill_isv6);
1328 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1329 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1330 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1331 		freemsg(mp);
1332 		return (EINVAL);
1333 	}
1334 	mutex_enter(&ndp6.ndp_g_lock);
1335 	nce = nce_lookup_mapping(ill, addr);
1336 	if (nce == NULL) {
1337 		mutex_exit(&ndp6.ndp_g_lock);
1338 		freemsg(mp);
1339 		return (ESRCH);
1340 	}
1341 	mutex_exit(&ndp6.ndp_g_lock);
1342 	/*
1343 	 * Update dl_addr_length and dl_addr_offset for primitives that
1344 	 * have physical addresses as opposed to full saps
1345 	 */
1346 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1347 	case DL_ENABMULTI_REQ:
1348 		/* Track the state if this is the first enabmulti */
1349 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1350 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1351 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1352 		break;
1353 	case DL_DISABMULTI_REQ:
1354 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1355 		break;
1356 	default:
1357 		NCE_REFRELE(nce);
1358 		ip1dbg(("ndp_mcastreq: default\n"));
1359 		return (EINVAL);
1360 	}
1361 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1362 	NCE_REFRELE(nce);
1363 	putnext(ill->ill_wq, mp);
1364 	return (0);
1365 }
1366 
1367 /*
1368  * Send a neighbor solicitation.
1369  * Returns number of milliseconds after which we should either rexmit or abort.
1370  * Return of zero means we should abort.
1371  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1372  *
1373  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1374  * the packet.
1375  * NOTE: This routine does not consume mp.
1376  */
1377 uint32_t
1378 nce_solicit(nce_t *nce, mblk_t *mp)
1379 {
1380 	ill_t		*ill;
1381 	ill_t		*src_ill;
1382 	ip6_t		*ip6h;
1383 	in6_addr_t	src;
1384 	in6_addr_t	dst;
1385 	ipif_t		*ipif;
1386 	ip6i_t		*ip6i;
1387 	boolean_t	dropped = B_FALSE;
1388 
1389 	ASSERT(RW_READ_HELD(&ill_g_lock));
1390 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1391 	ill = nce->nce_ill;
1392 	ASSERT(ill != NULL);
1393 
1394 	if (nce->nce_rcnt == 0) {
1395 		return (0);
1396 	}
1397 
1398 	if (mp == NULL) {
1399 		ASSERT(nce->nce_qd_mp != NULL);
1400 		mp = nce->nce_qd_mp;
1401 	} else {
1402 		nce_queue_mp(nce, mp);
1403 	}
1404 
1405 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1406 	if (mp->b_datap->db_type == M_CTL)
1407 		mp = mp->b_cont;
1408 
1409 	ip6h = (ip6_t *)mp->b_rptr;
1410 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1411 		/*
1412 		 * This message should have been pulled up already in
1413 		 * ip_wput_v6. We can't do pullups here because the message
1414 		 * could be from the nce_qd_mp which could have b_next/b_prev
1415 		 * non-NULL.
1416 		 */
1417 		ip6i = (ip6i_t *)ip6h;
1418 		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
1419 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
1420 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1421 	}
1422 	src = ip6h->ip6_src;
1423 	/*
1424 	 * If the src of outgoing packet is one of the assigned interface
1425 	 * addresses use it, otherwise we will pick the source address below.
1426 	 */
1427 	src_ill = ill;
1428 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1429 		if (ill->ill_group != NULL)
1430 			src_ill = ill->ill_group->illgrp_ill;
1431 		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
1432 			for (ipif = src_ill->ill_ipif; ipif != NULL;
1433 			    ipif = ipif->ipif_next) {
1434 				if (IN6_ARE_ADDR_EQUAL(&src,
1435 				    &ipif->ipif_v6lcl_addr)) {
1436 					break;
1437 				}
1438 			}
1439 			if (ipif != NULL)
1440 				break;
1441 		}
1442 		/*
1443 		 * If no relevant ipif can be found, then it's not one of our
1444 		 * addresses.  Reset to :: and let nce_xmit.  If an ipif can be
1445 		 * found, but it's not yet done with DAD verification, then
1446 		 * just postpone this transmission until later.
1447 		 */
1448 		if (src_ill == NULL)
1449 			src = ipv6_all_zeros;
1450 		else if (!ipif->ipif_addr_ready)
1451 			return (ill->ill_reachable_retrans_time);
1452 	}
1453 	dst = nce->nce_addr;
1454 	/*
1455 	 * If source address is unspecified, nce_xmit will choose
1456 	 * one for us and initialize the hardware address also
1457 	 * appropriately.
1458 	 */
1459 	if (IN6_IS_ADDR_UNSPECIFIED(&src))
1460 		src_ill = NULL;
1461 	nce->nce_rcnt--;
1462 	mutex_exit(&nce->nce_lock);
1463 	rw_exit(&ill_g_lock);
1464 	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
1465 	    &dst, 0);
1466 	rw_enter(&ill_g_lock, RW_READER);
1467 	mutex_enter(&nce->nce_lock);
1468 	if (dropped)
1469 		nce->nce_rcnt++;
1470 	return (ill->ill_reachable_retrans_time);
1471 }
1472 
1473 /*
1474  * Attempt to recover an address on an interface that's been marked as a
1475  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1476  * no easy way to just probe the address and have the right thing happen if
1477  * it's no longer in use.  Instead, we just bring it up normally and allow the
1478  * regular interface start-up logic to probe for a remaining duplicate and take
1479  * us back down if necessary.
1480  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1481  * ip_ndp_excl.
1482  */
1483 /* ARGSUSED */
1484 static void
1485 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1486 {
1487 	ill_t	*ill = rq->q_ptr;
1488 	ipif_t	*ipif;
1489 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1490 
1491 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1492 		/*
1493 		 * We do not support recovery of proxy ARP'd interfaces,
1494 		 * because the system lacks a complete proxy ARP mechanism.
1495 		 */
1496 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1497 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1498 			continue;
1499 		}
1500 
1501 		/*
1502 		 * If we have already recovered or if the interface is going
1503 		 * away, then ignore.
1504 		 */
1505 		mutex_enter(&ill->ill_lock);
1506 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1507 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1508 			mutex_exit(&ill->ill_lock);
1509 			continue;
1510 		}
1511 
1512 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1513 		ill->ill_ipif_dup_count--;
1514 		mutex_exit(&ill->ill_lock);
1515 		ipif->ipif_was_dup = B_TRUE;
1516 
1517 		if (ipif_ndp_up(ipif, addr, B_FALSE) != EINPROGRESS)
1518 			(void) ipif_up_done_v6(ipif);
1519 	}
1520 	freeb(mp);
1521 }
1522 
1523 /*
1524  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1525  * As long as someone else holds the address, the interface will stay down.
1526  * When that conflict goes away, the interface is brought back up.  This is
1527  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1528  * server will recover from a failure.
1529  *
1530  * For DHCP and temporary addresses, recovery is not done in the kernel.
1531  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1532  *
1533  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1534  */
1535 static void
1536 ipif6_dup_recovery(void *arg)
1537 {
1538 	ipif_t *ipif = arg;
1539 
1540 	ipif->ipif_recovery_id = 0;
1541 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1542 		return;
1543 
1544 	/*
1545 	 * No lock, because this is just an optimization.
1546 	 */
1547 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1548 		return;
1549 
1550 	/* If the link is down, we'll retry this later */
1551 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1552 		return;
1553 
1554 	ndp_do_recovery(ipif);
1555 }
1556 
1557 /*
1558  * Perform interface recovery by forcing the duplicate interfaces up and
1559  * allowing the system to determine which ones should stay up.
1560  *
1561  * Called both by recovery timer expiry and link-up notification.
1562  */
1563 void
1564 ndp_do_recovery(ipif_t *ipif)
1565 {
1566 	ill_t *ill = ipif->ipif_ill;
1567 	mblk_t *mp;
1568 
1569 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1570 	if (mp == NULL) {
1571 		mutex_enter(&ill->ill_lock);
1572 		if (ipif->ipif_recovery_id == 0 &&
1573 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1574 		    IPIF_CONDEMNED))) {
1575 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1576 			    ipif, MSEC_TO_TICK(ip_dup_recovery));
1577 		}
1578 		mutex_exit(&ill->ill_lock);
1579 	} else {
1580 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1581 		    sizeof (ipif->ipif_v6lcl_addr));
1582 		ill_refhold(ill);
1583 		(void) qwriter_ip(NULL, ill, ill->ill_rq, mp, ip_ndp_recover,
1584 		    CUR_OP, B_FALSE);
1585 	}
1586 }
1587 
1588 /*
1589  * Find the solicitation in the given message, and extract printable details
1590  * (MAC and IP addresses) from it.
1591  */
1592 static nd_neighbor_solicit_t *
1593 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1594     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1595 {
1596 	nd_neighbor_solicit_t *ns;
1597 	ip6_t *ip6h;
1598 	uchar_t *addr;
1599 	int alen;
1600 
1601 	alen = 0;
1602 	ip6h = (ip6_t *)mp->b_rptr;
1603 	if (dl_mp == NULL) {
1604 		nd_opt_hdr_t *opt;
1605 		int nslen;
1606 
1607 		/*
1608 		 * If it's from the fast-path, then it can't be a probe
1609 		 * message, and thus must include the source linkaddr option.
1610 		 * Extract that here.
1611 		 */
1612 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1613 		nslen = mp->b_wptr - (uchar_t *)ns;
1614 		if ((nslen -= sizeof (*ns)) > 0) {
1615 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1616 			    ND_OPT_SOURCE_LINKADDR);
1617 			if (opt != NULL &&
1618 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1619 			    ill->ill_nd_lla_len) {
1620 				addr = (uchar_t *)(opt + 1);
1621 				alen = ill->ill_nd_lla_len;
1622 			}
1623 		}
1624 		/*
1625 		 * We cheat a bit here for the sake of printing usable log
1626 		 * messages in the rare case where the reply we got was unicast
1627 		 * without a source linkaddr option, and the interface is in
1628 		 * fastpath mode.  (Sigh.)
1629 		 */
1630 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1631 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1632 			struct ether_header *pether;
1633 
1634 			pether = (struct ether_header *)((char *)ip6h -
1635 			    sizeof (*pether));
1636 			addr = pether->ether_shost.ether_addr_octet;
1637 			alen = ETHERADDRL;
1638 		}
1639 	} else {
1640 		dl_unitdata_ind_t *dlu;
1641 
1642 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1643 		alen = dlu->dl_src_addr_length;
1644 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1645 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1646 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1647 			if (ill->ill_sap_length < 0) {
1648 				alen += ill->ill_sap_length;
1649 			} else {
1650 				addr += ill->ill_sap_length;
1651 				alen -= ill->ill_sap_length;
1652 			}
1653 		}
1654 	}
1655 	if (alen > 0) {
1656 		*haddr = addr;
1657 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1658 	} else {
1659 		*haddr = NULL;
1660 		(void) strcpy(hbuf, "?");
1661 	}
1662 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1663 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1664 	return (ns);
1665 }
1666 
1667 /*
1668  * This is for exclusive changes due to NDP duplicate address detection
1669  * failure.
1670  */
1671 /* ARGSUSED */
1672 static void
1673 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1674 {
1675 	ill_t	*ill = rq->q_ptr;
1676 	ipif_t	*ipif;
1677 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1678 	char hbuf[MAC_STR_LEN];
1679 	char sbuf[INET6_ADDRSTRLEN];
1680 	nd_neighbor_solicit_t *ns;
1681 	mblk_t *dl_mp = NULL;
1682 	uchar_t *haddr;
1683 
1684 	if (DB_TYPE(mp) != M_DATA) {
1685 		dl_mp = mp;
1686 		mp = mp->b_cont;
1687 	}
1688 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1689 	    sizeof (sbuf), &haddr);
1690 	if (haddr != NULL &&
1691 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1692 		/*
1693 		 * Ignore conflicts generated by misbehaving switches that just
1694 		 * reflect our own messages back to us.
1695 		 */
1696 		goto ignore_conflict;
1697 	}
1698 	(void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf));
1699 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1700 
1701 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1702 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1703 		    &ns->nd_ns_target)) {
1704 			continue;
1705 		}
1706 
1707 		/* If it's already marked, then don't do anything. */
1708 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1709 			continue;
1710 
1711 		/*
1712 		 * If this is a failure during duplicate recovery, then don't
1713 		 * complain.  It may take a long time to recover.
1714 		 */
1715 		if (!ipif->ipif_was_dup) {
1716 			if (ipif->ipif_id != 0) {
1717 				(void) snprintf(ibuf + ill->ill_name_length - 1,
1718 				    sizeof (ibuf) - ill->ill_name_length + 1,
1719 				    ":%d", ipif->ipif_id);
1720 			}
1721 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1722 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1723 		}
1724 		mutex_enter(&ill->ill_lock);
1725 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1726 		ipif->ipif_flags |= IPIF_DUPLICATE;
1727 		ill->ill_ipif_dup_count++;
1728 		mutex_exit(&ill->ill_lock);
1729 		(void) ipif_down(ipif, NULL, NULL);
1730 		ipif_down_tail(ipif);
1731 		mutex_enter(&ill->ill_lock);
1732 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1733 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1734 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1735 		    IPIF_CONDEMNED)) &&
1736 		    ip_dup_recovery > 0) {
1737 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1738 			    ipif, MSEC_TO_TICK(ip_dup_recovery));
1739 		}
1740 		mutex_exit(&ill->ill_lock);
1741 	}
1742 ignore_conflict:
1743 	if (dl_mp != NULL)
1744 		freeb(dl_mp);
1745 	freemsg(mp);
1746 }
1747 
1748 /*
1749  * Handle failure by tearing down the ipifs with the specified address.  Note
1750  * that tearing down the ipif also means deleting the nce through ipif_down, so
1751  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1752  * we start a timer on the ipif.
1753  */
1754 static void
1755 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1756 {
1757 	if ((mp = copymsg(mp)) != NULL) {
1758 		if (dl_mp == NULL)
1759 			dl_mp = mp;
1760 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1761 			dl_mp->b_cont = mp;
1762 		if (dl_mp == NULL) {
1763 			freemsg(mp);
1764 		} else {
1765 			ill_refhold(ill);
1766 			(void) qwriter_ip(NULL, ill, ill->ill_rq, dl_mp,
1767 			    ip_ndp_excl, CUR_OP, B_FALSE);
1768 		}
1769 	}
1770 	ndp_delete(nce);
1771 }
1772 
1773 /*
1774  * Handle a discovered conflict: some other system is advertising that it owns
1775  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1776  * interface.
1777  */
1778 static void
1779 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1780 {
1781 	ipif_t *ipif;
1782 	uint32_t now;
1783 	uint_t maxdefense;
1784 	uint_t defs;
1785 
1786 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1787 	    NULL, NULL);
1788 	if (ipif == NULL)
1789 		return;
1790 	/*
1791 	 * First, figure out if this address is disposable.
1792 	 */
1793 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1794 		maxdefense = ip_max_temp_defend;
1795 	else
1796 		maxdefense = ip_max_defend;
1797 
1798 	/*
1799 	 * Now figure out how many times we've defended ourselves.  Ignore
1800 	 * defenses that happened long in the past.
1801 	 */
1802 	now = gethrestime_sec();
1803 	mutex_enter(&nce->nce_lock);
1804 	if ((defs = nce->nce_defense_count) > 0 &&
1805 	    now - nce->nce_defense_time > ip_defend_interval) {
1806 		nce->nce_defense_count = defs = 0;
1807 	}
1808 	nce->nce_defense_count++;
1809 	nce->nce_defense_time = now;
1810 	mutex_exit(&nce->nce_lock);
1811 	ipif_refrele(ipif);
1812 
1813 	/*
1814 	 * If we've defended ourselves too many times already, then give up and
1815 	 * tear down the interface(s) using this address.  Otherwise, defend by
1816 	 * sending out an unsolicited Neighbor Advertisement.
1817 	 */
1818 	if (defs >= maxdefense) {
1819 		ip_ndp_failure(ill, mp, dl_mp, nce);
1820 	} else {
1821 		char hbuf[MAC_STR_LEN];
1822 		char sbuf[INET6_ADDRSTRLEN];
1823 		uchar_t *haddr;
1824 
1825 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1826 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1827 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1828 		    hbuf, sbuf, ill->ill_name);
1829 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1830 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1831 		    nce_advert_flags(nce));
1832 	}
1833 }
1834 
1835 static void
1836 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1837 {
1838 	nd_neighbor_solicit_t *ns;
1839 	uint32_t	hlen = ill->ill_nd_lla_len;
1840 	uchar_t		*haddr = NULL;
1841 	icmp6_t		*icmp_nd;
1842 	ip6_t		*ip6h;
1843 	nce_t		*our_nce = NULL;
1844 	in6_addr_t	target;
1845 	in6_addr_t	src;
1846 	int		len;
1847 	int		flag = 0;
1848 	nd_opt_hdr_t	*opt = NULL;
1849 	boolean_t	bad_solicit = B_FALSE;
1850 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1851 
1852 	ip6h = (ip6_t *)mp->b_rptr;
1853 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1854 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1855 	src = ip6h->ip6_src;
1856 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1857 	target = ns->nd_ns_target;
1858 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1859 		if (ip_debug > 2) {
1860 			/* ip1dbg */
1861 			pr_addr_dbg("ndp_input_solicit: Target is"
1862 			    " multicast! %s\n", AF_INET6, &target);
1863 		}
1864 		bad_solicit = B_TRUE;
1865 		goto done;
1866 	}
1867 	if (len > sizeof (nd_neighbor_solicit_t)) {
1868 		/* Options present */
1869 		opt = (nd_opt_hdr_t *)&ns[1];
1870 		len -= sizeof (nd_neighbor_solicit_t);
1871 		if (!ndp_verify_optlen(opt, len)) {
1872 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1873 			bad_solicit = B_TRUE;
1874 			goto done;
1875 		}
1876 	}
1877 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1878 		/* Check to see if this is a valid DAD solicitation */
1879 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1880 			if (ip_debug > 2) {
1881 				/* ip1dbg */
1882 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1883 				    "Destination is not solicited node "
1884 				    "multicast %s\n", AF_INET6,
1885 				    &ip6h->ip6_dst);
1886 			}
1887 			bad_solicit = B_TRUE;
1888 			goto done;
1889 		}
1890 	}
1891 
1892 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1893 	/*
1894 	 * If this is a valid Solicitation, a permanent
1895 	 * entry should exist in the cache
1896 	 */
1897 	if (our_nce == NULL ||
1898 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1899 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1900 		    "ifname=%s ", ill->ill_name));
1901 		if (ip_debug > 2) {
1902 			/* ip1dbg */
1903 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1904 		}
1905 		bad_solicit = B_TRUE;
1906 		goto done;
1907 	}
1908 
1909 	/* At this point we should have a verified NS per spec */
1910 	if (opt != NULL) {
1911 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1912 		if (opt != NULL) {
1913 			haddr = (uchar_t *)&opt[1];
1914 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1915 			    hlen == 0) {
1916 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1917 				bad_solicit = B_TRUE;
1918 				goto done;
1919 			}
1920 		}
1921 	}
1922 
1923 	/* If sending directly to peer, set the unicast flag */
1924 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1925 		flag |= NDP_UNICAST;
1926 
1927 	/*
1928 	 * Create/update the entry for the soliciting node.
1929 	 * or respond to outstanding queries, don't if
1930 	 * the source is unspecified address.
1931 	 */
1932 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1933 		int	err;
1934 		nce_t	*nnce;
1935 
1936 		ASSERT(ill->ill_isv6);
1937 		/*
1938 		 * Regular solicitations *must* include the Source Link-Layer
1939 		 * Address option.  Ignore messages that do not.
1940 		 */
1941 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1942 			ip1dbg(("ndp_input_solicit: source link-layer address "
1943 			    "option missing with a specified source.\n"));
1944 			bad_solicit = B_TRUE;
1945 			goto done;
1946 		}
1947 
1948 		/*
1949 		 * This is a regular solicitation.  If we're still in the
1950 		 * process of verifying the address, then don't respond at all
1951 		 * and don't keep track of the sender.
1952 		 */
1953 		if (our_nce->nce_state == ND_PROBE)
1954 			goto done;
1955 
1956 		/*
1957 		 * If the solicitation doesn't have sender hardware address
1958 		 * (legal for unicast solicitation), then process without
1959 		 * installing the return NCE.  Either we already know it, or
1960 		 * we'll be forced to look it up when (and if) we reply to the
1961 		 * packet.
1962 		 */
1963 		if (haddr == NULL)
1964 			goto no_source;
1965 
1966 		err = ndp_lookup_then_add(ill,
1967 		    haddr,
1968 		    &src,	/* Soliciting nodes address */
1969 		    &ipv6_all_ones,
1970 		    &ipv6_all_zeros,
1971 		    0,
1972 		    0,
1973 		    ND_STALE,
1974 		    &nnce,
1975 		    NULL,
1976 		    NULL);
1977 		switch (err) {
1978 		case 0:
1979 			/* done with this entry */
1980 			NCE_REFRELE(nnce);
1981 			break;
1982 		case EEXIST:
1983 			/*
1984 			 * B_FALSE indicates this is not an
1985 			 * an advertisement.
1986 			 */
1987 			ndp_process(nnce, haddr, 0, B_FALSE);
1988 			NCE_REFRELE(nnce);
1989 			break;
1990 		default:
1991 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1992 			    err));
1993 			goto done;
1994 		}
1995 no_source:
1996 		flag |= NDP_SOLICITED;
1997 	} else {
1998 		/*
1999 		 * No source link layer address option should be present in a
2000 		 * valid DAD request.
2001 		 */
2002 		if (haddr != NULL) {
2003 			ip1dbg(("ndp_input_solicit: source link-layer address "
2004 			    "option present with an unspecified source.\n"));
2005 			bad_solicit = B_TRUE;
2006 			goto done;
2007 		}
2008 		if (our_nce->nce_state == ND_PROBE) {
2009 			/*
2010 			 * Internally looped-back probes won't have DLPI
2011 			 * attached to them.  External ones (which are sent by
2012 			 * multicast) always will.  Just ignore our own
2013 			 * transmissions.
2014 			 */
2015 			if (dl_mp != NULL) {
2016 				/*
2017 				 * If someone else is probing our address, then
2018 				 * we've crossed wires.  Declare failure.
2019 				 */
2020 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
2021 			}
2022 			goto done;
2023 		}
2024 		/*
2025 		 * This is a DAD probe.  Multicast the advertisement to the
2026 		 * all-nodes address.
2027 		 */
2028 		src = ipv6_all_hosts_mcast;
2029 	}
2030 	flag |= nce_advert_flags(our_nce);
2031 	/* Response to a solicitation */
2032 	(void) nce_xmit(ill,
2033 	    ND_NEIGHBOR_ADVERT,
2034 	    ill,	/* ill to be used for extracting ill_nd_lla */
2035 	    B_TRUE,	/* use ill_nd_lla */
2036 	    &target,	/* Source and target of the advertisement pkt */
2037 	    &src,	/* IP Destination (source of original pkt) */
2038 	    flag);
2039 done:
2040 	if (bad_solicit)
2041 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2042 	if (our_nce != NULL)
2043 		NCE_REFRELE(our_nce);
2044 }
2045 
2046 void
2047 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2048 {
2049 	nd_neighbor_advert_t *na;
2050 	uint32_t	hlen = ill->ill_nd_lla_len;
2051 	uchar_t		*haddr = NULL;
2052 	icmp6_t		*icmp_nd;
2053 	ip6_t		*ip6h;
2054 	nce_t		*dst_nce = NULL;
2055 	in6_addr_t	target;
2056 	nd_opt_hdr_t	*opt = NULL;
2057 	int		len;
2058 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2059 
2060 	ip6h = (ip6_t *)mp->b_rptr;
2061 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2062 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2063 	na = (nd_neighbor_advert_t *)icmp_nd;
2064 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2065 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2066 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2067 		    "solicited flag is not zero\n"));
2068 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2069 		return;
2070 	}
2071 	target = na->nd_na_target;
2072 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2073 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2074 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2075 		return;
2076 	}
2077 	if (len > sizeof (nd_neighbor_advert_t)) {
2078 		opt = (nd_opt_hdr_t *)&na[1];
2079 		if (!ndp_verify_optlen(opt,
2080 		    len - sizeof (nd_neighbor_advert_t))) {
2081 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2082 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2083 			return;
2084 		}
2085 		/* At this point we have a verified NA per spec */
2086 		len -= sizeof (nd_neighbor_advert_t);
2087 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2088 		if (opt != NULL) {
2089 			haddr = (uchar_t *)&opt[1];
2090 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2091 			    hlen == 0) {
2092 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2093 				BUMP_MIB(mib,
2094 				    ipv6IfIcmpInBadNeighborAdvertisements);
2095 				return;
2096 			}
2097 		}
2098 	}
2099 
2100 	/*
2101 	 * If this interface is part of the group look at all the
2102 	 * ills in the group.
2103 	 */
2104 	rw_enter(&ill_g_lock, RW_READER);
2105 	if (ill->ill_group != NULL)
2106 		ill = ill->ill_group->illgrp_ill;
2107 
2108 	for (; ill != NULL; ill = ill->ill_group_next) {
2109 		mutex_enter(&ill->ill_lock);
2110 		if (!ILL_CAN_LOOKUP(ill)) {
2111 			mutex_exit(&ill->ill_lock);
2112 			continue;
2113 		}
2114 		ill_refhold_locked(ill);
2115 		mutex_exit(&ill->ill_lock);
2116 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2117 		/* We have to drop the lock since ndp_process calls put* */
2118 		rw_exit(&ill_g_lock);
2119 		if (dst_nce != NULL) {
2120 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2121 			    dst_nce->nce_state == ND_PROBE) {
2122 				/*
2123 				 * Someone else sent an advertisement for an
2124 				 * address that we're trying to configure.
2125 				 * Tear it down.  Note that dl_mp might be NULL
2126 				 * if we're getting a unicast reply.  This
2127 				 * isn't typically done (multicast is the norm
2128 				 * in response to a probe), but ip_ndp_failure
2129 				 * will handle the dl_mp == NULL case as well.
2130 				 */
2131 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2132 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2133 				/*
2134 				 * Someone just announced one of our local
2135 				 * addresses.  If it wasn't us, then this is a
2136 				 * conflict.  Defend the address or shut it
2137 				 * down.
2138 				 */
2139 				if (dl_mp != NULL &&
2140 				    (haddr == NULL ||
2141 				    nce_cmp_ll_addr(dst_nce, haddr,
2142 				    ill->ill_nd_lla_len))) {
2143 					ip_ndp_conflict(ill, mp, dl_mp,
2144 					    dst_nce);
2145 				}
2146 			} else {
2147 				if (na->nd_na_flags_reserved &
2148 				    ND_NA_FLAG_ROUTER) {
2149 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2150 				}
2151 				/* B_TRUE indicates this an advertisement */
2152 				ndp_process(dst_nce, haddr,
2153 				    na->nd_na_flags_reserved, B_TRUE);
2154 			}
2155 			NCE_REFRELE(dst_nce);
2156 		}
2157 		rw_enter(&ill_g_lock, RW_READER);
2158 		ill_refrele(ill);
2159 	}
2160 	rw_exit(&ill_g_lock);
2161 }
2162 
2163 /*
2164  * Process NDP neighbor solicitation/advertisement messages.
2165  * The checksum has already checked o.k before reaching here.
2166  */
2167 void
2168 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2169 {
2170 	icmp6_t		*icmp_nd;
2171 	ip6_t		*ip6h;
2172 	int		len;
2173 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2174 
2175 
2176 	if (!pullupmsg(mp, -1)) {
2177 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2178 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2179 		goto done;
2180 	}
2181 	ip6h = (ip6_t *)mp->b_rptr;
2182 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2183 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2184 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2185 		goto done;
2186 	}
2187 	/*
2188 	 * NDP does not accept any extension headers between the
2189 	 * IP header and the ICMP header since e.g. a routing
2190 	 * header could be dangerous.
2191 	 * This assumes that any AH or ESP headers are removed
2192 	 * by ip prior to passing the packet to ndp_input.
2193 	 */
2194 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2195 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2196 		    ip6h->ip6_nxt));
2197 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2198 		goto done;
2199 	}
2200 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2201 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2202 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2203 	if (icmp_nd->icmp6_code != 0) {
2204 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2205 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2206 		goto done;
2207 	}
2208 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2209 	/*
2210 	 * Make sure packet length is large enough for either
2211 	 * a NS or a NA icmp packet.
2212 	 */
2213 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2214 		ip1dbg(("ndp_input: packet too short\n"));
2215 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2216 		goto done;
2217 	}
2218 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2219 		ndp_input_solicit(ill, mp, dl_mp);
2220 	} else {
2221 		ndp_input_advert(ill, mp, dl_mp);
2222 	}
2223 done:
2224 	freemsg(mp);
2225 }
2226 
2227 /*
2228  * nce_xmit is called to form and transmit a ND solicitation or
2229  * advertisement ICMP packet.
2230  *
2231  * If the source address is unspecified and this isn't a probe (used for
2232  * duplicate address detection), an appropriate source address and link layer
2233  * address will be chosen here.  The link layer address option is included if
2234  * the source is specified (i.e., all non-probe packets), and omitted (per the
2235  * specification) otherwise.
2236  *
2237  * It returns B_FALSE only if it does a successful put() to the
2238  * corresponding ill's ill_wq otherwise returns B_TRUE.
2239  */
2240 static boolean_t
2241 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2242     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2243     int flag)
2244 {
2245 	uint32_t	len;
2246 	icmp6_t 	*icmp6;
2247 	mblk_t		*mp;
2248 	ip6_t		*ip6h;
2249 	nd_opt_hdr_t	*opt;
2250 	uint_t		plen;
2251 	ip6i_t		*ip6i;
2252 	ipif_t		*src_ipif = NULL;
2253 	uint8_t		*hw_addr;
2254 
2255 	/*
2256 	 * If we have a unspecified source(sender) address, select a
2257 	 * proper source address for the solicitation here itself so
2258 	 * that we can initialize the h/w address correctly. This is
2259 	 * needed for interface groups as source address can come from
2260 	 * the whole group and the h/w address initialized from ill will
2261 	 * be wrong if the source address comes from a different ill.
2262 	 *
2263 	 * Note that the NA never comes here with the unspecified source
2264 	 * address. The following asserts that whenever the source
2265 	 * address is specified, the haddr also should be specified.
2266 	 */
2267 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2268 
2269 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2270 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2271 		/*
2272 		 * Pick a source address for this solicitation, but
2273 		 * restrict the selection to addresses assigned to the
2274 		 * output interface (or interface group).  We do this
2275 		 * because the destination will create a neighbor cache
2276 		 * entry for the source address of this packet, so the
2277 		 * source address had better be a valid neighbor.
2278 		 */
2279 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2280 		    IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
2281 		if (src_ipif == NULL) {
2282 			char buf[INET6_ADDRSTRLEN];
2283 
2284 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2285 			    inet_ntop(AF_INET6, (char *)target, buf,
2286 			    sizeof (buf))));
2287 			return (B_TRUE);
2288 		}
2289 		sender = &src_ipif->ipif_v6src_addr;
2290 		hwaddr_ill = src_ipif->ipif_ill;
2291 	}
2292 
2293 	/*
2294 	 * Always make sure that the NS/NA packets don't get load
2295 	 * spread. This is needed so that the probe packets sent
2296 	 * by the in.mpathd daemon can really go out on the desired
2297 	 * interface. Probe packets are made to go out on a desired
2298 	 * interface by including a ip6i with ATTACH_IF flag. As these
2299 	 * packets indirectly end up sending/receiving NS/NA packets
2300 	 * (neighbor doing NUD), we have to make sure that NA
2301 	 * also go out on the same interface.
2302 	 */
2303 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2304 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2305 	    plen * 8;
2306 	mp = allocb(len,  BPRI_LO);
2307 	if (mp == NULL) {
2308 		if (src_ipif != NULL)
2309 			ipif_refrele(src_ipif);
2310 		return (B_TRUE);
2311 	}
2312 	bzero((char *)mp->b_rptr, len);
2313 	mp->b_wptr = mp->b_rptr + len;
2314 
2315 	ip6i = (ip6i_t *)mp->b_rptr;
2316 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2317 	ip6i->ip6i_nxt = IPPROTO_RAW;
2318 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2319 	if (flag & NDP_PROBE)
2320 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2321 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2322 
2323 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2324 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2325 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2326 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2327 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2328 	ip6h->ip6_dst = *target;
2329 	icmp6 = (icmp6_t *)&ip6h[1];
2330 
2331 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2332 	    sizeof (nd_neighbor_advert_t));
2333 
2334 	if (operation == ND_NEIGHBOR_SOLICIT) {
2335 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2336 
2337 		if (!(flag & NDP_PROBE))
2338 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2339 		ip6h->ip6_src = *sender;
2340 		ns->nd_ns_target = *target;
2341 		if (!(flag & NDP_UNICAST)) {
2342 			/* Form multicast address of the target */
2343 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2344 			ip6h->ip6_dst.s6_addr32[3] |=
2345 			    ns->nd_ns_target.s6_addr32[3];
2346 		}
2347 	} else {
2348 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2349 
2350 		ASSERT(!(flag & NDP_PROBE));
2351 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2352 		ip6h->ip6_src = *sender;
2353 		na->nd_na_target = *sender;
2354 		if (flag & NDP_ISROUTER)
2355 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2356 		if (flag & NDP_SOLICITED)
2357 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2358 		if (flag & NDP_ORIDE)
2359 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2360 	}
2361 
2362 	hw_addr = NULL;
2363 	if (!(flag & NDP_PROBE)) {
2364 		mutex_enter(&hwaddr_ill->ill_lock);
2365 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2366 		    hwaddr_ill->ill_phys_addr;
2367 		if (hw_addr != NULL) {
2368 			/* Fill in link layer address and option len */
2369 			opt->nd_opt_len = (uint8_t)plen;
2370 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2371 		}
2372 		mutex_exit(&hwaddr_ill->ill_lock);
2373 	}
2374 	if (hw_addr == NULL) {
2375 		/* If there's no link layer address option, then strip it. */
2376 		len -= plen * 8;
2377 		mp->b_wptr = mp->b_rptr + len;
2378 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2379 	}
2380 
2381 	icmp6->icmp6_type = (uint8_t)operation;
2382 	icmp6->icmp6_code = 0;
2383 	/*
2384 	 * Prepare for checksum by putting icmp length in the icmp
2385 	 * checksum field. The checksum is calculated in ip_wput_v6.
2386 	 */
2387 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2388 
2389 	if (src_ipif != NULL)
2390 		ipif_refrele(src_ipif);
2391 	if (canput(ill->ill_wq)) {
2392 		put(ill->ill_wq, mp);
2393 		return (B_FALSE);
2394 	}
2395 	freemsg(mp);
2396 	return (B_TRUE);
2397 }
2398 
2399 /*
2400  * Make a link layer address (does not include the SAP) from an nce.
2401  * To form the link layer address, use the last four bytes of ipv6
2402  * address passed in and the fixed offset stored in nce.
2403  */
2404 static void
2405 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2406 {
2407 	uchar_t *mask, *to;
2408 	ill_t	*ill = nce->nce_ill;
2409 	int 	len;
2410 
2411 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2412 		return;
2413 	ASSERT(nce->nce_res_mp != NULL);
2414 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2415 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2416 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2417 	ASSERT(addr != NULL);
2418 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2419 	    addrpos, ill->ill_nd_lla_len);
2420 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2421 	    IPV6_ADDR_LEN);
2422 	mask = (uchar_t *)&nce->nce_extract_mask;
2423 	mask += (IPV6_ADDR_LEN - len);
2424 	addr += (IPV6_ADDR_LEN - len);
2425 	to = addrpos + nce->nce_ll_extract_start;
2426 	while (len-- > 0)
2427 		*to++ |= *mask++ & *addr++;
2428 }
2429 
2430 /*
2431  * Pass a cache report back out via NDD.
2432  */
2433 /* ARGSUSED */
2434 int
2435 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
2436 {
2437 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
2438 			"     proto addr/mask");
2439 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp);
2440 	return (0);
2441 }
2442 
2443 /*
2444  * Add a single line to the NDP Cache Entry Report.
2445  */
2446 static void
2447 nce_report1(nce_t *nce, uchar_t *mp_arg)
2448 {
2449 	ill_t		*ill = nce->nce_ill;
2450 	char		local_buf[INET6_ADDRSTRLEN];
2451 	uchar_t		flags_buf[10];
2452 	uint32_t	flags = nce->nce_flags;
2453 	mblk_t		*mp = (mblk_t *)mp_arg;
2454 	uchar_t		*h;
2455 	uchar_t		*m = flags_buf;
2456 	in6_addr_t	v6addr;
2457 
2458 	/*
2459 	 * Lock the nce to protect nce_res_mp from being changed
2460 	 * if an external resolver address resolution completes
2461 	 * while nce_res_mp is being accessed here.
2462 	 *
2463 	 * Deal with all address formats, not just Ethernet-specific
2464 	 * In addition, make sure that the mblk has enough space
2465 	 * before writing to it. If is doesn't, allocate a new one.
2466 	 */
2467 	if (nce->nce_ipversion == IPV4_VERSION)
2468 		/* Don't include v4 nce_ts in NDP cache entry report */
2469 		return;
2470 
2471 	ASSERT(ill != NULL);
2472 	v6addr = nce->nce_mask;
2473 	if (flags & NCE_F_PERMANENT)
2474 		*m++ = 'P';
2475 	if (flags & NCE_F_ISROUTER)
2476 		*m++ = 'R';
2477 	if (flags & NCE_F_MAPPING)
2478 		*m++ = 'M';
2479 	*m = '\0';
2480 
2481 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
2482 		size_t		addrlen;
2483 		char		*addr_buf;
2484 		dl_unitdata_req_t	*dl;
2485 
2486 		mutex_enter(&nce->nce_lock);
2487 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2488 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
2489 		if (ill->ill_flags & ILLF_XRESOLV)
2490 			addrlen = (3 * (dl->dl_dest_addr_length));
2491 		else
2492 			addrlen = (3 * (ill->ill_nd_lla_len));
2493 		if (addrlen <= 0) {
2494 			mutex_exit(&nce->nce_lock);
2495 			(void) mi_mpprintf(mp,
2496 			    "%8s %9s %5s %s/%d",
2497 			    ill->ill_name,
2498 			    "None",
2499 			    (uchar_t *)&flags_buf,
2500 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2501 				(char *)local_buf, sizeof (local_buf)),
2502 				ip_mask_to_plen_v6(&v6addr));
2503 		} else {
2504 			/*
2505 			 * Convert the hardware/lla address to ascii
2506 			 */
2507 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
2508 			if (addr_buf == NULL) {
2509 				mutex_exit(&nce->nce_lock);
2510 				return;
2511 			}
2512 			(void) mac_colon_addr((uint8_t *)h,
2513 			    (ill->ill_flags & ILLF_XRESOLV) ?
2514 			    dl->dl_dest_addr_length : ill->ill_nd_lla_len,
2515 			    addr_buf, addrlen);
2516 			mutex_exit(&nce->nce_lock);
2517 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
2518 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
2519 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2520 				(char *)local_buf, sizeof (local_buf)),
2521 				ip_mask_to_plen_v6(&v6addr));
2522 			kmem_free(addr_buf, addrlen);
2523 		}
2524 	} else {
2525 		(void) mi_mpprintf(mp,
2526 		    "%8s %9s %5s %s/%d",
2527 		    ill->ill_name,
2528 		    "None",
2529 		    (uchar_t *)&flags_buf,
2530 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2531 			(char *)local_buf, sizeof (local_buf)),
2532 			ip_mask_to_plen_v6(&v6addr));
2533 	}
2534 }
2535 
2536 mblk_t *
2537 nce_udreq_alloc(ill_t *ill)
2538 {
2539 	mblk_t	*template_mp = NULL;
2540 	dl_unitdata_req_t *dlur;
2541 	int	sap_length;
2542 
2543 	ASSERT(ill->ill_isv6);
2544 
2545 	sap_length = ill->ill_sap_length;
2546 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2547 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2548 	if (template_mp == NULL)
2549 		return (NULL);
2550 
2551 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2552 	dlur->dl_priority.dl_min = 0;
2553 	dlur->dl_priority.dl_max = 0;
2554 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2555 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2556 
2557 	/* Copy in the SAP value. */
2558 	NCE_LL_SAP_COPY(ill, template_mp);
2559 
2560 	return (template_mp);
2561 }
2562 
2563 /*
 * NDP retransmit timer.  arg is the nce_t the timer was armed for.
 *
 * What the timer does depends on the state the entry is in when it fires:
 *
 * ND_DELAY	The entry moves to ND_PROBE, the first unicast solicitation
 *		is sent and the timer is restarted with
 *		ill_reachable_retrans_time.
 * ND_PROBE	Retransmit timer.  nce_pcnt is decremented and then
 *		  > 0: another solicitation is sent (a DAD probe for a
 *		       permanent entry, a unicast NS otherwise);
 *		  < 0: the entry is marked ND_UNREACHABLE and deleted;
 *		  = 0: a non-permanent entry waits one more interval; a
 *		       permanent (local) entry has finished probing and
 *		       becomes ND_REACHABLE.  If the interface is running
 *		       the address is also marked ready and announced.
 * ND_INCOMPLETE Resolver retransmit timer.  If packets are still queued
 *		on the entry, nce_solicit() is called; a return of 0 ends
 *		resolution, anything else is the delay before the next try.
 * ND_REACHABLE	Sends the remaining unsolicited advertisements and, for
 *		permanent entries, the periodic defense advertisement.
 *
 * Locking: ill_g_lock (reader) is taken before nce_lock.  Every case
 * drops ill_g_lock before transmitting; only ND_INCOMPLETE keeps it across
 * nce_solicit().  A reference is taken on the nce at entry and released
 * on every path out.
 */
2569 void
2570 ndp_timer(void *arg)
2571 {
2572 	nce_t		*nce = arg;
2573 	ill_t		*ill = nce->nce_ill;
2574 	uint32_t	ms;
2575 	char		addrbuf[INET6_ADDRSTRLEN];
2576 	mblk_t		*mp;
2577 	boolean_t	dropped = B_FALSE;
2578 
2579 	/*
2580 	 * The timer has to be cancelled by ndp_delete before doing the final
2581 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2582 	 * until it clears the timeout_id. Before clearing the timeout_id
2583 	 * bump up the refcnt so that we can continue to use the nce
2584 	 */
2585 	ASSERT(nce != NULL);
2586 
2587 	/*
2588 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2589 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2590 	 */
2591 	rw_enter(&ill_g_lock, RW_READER);
2592 	mutex_enter(&nce->nce_lock);
2593 	NCE_REFHOLD_LOCKED(nce);
2594 	nce->nce_timeout_id = 0;
2595 
2596 	/*
2597 	 * Check the reachability state first.
2598 	 */
2599 	switch (nce->nce_state) {
2600 	case ND_DELAY:
		/* Delay is over: start probing with a unicast solicitation. */
2601 		rw_exit(&ill_g_lock);
2602 		nce->nce_state = ND_PROBE;
2603 		mutex_exit(&nce->nce_lock);
2604 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2605 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2606 		if (ip_debug > 3) {
2607 			/* ip2dbg */
2608 			pr_addr_dbg("ndp_timer: state for %s changed "
2609 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2610 		}
2611 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2612 		NCE_REFRELE(nce);
2613 		return;
2614 	case ND_PROBE:
2615 		/* must be retransmit timer */
2616 		rw_exit(&ill_g_lock);
2617 		nce->nce_pcnt--;
2618 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2619 		    nce->nce_pcnt >= -1);
2620 		if (nce->nce_pcnt > 0) {
2621 			/*
2622 			 * As per RFC2461, the nce gets deleted after
2623 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2624 			 * Note that the first unicast solicitation is sent
2625 			 * during the DELAY state.
2626 			 */
2627 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2628 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2629 			    addrbuf, sizeof (addrbuf))));
2630 			mutex_exit(&nce->nce_lock);
			/*
			 * Permanent entries are our own addresses, so the
			 * solicitation is sent as a DAD probe (NDP_PROBE);
			 * other entries get a unicast NS.
			 */
2631 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2632 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2633 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2634 			    NDP_UNICAST);
			/* Nothing was sent: don't count this attempt. */
2635 			if (dropped) {
2636 				mutex_enter(&nce->nce_lock);
2637 				nce->nce_pcnt++;
2638 				mutex_exit(&nce->nce_lock);
2639 			}
2640 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2641 		} else if (nce->nce_pcnt < 0) {
2642 			/* No hope, delete the nce */
2643 			nce->nce_state = ND_UNREACHABLE;
2644 			mutex_exit(&nce->nce_lock);
2645 			if (ip_debug > 2) {
2646 				/* ip1dbg */
2647 				pr_addr_dbg("ndp_timer: Delete IRE for"
2648 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2649 			}
2650 			ndp_delete(nce);
2651 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2652 			/* Wait RetransTimer, before deleting the entry */
2653 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2654 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2655 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2656 			mutex_exit(&nce->nce_lock);
2657 			/* Wait one interval before killing */
2658 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2659 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2660 			ipif_t *ipif;
2661 
2662 			/*
2663 			 * We're done probing, and we can now declare this
2664 			 * address to be usable.  Let IP know that it's ok to
2665 			 * use.
2666 			 */
2667 			nce->nce_state = ND_REACHABLE;
2668 			mutex_exit(&nce->nce_lock);
2669 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2670 			    ALL_ZONES, NULL, NULL, NULL, NULL);
2671 			if (ipif != NULL) {
				/*
				 * The address had earlier been marked as a
				 * duplicate; log that it is usable again,
				 * naming the logical interface ("name:id")
				 * when it isn't the zeroth one.
				 */
2672 				if (ipif->ipif_was_dup) {
2673 					char ibuf[LIFNAMSIZ + 10];
2674 					char sbuf[INET6_ADDRSTRLEN];
2675 
2676 					ipif->ipif_was_dup = B_FALSE;
2677 					(void) strlcpy(ibuf, ill->ill_name,
2678 					    sizeof (ibuf));
2679 					(void) inet_ntop(AF_INET6,
2680 					    &ipif->ipif_v6lcl_addr,
2681 					    sbuf, sizeof (sbuf));
2682 					if (ipif->ipif_id != 0) {
2683 						(void) snprintf(ibuf +
2684 						    ill->ill_name_length - 1,
2685 						    sizeof (ibuf) -
2686 						    ill->ill_name_length + 1,
2687 						    ":%d", ipif->ipif_id);
2688 					}
2689 					cmn_err(CE_NOTE, "recovered address "
2690 					    "%s on %s", sbuf, ibuf);
2691 				}
				/*
				 * First time the address becomes ready on an
				 * interface that is up: send the routing
				 * socket messages and tell SCTP.
				 */
2692 				if ((ipif->ipif_flags & IPIF_UP) &&
2693 				    !ipif->ipif_addr_ready) {
2694 					ip_rts_ifmsg(ipif);
2695 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2696 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2697 				}
2698 				ipif->ipif_addr_ready = 1;
2699 				ipif_refrele(ipif);
2700 			}
2701 			/* Begin defending our new address */
2702 			nce->nce_unsolicit_count = 0;
2703 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2704 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2705 			    nce_advert_flags(nce));
			/*
			 * If the announcement was dropped, leave one
			 * unsolicited advertisement pending so the
			 * ND_REACHABLE case retries it; otherwise just
			 * schedule the next defense, if enabled.
			 */
2706 			if (dropped) {
2707 				nce->nce_unsolicit_count = 1;
2708 				NDP_RESTART_TIMER(nce,
2709 				    ip_ndp_unsolicit_interval);
2710 			} else if (ip_ndp_defense_interval != 0) {
2711 				NDP_RESTART_TIMER(nce, ip_ndp_defense_interval);
2712 			}
2713 		} else {
2714 			/*
2715 			 * This is an address we're probing to be our own, but
2716 			 * the ill is down.  Wait until it comes back before
2717 			 * doing anything, but switch to reachable state so
2718 			 * that the restart will work.
2719 			 */
2720 			nce->nce_state = ND_REACHABLE;
2721 			mutex_exit(&nce->nce_lock);
2722 		}
2723 		NCE_REFRELE(nce);
2724 		return;
2725 	case ND_INCOMPLETE:
2726 		/*
2727 		 * Must be resolvers retransmit timer.
2728 		 */
2729 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2730 			ip6i_t	*ip6i;
2731 			ip6_t	*ip6h;
2732 			mblk_t *data_mp;
2733 
2734 			/*
2735 			 * Walk the list of packets queued, and see if there
2736 			 * are any multipathing probe packets. Such packets
2737 			 * are always queued at the head. Since this is a
2738 			 * retransmit timer firing, mark such packets as
2739 			 * delayed in ND resolution. This info will be used
2740 			 * in ip_wput_v6(). Multipathing probe packets will
2741 			 * always have an ip6i_t. Once we hit a packet without
2742 			 * it, we can break out of this loop.
2743 			 */
2744 			if (mp->b_datap->db_type == M_CTL)
2745 				data_mp = mp->b_cont;
2746 			else
2747 				data_mp = mp;
2748 
			/* A leading ip6i_t is recognized by IPPROTO_RAW. */
2749 			ip6h = (ip6_t *)data_mp->b_rptr;
2750 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2751 				break;
2752 
2753 			/*
2754 			 * This message should have been pulled up already in
2755 			 * ip_wput_v6. We can't do pullups here because the
2756 			 * b_next/b_prev is non-NULL.
2757 			 */
2758 			ip6i = (ip6i_t *)ip6h;
2759 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2760 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2761 
2762 			/* Mark this packet as delayed due to ND resolution */
2763 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2764 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2765 		}
		/*
		 * Only keep soliciting while something is waiting on the
		 * answer.  nce_solicit() is called with both ill_g_lock and
		 * nce_lock held.
		 */
2766 		if (nce->nce_qd_mp != NULL) {
2767 			ms = nce_solicit(nce, NULL);
2768 			rw_exit(&ill_g_lock);
2769 			if (ms == 0) {
				/*
				 * No further retransmission.  Unless the
				 * entry became reachable in the meantime,
				 * resolution has failed: fail the queued
				 * packets and delete the entry.
				 */
2770 				if (nce->nce_state != ND_REACHABLE) {
2771 					mutex_exit(&nce->nce_lock);
2772 					nce_resolv_failed(nce);
2773 					ndp_delete(nce);
2774 				} else {
2775 					mutex_exit(&nce->nce_lock);
2776 				}
2777 			} else {
2778 				mutex_exit(&nce->nce_lock);
2779 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2780 			}
2781 			NCE_REFRELE(nce);
2782 			return;
2783 		}
2784 		mutex_exit(&nce->nce_lock);
2785 		rw_exit(&ill_g_lock);
2786 		NCE_REFRELE(nce);
2787 		break;
2788 	case ND_REACHABLE :
2789 		rw_exit(&ill_g_lock);
		/*
		 * There is something to send if unsolicited advertisements
		 * are still owed for this entry, or if it is one of our own
		 * addresses and periodic defense is enabled.
		 */
2790 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2791 		    nce->nce_unsolicit_count != 0) ||
2792 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2793 		    ip_ndp_defense_interval != 0)) {
2794 			if (nce->nce_unsolicit_count > 0)
2795 				nce->nce_unsolicit_count--;
2796 			mutex_exit(&nce->nce_lock);
2797 			dropped = nce_xmit(ill,
2798 			    ND_NEIGHBOR_ADVERT,
2799 			    ill,	/* ill to be used for hw addr */
2800 			    B_FALSE,	/* use ill_phys_addr */
2801 			    &nce->nce_addr,
2802 			    &ipv6_all_hosts_mcast,
2803 			    nce_advert_flags(nce));
			/* Nothing was sent: the advertisement is still owed. */
2804 			if (dropped) {
2805 				mutex_enter(&nce->nce_lock);
2806 				nce->nce_unsolicit_count++;
2807 				mutex_exit(&nce->nce_lock);
2808 			}
			/*
			 * Use the short interval while advertisements are
			 * pending, the defense interval afterwards.
			 */
2809 			if (nce->nce_unsolicit_count != 0) {
2810 				NDP_RESTART_TIMER(nce,
2811 				    ip_ndp_unsolicit_interval);
2812 			} else {
2813 				NDP_RESTART_TIMER(nce,
2814 				    ip_ndp_defense_interval);
2815 			}
2816 		} else {
2817 			mutex_exit(&nce->nce_lock);
2818 		}
2819 		NCE_REFRELE(nce);
2820 		break;
2821 	default:
		/* Nothing is timer-driven in any other state. */
2822 		rw_exit(&ill_g_lock);
2823 		mutex_exit(&nce->nce_lock);
2824 		NCE_REFRELE(nce);
2825 		break;
2826 	}
2827 }
2828 
2829 /*
2830  * Set a link layer address from the ll_addr passed in.
2831  * Copy SAP from ill.
2832  */
2833 static void
2834 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2835 {
2836 	ill_t	*ill = nce->nce_ill;
2837 	uchar_t	*woffset;
2838 
2839 	ASSERT(ll_addr != NULL);
2840 	/* Always called before fast_path_probe */
2841 	ASSERT(nce->nce_fp_mp == NULL);
2842 	if (ill->ill_sap_length != 0) {
2843 		/*
2844 		 * Copy the SAP type specified in the
2845 		 * request into the xmit template.
2846 		 */
2847 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2848 	}
2849 	if (ill->ill_phys_addr_length > 0) {
2850 		/*
2851 		 * The bcopy() below used to be called for the physical address
2852 		 * length rather than the link layer address length. For
2853 		 * ethernet and many other media, the phys_addr and lla are
2854 		 * identical.
2855 		 * However, with xresolv interfaces being introduced, the
2856 		 * phys_addr and lla are no longer the same, and the physical
2857 		 * address may not have any useful meaning, so we use the lla
2858 		 * for IPv6 address resolution and destination addressing.
2859 		 *
2860 		 * For PPP or other interfaces with a zero length
2861 		 * physical address, don't do anything here.
2862 		 * The bcopy() with a zero phys_addr length was previously
2863 		 * a no-op for interfaces with a zero-length physical address.
2864 		 * Using the lla for them would change the way they operate.
2865 		 * Doing nothing in such cases preserves expected behavior.
2866 		 */
2867 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2868 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2869 	}
2870 }
2871 
2872 static boolean_t
2873 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2874 {
2875 	ill_t	*ill = nce->nce_ill;
2876 	uchar_t	*ll_offset;
2877 
2878 	ASSERT(nce->nce_res_mp != NULL);
2879 	if (ll_addr == NULL)
2880 		return (B_FALSE);
2881 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2882 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2883 		return (B_TRUE);
2884 	return (B_FALSE);
2885 }
2886 
2887 /*
2888  * Updates the link layer address or the reachability state of
2889  * a cache entry.  Reset probe counter if needed.
2890  */
2891 static void
2892 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2893 {
2894 	ill_t	*ill = nce->nce_ill;
2895 	boolean_t need_stop_timer = B_FALSE;
2896 	boolean_t need_fastpath_update = B_FALSE;
2897 
2898 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2899 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2900 	/*
2901 	 * If this interface does not do NUD, there is no point
2902 	 * in allowing an update to the cache entry.  Although
2903 	 * we will respond to NS.
2904 	 * The only time we accept an update for a resolver when
2905 	 * NUD is turned off is when it has just been created.
2906 	 * Non-Resolvers will always be created as REACHABLE.
2907 	 */
2908 	if (new_state != ND_UNCHANGED) {
2909 		if ((nce->nce_flags & NCE_F_NONUD) &&
2910 		    (nce->nce_state != ND_INCOMPLETE))
2911 			return;
2912 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2913 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2914 		need_stop_timer = B_TRUE;
2915 		if (new_state == ND_REACHABLE)
2916 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2917 		else {
2918 			/* We force NUD in this case */
2919 			nce->nce_last = 0;
2920 		}
2921 		nce->nce_state = new_state;
2922 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2923 	}
2924 	/*
2925 	 * In case of fast path we need to free the the fastpath
2926 	 * M_DATA and do another probe.  Otherwise we can just
2927 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2928 	 * whatever packets that happens to be transmitting at the time.
2929 	 */
2930 	if (new_ll_addr != NULL) {
2931 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2932 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2933 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2934 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2935 		if (nce->nce_fp_mp != NULL) {
2936 			freemsg(nce->nce_fp_mp);
2937 			nce->nce_fp_mp = NULL;
2938 		}
2939 		need_fastpath_update = B_TRUE;
2940 	}
2941 	mutex_exit(&nce->nce_lock);
2942 	if (need_stop_timer) {
2943 		(void) untimeout(nce->nce_timeout_id);
2944 		nce->nce_timeout_id = 0;
2945 	}
2946 	if (need_fastpath_update)
2947 		nce_fastpath(nce);
2948 	mutex_enter(&nce->nce_lock);
2949 }
2950 
2951 void
2952 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2953 {
2954 	uint_t	count = 0;
2955 	mblk_t  **mpp;
2956 
2957 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2958 
2959 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2960 	    mpp = &(*mpp)->b_next) {
2961 		if (++count >
2962 		    nce->nce_ill->ill_max_buf) {
2963 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2964 
2965 			nce->nce_qd_mp->b_next = NULL;
2966 			nce->nce_qd_mp->b_prev = NULL;
2967 			freemsg(nce->nce_qd_mp);
2968 			nce->nce_qd_mp = tmp;
2969 		}
2970 	}
2971 	/* put this on the list */
2972 	if (head_insert) {
2973 		mp->b_next = nce->nce_qd_mp;
2974 		nce->nce_qd_mp = mp;
2975 	} else {
2976 		*mpp = mp;
2977 	}
2978 }
2979 
2980 static void
2981 nce_queue_mp(nce_t *nce, mblk_t *mp)
2982 {
2983 	boolean_t head_insert = B_FALSE;
2984 	ip6_t	*ip6h;
2985 	ip6i_t	*ip6i;
2986 	mblk_t *data_mp;
2987 
2988 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2989 
2990 	if (mp->b_datap->db_type == M_CTL)
2991 		data_mp = mp->b_cont;
2992 	else
2993 		data_mp = mp;
2994 	ip6h = (ip6_t *)data_mp->b_rptr;
2995 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2996 		/*
2997 		 * This message should have been pulled up already in
2998 		 * ip_wput_v6. We can't do pullups here because the message
2999 		 * could be from the nce_qd_mp which could have b_next/b_prev
3000 		 * non-NULL.
3001 		 */
3002 		ip6i = (ip6i_t *)ip6h;
3003 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
3004 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3005 		/*
3006 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
3007 		 * This has 2 aspects mentioned below.
3008 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
3009 		 * This ensures that next retransmit of ND solicitation
3010 		 * will use the interface specified by the probe packet,
3011 		 * for both NS and NA. This corresponds to the src address
3012 		 * in the IPv6 packet. If we insert at tail, we will be
3013 		 * depending on the packet at the head for successful
3014 		 * ND resolution. This is not reliable, because the interface
3015 		 * on which the NA arrives could be different from the interface
3016 		 * on which the NS was sent, and if the receiving interface is
3017 		 * failed, it will appear that the sending interface is also
3018 		 * failed, causing in.mpathd to misdiagnose this as link
3019 		 * failure.
3020 		 * 2. Drop the original packet, if the ND resolution did not
3021 		 * succeed in the first attempt. However we will create the
3022 		 * nce and the ire, as soon as the ND resolution succeeds.
3023 		 * We don't gain anything by queueing multiple probe packets
3024 		 * and sending them back-to-back once resolution succeeds.
3025 		 * It is sufficient to send just 1 packet after ND resolution
3026 		 * succeeds. Since mpathd is sending down probe packets at a
3027 		 * constant rate, we don't need to send the queued packet. We
3028 		 * need to queue it only for NDP resolution. The benefit of
3029 		 * dropping the probe packets that were delayed in ND
3030 		 * resolution, is that in.mpathd will not see inflated
3031 		 * RTT. If the ND resolution does not succeed within
3032 		 * in.mpathd's failure detection time, mpathd may detect
3033 		 * a failure, and it does not matter whether the packet
3034 		 * was queued or dropped.
3035 		 */
3036 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
3037 			head_insert = B_TRUE;
3038 	}
3039 
3040 	nce_queue_mp_common(nce, mp, head_insert);
3041 }
3042 
3043 /*
3044  * Called when address resolution failed due to a timeout.
3045  * Send an ICMP unreachable in response to all queued packets.
3046  */
3047 void
3048 nce_resolv_failed(nce_t *nce)
3049 {
3050 	mblk_t	*mp, *nxt_mp, *first_mp;
3051 	char	buf[INET6_ADDRSTRLEN];
3052 	ip6_t *ip6h;
3053 	zoneid_t zoneid = GLOBAL_ZONEID;
3054 
3055 	ip1dbg(("nce_resolv_failed: dst %s\n",
3056 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3057 	mutex_enter(&nce->nce_lock);
3058 	mp = nce->nce_qd_mp;
3059 	nce->nce_qd_mp = NULL;
3060 	mutex_exit(&nce->nce_lock);
3061 	while (mp != NULL) {
3062 		nxt_mp = mp->b_next;
3063 		mp->b_next = NULL;
3064 		mp->b_prev = NULL;
3065 
3066 		first_mp = mp;
3067 		if (mp->b_datap->db_type == M_CTL) {
3068 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3069 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3070 			zoneid = io->ipsec_out_zoneid;
3071 			ASSERT(zoneid != ALL_ZONES);
3072 			mp = mp->b_cont;
3073 		}
3074 
3075 		ip6h = (ip6_t *)mp->b_rptr;
3076 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3077 			ip6i_t *ip6i;
3078 			/*
3079 			 * This message should have been pulled up already
3080 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3081 			 * the header is pulled up.
3082 			 */
3083 			ip6i = (ip6i_t *)ip6h;
3084 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3085 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3086 			mp->b_rptr += sizeof (ip6i_t);
3087 		}
3088 		/*
3089 		 * Ignore failure since icmp_unreachable_v6 will silently
3090 		 * drop packets with an unspecified source address.
3091 		 */
3092 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
3093 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3094 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid);
3095 		mp = nxt_mp;
3096 	}
3097 }
3098 
3099 /*
3100  * Called by SIOCSNDP* ioctl to add/change an nce entry
3101  * and the corresponding attributes.
3102  * Disallow states other than ND_REACHABLE or ND_STALE.
3103  */
3104 int
3105 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3106 {
3107 	sin6_t		*sin6;
3108 	in6_addr_t	*addr;
3109 	nce_t		*nce;
3110 	int		err;
3111 	uint16_t	new_flags = 0;
3112 	uint16_t	old_flags = 0;
3113 	int		inflags = lnr->lnr_flags;
3114 
3115 	ASSERT(ill->ill_isv6);
3116 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3117 	    (lnr->lnr_state_create != ND_STALE))
3118 		return (EINVAL);
3119 
3120 	sin6 = (sin6_t *)&lnr->lnr_addr;
3121 	addr = &sin6->sin6_addr;
3122 
3123 	mutex_enter(&ndp6.ndp_g_lock);
3124 	/* We know it can not be mapping so just look in the hash table */
3125 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr));
3126 	nce = nce_lookup_addr(ill, addr, nce);
3127 	if (nce != NULL)
3128 		new_flags = nce->nce_flags;
3129 
3130 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3131 	case NDF_ISROUTER_ON:
3132 		new_flags |= NCE_F_ISROUTER;
3133 		break;
3134 	case NDF_ISROUTER_OFF:
3135 		new_flags &= ~NCE_F_ISROUTER;
3136 		break;
3137 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3138 		mutex_exit(&ndp6.ndp_g_lock);
3139 		if (nce != NULL)
3140 			NCE_REFRELE(nce);
3141 		return (EINVAL);
3142 	}
3143 
3144 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3145 	case NDF_ANYCAST_ON:
3146 		new_flags |= NCE_F_ANYCAST;
3147 		break;
3148 	case NDF_ANYCAST_OFF:
3149 		new_flags &= ~NCE_F_ANYCAST;
3150 		break;
3151 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3152 		mutex_exit(&ndp6.ndp_g_lock);
3153 		if (nce != NULL)
3154 			NCE_REFRELE(nce);
3155 		return (EINVAL);
3156 	}
3157 
3158 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
3159 	case NDF_PROXY_ON:
3160 		new_flags |= NCE_F_PROXY;
3161 		break;
3162 	case NDF_PROXY_OFF:
3163 		new_flags &= ~NCE_F_PROXY;
3164 		break;
3165 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
3166 		mutex_exit(&ndp6.ndp_g_lock);
3167 		if (nce != NULL)
3168 			NCE_REFRELE(nce);
3169 		return (EINVAL);
3170 	}
3171 
3172 	if (nce == NULL) {
3173 		err = ndp_add(ill,
3174 		    (uchar_t *)lnr->lnr_hdw_addr,
3175 		    addr,
3176 		    &ipv6_all_ones,
3177 		    &ipv6_all_zeros,
3178 		    0,
3179 		    new_flags,
3180 		    lnr->lnr_state_create,
3181 		    &nce,
3182 		    NULL,
3183 		    NULL);
3184 		if (err != 0) {
3185 			mutex_exit(&ndp6.ndp_g_lock);
3186 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3187 			return (err);
3188 		}
3189 	}
3190 	old_flags = nce->nce_flags;
3191 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3192 		/*
3193 		 * Router turned to host, delete all ires.
3194 		 * XXX Just delete the entry, but we need to add too.
3195 		 */
3196 		nce->nce_flags &= ~NCE_F_ISROUTER;
3197 		mutex_exit(&ndp6.ndp_g_lock);
3198 		ndp_delete(nce);
3199 		NCE_REFRELE(nce);
3200 		return (0);
3201 	}
3202 	mutex_exit(&ndp6.ndp_g_lock);
3203 
3204 	mutex_enter(&nce->nce_lock);
3205 	nce->nce_flags = new_flags;
3206 	mutex_exit(&nce->nce_lock);
3207 	/*
3208 	 * Note that we ignore the state at this point, which
3209 	 * should be either STALE or REACHABLE.  Instead we let
3210 	 * the link layer address passed in to determine the state
3211 	 * much like incoming packets.
3212 	 */
3213 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3214 	NCE_REFRELE(nce);
3215 	return (0);
3216 }
3217 
3218 /*
3219  * If the device driver supports it, we make nce_fp_mp to have
3220  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3221  * The caller insures there is hold on nce for this function.
3222  * Note that since ill_fastpath_probe() copies the mblk there is
3223  * no need for the hold beyond this function.
3224  */
3225 static void
3226 nce_fastpath(nce_t *nce)
3227 {
3228 	ill_t	*ill = nce->nce_ill;
3229 	int res;
3230 
3231 	ASSERT(ill != NULL);
3232 	if (nce->nce_fp_mp != NULL) {
3233 		/* Already contains fastpath info */
3234 		return;
3235 	}
3236 	if (nce->nce_res_mp != NULL) {
3237 		nce_fastpath_list_add(nce);
3238 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3239 		/*
3240 		 * EAGAIN is an indication of a transient error
3241 		 * i.e. allocation failure etc. leave the nce in the list it
3242 		 * will be updated when another probe happens for another ire
3243 		 * if not it will be taken out of the list when the ire is
3244 		 * deleted.
3245 		 */
3246 
3247 		if (res != 0 && res != EAGAIN)
3248 			nce_fastpath_list_delete(nce);
3249 	}
3250 }
3251 
3252 /*
3253  * Drain the list of nce's waiting for fastpath response.
3254  */
3255 void
3256 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3257     void *arg)
3258 {
3259 
3260 	nce_t *next_nce;
3261 	nce_t *current_nce;
3262 	nce_t *first_nce;
3263 	nce_t *prev_nce = NULL;
3264 
3265 	ASSERT(ill != NULL && ill->ill_isv6);
3266 
3267 	mutex_enter(&ill->ill_lock);
3268 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3269 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3270 		next_nce = current_nce->nce_fastpath;
3271 		/*
3272 		 * Take it off the list if we're flushing, or if the callback
3273 		 * routine tells us to do so.  Otherwise, leave the nce in the
3274 		 * fastpath list to handle any pending response from the lower
3275 		 * layer.  We can't drain the list when the callback routine
3276 		 * comparison failed, because the response is asynchronous in
3277 		 * nature, and may not arrive in the same order as the list
3278 		 * insertion.
3279 		 */
3280 		if (func == NULL || func(current_nce, arg)) {
3281 			current_nce->nce_fastpath = NULL;
3282 			if (current_nce == first_nce)
3283 				ill->ill_fastpath_list = first_nce = next_nce;
3284 			else
3285 				prev_nce->nce_fastpath = next_nce;
3286 		} else {
3287 			/* previous element that is still in the list */
3288 			prev_nce = current_nce;
3289 		}
3290 		current_nce = next_nce;
3291 	}
3292 	mutex_exit(&ill->ill_lock);
3293 }
3294 
3295 /*
3296  * Add nce to the nce fastpath list.
3297  */
3298 void
3299 nce_fastpath_list_add(nce_t *nce)
3300 {
3301 	ill_t *ill;
3302 
3303 	ill = nce->nce_ill;
3304 	ASSERT(ill != NULL && ill->ill_isv6);
3305 
3306 	mutex_enter(&ill->ill_lock);
3307 	mutex_enter(&nce->nce_lock);
3308 
3309 	/*
3310 	 * if nce has not been deleted and
3311 	 * is not already in the list add it.
3312 	 */
3313 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3314 	    (nce->nce_fastpath == NULL)) {
3315 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3316 		ill->ill_fastpath_list = nce;
3317 	}
3318 
3319 	mutex_exit(&nce->nce_lock);
3320 	mutex_exit(&ill->ill_lock);
3321 }
3322 
3323 /*
3324  * remove nce from the nce fastpath list.
3325  */
3326 void
3327 nce_fastpath_list_delete(nce_t *nce)
3328 {
3329 	nce_t *nce_ptr;
3330 
3331 	ill_t *ill;
3332 
3333 	ill = nce->nce_ill;
3334 	ASSERT(ill != NULL);
3335 	if (!ill->ill_isv6)  {
3336 		/*
3337 		 * v4 nce_t's do not have nce_fastpath set.
3338 		 */
3339 		return;
3340 	}
3341 
3342 	mutex_enter(&ill->ill_lock);
3343 	if (nce->nce_fastpath == NULL)
3344 		goto done;
3345 
3346 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3347 
3348 	if (ill->ill_fastpath_list == nce) {
3349 		ill->ill_fastpath_list = nce->nce_fastpath;
3350 	} else {
3351 		nce_ptr = ill->ill_fastpath_list;
3352 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3353 			if (nce_ptr->nce_fastpath == nce) {
3354 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3355 				break;
3356 			}
3357 			nce_ptr = nce_ptr->nce_fastpath;
3358 		}
3359 	}
3360 
3361 	nce->nce_fastpath = NULL;
3362 done:
3363 	mutex_exit(&ill->ill_lock);
3364 }
3365 
3366 /*
3367  * Update all NCE's that are not in fastpath mode and
3368  * have an nce_fp_mp that matches mp. mp->b_cont contains
3369  * the fastpath header.
3370  *
3371  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3372  */
3373 boolean_t
3374 ndp_fastpath_update(nce_t *nce, void *arg)
3375 {
3376 	mblk_t 	*mp, *fp_mp;
3377 	uchar_t	*mp_rptr, *ud_mp_rptr;
3378 	mblk_t	*ud_mp = nce->nce_res_mp;
3379 	ptrdiff_t	cmplen;
3380 
3381 	if (nce->nce_flags & NCE_F_MAPPING)
3382 		return (B_TRUE);
3383 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3384 		return (B_TRUE);
3385 
3386 	ip2dbg(("ndp_fastpath_update: trying\n"));
3387 	mp = (mblk_t *)arg;
3388 	mp_rptr = mp->b_rptr;
3389 	cmplen = mp->b_wptr - mp_rptr;
3390 	ASSERT(cmplen >= 0);
3391 	ud_mp_rptr = ud_mp->b_rptr;
3392 	/*
3393 	 * The nce is locked here to prevent any other threads
3394 	 * from accessing and changing nce_res_mp when the IPv6 address
3395 	 * becomes resolved to an lla while we're in the middle
3396 	 * of looking at and comparing the hardware address (lla).
3397 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3398 	 * from examining nce_res_mp atthe same time.
3399 	 */
3400 	mutex_enter(&nce->nce_lock);
3401 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3402 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3403 		mutex_exit(&nce->nce_lock);
3404 		/*
3405 		 * Don't take the ire off the fastpath list yet,
3406 		 * since the response may come later.
3407 		 */
3408 		return (B_FALSE);
3409 	}
3410 	/* Matched - install mp as the fastpath mp */
3411 	ip1dbg(("ndp_fastpath_update: match\n"));
3412 	fp_mp = dupb(mp->b_cont);
3413 	if (fp_mp != NULL) {
3414 		nce->nce_fp_mp = fp_mp;
3415 	}
3416 	mutex_exit(&nce->nce_lock);
3417 	return (B_TRUE);
3418 }
3419 
3420 /*
3421  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3422  * driver.  Note that it assumes IP is exclusive...
3423  */
3424 /* ARGSUSED */
3425 void
3426 ndp_fastpath_flush(nce_t *nce, char *arg)
3427 {
3428 	if (nce->nce_flags & NCE_F_MAPPING)
3429 		return;
3430 	/* No fastpath info? */
3431 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3432 		return;
3433 
3434 	/* Just delete the NCE... */
3435 	ndp_delete(nce);
3436 }
3437 
3438 /*
3439  * Return a pointer to a given option in the packet.
3440  * Assumes that option part of the packet have already been validated.
3441  */
3442 nd_opt_hdr_t *
3443 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3444 {
3445 	while (optlen > 0) {
3446 		if (opt->nd_opt_type == opt_type)
3447 			return (opt);
3448 		optlen -= 8 * opt->nd_opt_len;
3449 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3450 	}
3451 	return (NULL);
3452 }
3453 
3454 /*
3455  * Verify all option lengths present are > 0, also check to see
3456  * if the option lengths and packet length are consistent.
3457  */
3458 boolean_t
3459 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3460 {
3461 	ASSERT(opt != NULL);
3462 	while (optlen > 0) {
3463 		if (opt->nd_opt_len == 0)
3464 			return (B_FALSE);
3465 		optlen -= 8 * opt->nd_opt_len;
3466 		if (optlen < 0)
3467 			return (B_FALSE);
3468 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3469 	}
3470 	return (B_TRUE);
3471 }
3472 
3473 /*
3474  * ndp_walk function.
3475  * Free a fraction of the NCE cache entries.
3476  * A fraction of zero means to not free any in that category.
3477  */
3478 void
3479 ndp_cache_reclaim(nce_t *nce, char *arg)
3480 {
3481 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3482 	uint_t	rand;
3483 
3484 	if (nce->nce_flags & NCE_F_PERMANENT)
3485 		return;
3486 
3487 	rand = (uint_t)lbolt +
3488 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3489 	if (ncr->ncr_host != 0 &&
3490 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3491 		ndp_delete(nce);
3492 		return;
3493 	}
3494 }
3495 
3496 /*
3497  * ndp_walk function.
3498  * Count the number of NCEs that can be deleted.
3499  * These would be hosts but not routers.
3500  */
3501 void
3502 ndp_cache_count(nce_t *nce, char *arg)
3503 {
3504 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3505 
3506 	if (nce->nce_flags & NCE_F_PERMANENT)
3507 		return;
3508 
3509 	ncc->ncc_total++;
3510 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3511 		ncc->ncc_host++;
3512 }
3513 
3514 #ifdef NCE_DEBUG
3515 th_trace_t *
3516 th_trace_nce_lookup(nce_t *nce)
3517 {
3518 	int bucket_id;
3519 	th_trace_t *th_trace;
3520 
3521 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3522 
3523 	bucket_id = IP_TR_HASH(curthread);
3524 	ASSERT(bucket_id < IP_TR_HASH_MAX);
3525 
3526 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
3527 	    th_trace = th_trace->th_next) {
3528 		if (th_trace->th_id == curthread)
3529 			return (th_trace);
3530 	}
3531 	return (NULL);
3532 }
3533 
3534 void
3535 nce_trace_ref(nce_t *nce)
3536 {
3537 	int bucket_id;
3538 	th_trace_t *th_trace;
3539 
3540 	/*
3541 	 * Attempt to locate the trace buffer for the curthread.
3542 	 * If it does not exist, then allocate a new trace buffer
3543 	 * and link it in list of trace bufs for this ipif, at the head
3544 	 */
3545 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3546 
3547 	if (nce->nce_trace_disable == B_TRUE)
3548 		return;
3549 
3550 	th_trace = th_trace_nce_lookup(nce);
3551 	if (th_trace == NULL) {
3552 		bucket_id = IP_TR_HASH(curthread);
3553 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
3554 		    KM_NOSLEEP);
3555 		if (th_trace == NULL) {
3556 			nce->nce_trace_disable = B_TRUE;
3557 			nce_trace_inactive(nce);
3558 			return;
3559 		}
3560 		th_trace->th_id = curthread;
3561 		th_trace->th_next = nce->nce_trace[bucket_id];
3562 		th_trace->th_prev = &nce->nce_trace[bucket_id];
3563 		if (th_trace->th_next != NULL)
3564 			th_trace->th_next->th_prev = &th_trace->th_next;
3565 		nce->nce_trace[bucket_id] = th_trace;
3566 	}
3567 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
3568 	th_trace->th_refcnt++;
3569 	th_trace_rrecord(th_trace);
3570 }
3571 
3572 void
3573 nce_untrace_ref(nce_t *nce)
3574 {
3575 	th_trace_t *th_trace;
3576 
3577 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3578 
3579 	if (nce->nce_trace_disable == B_TRUE)
3580 		return;
3581 
3582 	th_trace = th_trace_nce_lookup(nce);
3583 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
3584 
3585 	th_trace_rrecord(th_trace);
3586 	th_trace->th_refcnt--;
3587 }
3588 
3589 void
3590 nce_trace_inactive(nce_t *nce)
3591 {
3592 	th_trace_t *th_trace;
3593 	int i;
3594 
3595 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3596 
3597 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
3598 		while (nce->nce_trace[i] != NULL) {
3599 			th_trace = nce->nce_trace[i];
3600 
3601 			/* unlink th_trace and free it */
3602 			nce->nce_trace[i] = th_trace->th_next;
3603 			if (th_trace->th_next != NULL)
3604 				th_trace->th_next->th_prev =
3605 				    &nce->nce_trace[i];
3606 
3607 			th_trace->th_next = NULL;
3608 			th_trace->th_prev = NULL;
3609 			kmem_free(th_trace, sizeof (th_trace_t));
3610 		}
3611 	}
3612 
3613 }
3614 
3615 /* ARGSUSED */
3616 int
3617 nce_thread_exit(nce_t *nce, caddr_t arg)
3618 {
3619 	th_trace_t	*th_trace;
3620 
3621 	mutex_enter(&nce->nce_lock);
3622 	th_trace = th_trace_nce_lookup(nce);
3623 
3624 	if (th_trace == NULL) {
3625 		mutex_exit(&nce->nce_lock);
3626 		return (0);
3627 	}
3628 
3629 	ASSERT(th_trace->th_refcnt == 0);
3630 
3631 	/* unlink th_trace and free it */
3632 	*th_trace->th_prev = th_trace->th_next;
3633 	if (th_trace->th_next != NULL)
3634 		th_trace->th_next->th_prev = th_trace->th_prev;
3635 	th_trace->th_next = NULL;
3636 	th_trace->th_prev = NULL;
3637 	kmem_free(th_trace, sizeof (th_trace_t));
3638 	mutex_exit(&nce->nce_lock);
3639 	return (0);
3640 }
3641 #endif
3642 
3643 /*
3644  * Called when address resolution fails due to a timeout.
3645  * Send an ICMP unreachable in response to all queued packets.
3646  */
3647 void
3648 arp_resolv_failed(nce_t *nce)
3649 {
3650 	mblk_t	*mp, *nxt_mp, *first_mp;
3651 	char	buf[INET6_ADDRSTRLEN];
3652 	zoneid_t zoneid = GLOBAL_ZONEID;
3653 	struct in_addr ipv4addr;
3654 
3655 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3656 	ip3dbg(("arp_resolv_failed: dst %s\n",
3657 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3658 	mutex_enter(&nce->nce_lock);
3659 	mp = nce->nce_qd_mp;
3660 	nce->nce_qd_mp = NULL;
3661 	mutex_exit(&nce->nce_lock);
3662 
3663 	while (mp != NULL) {
3664 		nxt_mp = mp->b_next;
3665 		mp->b_next = NULL;
3666 		mp->b_prev = NULL;
3667 
3668 		first_mp = mp;
3669 		/*
3670 		 * Send icmp unreachable messages
3671 		 * to the hosts.
3672 		 */
3673 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid);
3674 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3675 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3676 		    ICMP_HOST_UNREACHABLE, zoneid);
3677 		mp = nxt_mp;
3678 	}
3679 }
3680 
3681 static int
3682 ndp_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3683     const in_addr_t *mask, const in_addr_t *extract_mask,
3684     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3685     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3686 {
3687 	int	err = 0;
3688 	nce_t	*nce;
3689 	in6_addr_t addr6;
3690 
3691 	mutex_enter(&ndp4.ndp_g_lock);
3692 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr));
3693 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3694 	nce = nce_lookup_addr(ill, &addr6, nce);
3695 	if (nce == NULL) {
3696 		err = ndp_add_v4(ill,
3697 		    hw_addr,
3698 		    addr,
3699 		    mask,
3700 		    extract_mask,
3701 		    hw_extract_start,
3702 		    flags,
3703 		    state,
3704 		    newnce,
3705 		    fp_mp,
3706 		    res_mp);
3707 	} else {
3708 		*newnce = nce;
3709 		err = EEXIST;
3710 	}
3711 	mutex_exit(&ndp4.ndp_g_lock);
3712 	return (err);
3713 }
3714 
3715 /*
3716  * NDP Cache Entry creation routine for IPv4.
3717  * Mapped entries are handled in arp.
3718  * This routine must always be called with ndp4.ndp_g_lock held.
3719  * Prior to return, nce_refcnt is incremented.
3720  */
3721 static int
3722 ndp_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3723     const in_addr_t *mask, const in_addr_t *extract_mask,
3724     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3725     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3726 {
3727 	static	nce_t		nce_nil;
3728 	nce_t		*nce;
3729 	mblk_t		*mp;
3730 	mblk_t		*template;
3731 	nce_t		**ncep;
3732 
3733 	ASSERT(MUTEX_HELD(&ndp4.ndp_g_lock));
3734 	ASSERT(ill != NULL);
3735 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
3736 		return (EINVAL);
3737 	}
3738 	ASSERT((flags & NCE_F_MAPPING) == 0);
3739 	ASSERT(extract_mask == NULL);
3740 	/*
3741 	 * Allocate the mblk to hold the nce.
3742 	 */
3743 	mp = allocb(sizeof (nce_t), BPRI_MED);
3744 	if (mp == NULL)
3745 		return (ENOMEM);
3746 
3747 	nce = (nce_t *)mp->b_rptr;
3748 	mp->b_wptr = (uchar_t *)&nce[1];
3749 	*nce = nce_nil;
3750 
3751 	/*
3752 	 * This one holds link layer address; if res_mp has been provided
3753 	 * by the caller, accept it without any further checks. Otherwise,
3754 	 * for V4, we fill it up with ill_resolver_mp here, then in
3755 	 * in ire_arpresolve(), we fill it up with the ARP query
3756 	 * once its formulated.
3757 	 */
3758 	if (res_mp != NULL) {
3759 		template = res_mp;
3760 	} else  {
3761 		if (ill->ill_resolver_mp == NULL) {
3762 			freeb(mp);
3763 			return (EINVAL);
3764 		}
3765 		template = copyb(ill->ill_resolver_mp);
3766 	}
3767 	if (template == NULL) {
3768 		freeb(mp);
3769 		return (ENOMEM);
3770 	}
3771 	nce->nce_ill = ill;
3772 	nce->nce_ipversion = IPV4_VERSION;
3773 	nce->nce_flags = flags;
3774 	nce->nce_state = state;
3775 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3776 	nce->nce_rcnt = ill->ill_xmit_count;
3777 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3778 	if (*mask == IP_HOST_MASK) {
3779 		nce->nce_mask = ipv6_all_ones;
3780 	} else  {
3781 		IN6_IPADDR_TO_V4MAPPED(*mask, &nce->nce_mask);
3782 	}
3783 	nce->nce_extract_mask = ipv6_all_zeros;
3784 	nce->nce_ll_extract_start = hw_extract_start;
3785 	nce->nce_fp_mp = (fp_mp? fp_mp : NULL);
3786 	nce->nce_res_mp = template;
3787 	if (state == ND_REACHABLE)
3788 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3789 	else
3790 		nce->nce_last = 0;
3791 	nce->nce_qd_mp = NULL;
3792 	nce->nce_mp = mp;
3793 	if (hw_addr != NULL)
3794 		nce_set_ll(nce, hw_addr);
3795 	/* This one is for nce getting created */
3796 	nce->nce_refcnt = 1;
3797 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3798 	ncep = ((nce_t **)NCE_HASH_PTR_V4(*addr));
3799 
3800 #ifdef NCE_DEBUG
3801 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
3802 #endif
3803 	/*
3804 	 * Atomically ensure that the ill is not CONDEMNED, before
3805 	 * adding the NCE.
3806 	 */
3807 	mutex_enter(&ill->ill_lock);
3808 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3809 		mutex_exit(&ill->ill_lock);
3810 		freeb(mp);
3811 		if (res_mp == NULL) {
3812 			/*
3813 			 * template was locally allocated. need to free it.
3814 			 */
3815 			freeb(template);
3816 		}
3817 		return (EINVAL);
3818 	}
3819 	if ((nce->nce_next = *ncep) != NULL)
3820 		nce->nce_next->nce_ptpn = &nce->nce_next;
3821 	*ncep = nce;
3822 	nce->nce_ptpn = ncep;
3823 	*newnce = nce;
3824 	/* This one is for nce being used by an active thread */
3825 	NCE_REFHOLD(*newnce);
3826 
3827 	/* Bump up the number of nce's referencing this ill */
3828 	ill->ill_nce_cnt++;
3829 	mutex_exit(&ill->ill_lock);
3830 	return (0);
3831 }
3832 
3833 void
3834 ndp_flush_qd_mp(nce_t *nce)
3835 {
3836 	mblk_t *qd_mp, *qd_next;
3837 
3838 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3839 	qd_mp = nce->nce_qd_mp;
3840 	nce->nce_qd_mp = NULL;
3841 	while (qd_mp != NULL) {
3842 		qd_next = qd_mp->b_next;
3843 		qd_mp->b_next = NULL;
3844 		qd_mp->b_prev = NULL;
3845 		freemsg(qd_mp);
3846 		qd_mp = qd_next;
3847 	}
3848 }
3849 
3850 nce_t *
3851 nce_reinit(nce_t *nce)
3852 {
3853 	nce_t *newnce = NULL;
3854 	in_addr_t nce_addr, nce_mask;
3855 
3856 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3857 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_mask, nce_mask);
3858 	/*
3859 	 * delete the old one. this will get rid of any ire's pointing
3860 	 * at this nce.
3861 	 */
3862 	ndp_delete(nce);
3863 	/*
3864 	 * create a new nce with the same addr and mask.
3865 	 */
3866 	mutex_enter(&ndp4.ndp_g_lock);
3867 	(void) ndp_add_v4(nce->nce_ill, NULL, &nce_addr, &nce_mask, NULL, 0, 0,
3868 	    ND_INITIAL, &newnce, NULL, NULL);
3869 	mutex_exit(&ndp4.ndp_g_lock);
3870 	/*
3871 	 * refrele the old nce.
3872 	 */
3873 	NCE_REFRELE(nce);
3874 	return (newnce);
3875 }
3876 
3877 /*
3878  * ndp_walk routine to delete all entries that have a given destination or
3879  * gateway address and cached link layer (MAC) address.  This is used when ARP
3880  * informs us that a network-to-link-layer mapping may have changed.
3881  */
3882 void
3883 nce_delete_hw_changed(nce_t *nce, void *arg)
3884 {
3885 	nce_hw_map_t *hwm = arg;
3886 	mblk_t *mp;
3887 	dl_unitdata_req_t *dlu;
3888 	uchar_t *macaddr;
3889 	ill_t *ill;
3890 	int saplen;
3891 	ipaddr_t nce_addr;
3892 
3893 	if (nce->nce_state != ND_REACHABLE)
3894 		return;
3895 
3896 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3897 	if (nce_addr != hwm->hwm_addr)
3898 		return;
3899 
3900 	mutex_enter(&nce->nce_lock);
3901 	if ((mp = nce->nce_res_mp) == NULL) {
3902 		mutex_exit(&nce->nce_lock);
3903 		return;
3904 	}
3905 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3906 	macaddr = (uchar_t *)(dlu + 1);
3907 	ill = nce->nce_ill;
3908 	if ((saplen = ill->ill_sap_length) > 0)
3909 		macaddr += saplen;
3910 	else
3911 		saplen = -saplen;
3912 
3913 	/*
3914 	 * If the hardware address is unchanged, then leave this one alone.
3915 	 * Note that saplen == abs(saplen) now.
3916 	 */
3917 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3918 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3919 		mutex_exit(&nce->nce_lock);
3920 		return;
3921 	}
3922 	mutex_exit(&nce->nce_lock);
3923 
3924 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3925 	ndp_delete(nce);
3926 }
3927 
3928 /*
3929  * This function verifies whether a given IPv4 address is potentially known to
3930  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3931  * so that it can continue to look for hardware changes on that address.
3932  */
3933 boolean_t
3934 ndp_lookup_ipaddr(in_addr_t addr)
3935 {
3936 	nce_t		*nce;
3937 	struct in_addr	nceaddr;
3938 
3939 	if (addr == INADDR_ANY)
3940 		return (B_FALSE);
3941 
3942 	mutex_enter(&ndp4.ndp_g_lock);
3943 	nce = *(nce_t **)NCE_HASH_PTR_V4(addr);
3944 	for (; nce != NULL; nce = nce->nce_next) {
3945 		/* Note that only v4 mapped entries are in the table. */
3946 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3947 		if (addr == nceaddr.s_addr &&
3948 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3949 			/* Single flag check; no lock needed */
3950 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3951 				break;
3952 		}
3953 	}
3954 	mutex_exit(&ndp4.ndp_g_lock);
3955 	return (nce != NULL);
3956 }
3957