xref: /titanic_52/usr/src/uts/common/inet/ip/ip_ndp.c (revision ee5416c9d7e449233197d5d20bc6b81e4ff091b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ipclassifier.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_rts.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_ndp.h>
66 #include <inet/ipsec_impl.h>
67 #include <inet/ipsec_info.h>
68 #include <inet/sctp_ip.h>
69 
70 /*
71  * Function names with nce_ prefix are static while function
72  * names with ndp_ prefix are used by rest of the IP.
73  *
74  * Lock ordering:
75  *
76  *	ndp_g_lock -> ill_lock -> nce_lock
77  *
78  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
79  * nce_next.  Nce_lock protects the contents of the NCE (particularly
80  * nce_refcnt).
81  */
82 
/*
 * Forward declarations of file-local (static) helpers, so that they can be
 * referenced ahead of their definitions.
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
    ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
    nce_t **, nce_t *);
104 
/*
 * We track the time of creation of the nce in the nce_init_time field
 * of IPv4 nce_t entries. If an nce is stuck in the ND_INITIAL state for
 * more than NCE_STUCK_TIMEOUT milliseconds, trigger the nce-stuck dtrace
 * probe to assist in debugging. This probe is fired from nce_report1()
 * when 'ndd -get /dev/ip ip_ndp_cache_report' is invoked.
 */
#define	NCE_STUCK_TIMEOUT	120000

#ifdef DEBUG
/* Declared (and called) only when DEBUG is defined. */
static void	nce_trace_cleanup(const nce_t *);
#endif

/*
 * Address of the bucket head in the IPv4 NCE hash table of stack `ipst'
 * that the IPv4 address `addr' hashes to.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Address of the bucket head in the IPv6 NCE hash table of stack `ipst'
 * that the IPv6 address `addr' hashes to.
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))
124 
125 /*
126  * Compute default flags to use for an advertisement of this nce's address.
127  */
128 static int
129 nce_advert_flags(const nce_t *nce)
130 {
131 	int flag = 0;
132 
133 	if (nce->nce_flags & NCE_F_ISROUTER)
134 		flag |= NDP_ISROUTER;
135 	return (flag);
136 }
137 
/*
 * Non-tunable probe interval, based on link capabilities: 150 when the ill
 * has ill_note_link set, 1500 otherwise.  The result is handed to
 * NDP_RESTART_TIMER() by the DAD probe paths in this file (presumably in
 * milliseconds -- confirm against NDP_RESTART_TIMER).
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
140 
141 /*
142  * NDP Cache Entry creation routine.
143  * Mapped entries will never do NUD .
144  * This routine must always be called with ndp6->ndp_g_lock held.
145  * Prior to return, nce_refcnt is incremented.
146  */
147 int
148 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
149     const in6_addr_t *mask, const in6_addr_t *extract_mask,
150     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
151     nce_t **newnce)
152 {
153 	static	nce_t		nce_nil;
154 	nce_t		*nce;
155 	mblk_t		*mp;
156 	mblk_t		*template;
157 	nce_t		**ncep;
158 	int		err;
159 	boolean_t	dropped = B_FALSE;
160 	ip_stack_t	*ipst = ill->ill_ipst;
161 
162 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
163 	ASSERT(ill != NULL && ill->ill_isv6);
164 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
165 		ip0dbg(("ndp_add_v6: no addr\n"));
166 		return (EINVAL);
167 	}
168 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
169 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
170 		return (EINVAL);
171 	}
172 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
173 	    (flags & NCE_F_MAPPING)) {
174 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
175 		return (EINVAL);
176 	}
177 	/*
178 	 * Allocate the mblk to hold the nce.
179 	 *
180 	 * XXX This can come out of a separate cache - nce_cache.
181 	 * We don't need the mp anymore as there are no more
182 	 * "qwriter"s
183 	 */
184 	mp = allocb(sizeof (nce_t), BPRI_MED);
185 	if (mp == NULL)
186 		return (ENOMEM);
187 
188 	nce = (nce_t *)mp->b_rptr;
189 	mp->b_wptr = (uchar_t *)&nce[1];
190 	*nce = nce_nil;
191 
192 	/*
193 	 * This one holds link layer address
194 	 */
195 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
196 		template = nce_udreq_alloc(ill);
197 	} else {
198 		if (ill->ill_resolver_mp == NULL) {
199 			freeb(mp);
200 			return (EINVAL);
201 		}
202 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
203 		template = copyb(ill->ill_resolver_mp);
204 	}
205 	if (template == NULL) {
206 		freeb(mp);
207 		return (ENOMEM);
208 	}
209 	nce->nce_ill = ill;
210 	nce->nce_ipversion = IPV6_VERSION;
211 	nce->nce_flags = flags;
212 	nce->nce_state = state;
213 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
214 	nce->nce_rcnt = ill->ill_xmit_count;
215 	nce->nce_addr = *addr;
216 	nce->nce_mask = *mask;
217 	nce->nce_extract_mask = *extract_mask;
218 	nce->nce_ll_extract_start = hw_extract_start;
219 	nce->nce_fp_mp = NULL;
220 	nce->nce_res_mp = template;
221 	if (state == ND_REACHABLE)
222 		nce->nce_last = TICK_TO_MSEC(lbolt64);
223 	else
224 		nce->nce_last = 0;
225 	nce->nce_qd_mp = NULL;
226 	nce->nce_mp = mp;
227 	if (hw_addr != NULL)
228 		nce_set_ll(nce, hw_addr);
229 	/* This one is for nce getting created */
230 	nce->nce_refcnt = 1;
231 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
232 	if (nce->nce_flags & NCE_F_MAPPING) {
233 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
234 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
235 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
236 		ncep = &ipst->ips_ndp6->nce_mask_entries;
237 	} else {
238 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
239 	}
240 
241 	nce->nce_trace_disable = B_FALSE;
242 
243 	/*
244 	 * Atomically ensure that the ill is not CONDEMNED, before
245 	 * adding the NCE.
246 	 */
247 	mutex_enter(&ill->ill_lock);
248 	if (ill->ill_state_flags & ILL_CONDEMNED) {
249 		mutex_exit(&ill->ill_lock);
250 		freeb(mp);
251 		freeb(template);
252 		return (EINVAL);
253 	}
254 	if ((nce->nce_next = *ncep) != NULL)
255 		nce->nce_next->nce_ptpn = &nce->nce_next;
256 	*ncep = nce;
257 	nce->nce_ptpn = ncep;
258 	*newnce = nce;
259 	/* This one is for nce being used by an active thread */
260 	NCE_REFHOLD(*newnce);
261 
262 	/* Bump up the number of nce's referencing this ill */
263 	ill->ill_nce_cnt++;
264 	mutex_exit(&ill->ill_lock);
265 
266 	err = 0;
267 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
268 		mutex_enter(&nce->nce_lock);
269 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
270 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
271 		mutex_exit(&nce->nce_lock);
272 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
273 		    &ipv6_all_zeros, addr, NDP_PROBE);
274 		if (dropped) {
275 			mutex_enter(&nce->nce_lock);
276 			nce->nce_pcnt++;
277 			mutex_exit(&nce->nce_lock);
278 		}
279 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
280 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
281 		err = EINPROGRESS;
282 	} else if (flags & NCE_F_UNSOL_ADV) {
283 		/*
284 		 * We account for the transmit below by assigning one
285 		 * less than the ndd variable. Subsequent decrements
286 		 * are done in ndp_timer.
287 		 */
288 		mutex_enter(&nce->nce_lock);
289 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
290 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
291 		mutex_exit(&nce->nce_lock);
292 		dropped = nce_xmit(ill,
293 		    ND_NEIGHBOR_ADVERT,
294 		    ill,	/* ill to be used for extracting ill_nd_lla */
295 		    B_TRUE,	/* use ill_nd_lla */
296 		    addr,	/* Source and target of the advertisement pkt */
297 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
298 		    nce_advert_flags(nce));
299 		mutex_enter(&nce->nce_lock);
300 		if (dropped)
301 			nce->nce_unsolicit_count++;
302 		if (nce->nce_unsolicit_count != 0) {
303 			nce->nce_timeout_id = timeout(ndp_timer, nce,
304 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
305 		}
306 		mutex_exit(&nce->nce_lock);
307 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
308 	}
309 	/*
310 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
311 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
312 	 * We call nce_fastpath from nce_update if the link layer address of
313 	 * the peer changes from nce_update
314 	 */
315 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
316 		nce_fastpath(nce);
317 	return (err);
318 }
319 
320 int
321 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
322     const in6_addr_t *mask, const in6_addr_t *extract_mask,
323     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
324     nce_t **newnce)
325 {
326 	int	err = 0;
327 	nce_t	*nce;
328 	ip_stack_t	*ipst = ill->ill_ipst;
329 
330 	ASSERT(ill->ill_isv6);
331 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
332 
333 	/* Get head of v6 hash table */
334 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
335 	nce = nce_lookup_addr(ill, addr, nce);
336 	if (nce == NULL) {
337 		err = ndp_add_v6(ill,
338 		    hw_addr,
339 		    addr,
340 		    mask,
341 		    extract_mask,
342 		    hw_extract_start,
343 		    flags,
344 		    state,
345 		    newnce);
346 	} else {
347 		*newnce = nce;
348 		err = EEXIST;
349 	}
350 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
351 	return (err);
352 }
353 
354 /*
355  * Remove all the CONDEMNED nces from the appropriate hash table.
356  * We create a private list of NCEs, these may have ires pointing
357  * to them, so the list will be passed through to clean up dependent
358  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
359  */
360 static void
361 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
362 {
363 	nce_t *nce1;
364 	nce_t **ptpn;
365 
366 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
367 	ASSERT(ndp->ndp_g_walker == 0);
368 	for (; nce; nce = nce1) {
369 		nce1 = nce->nce_next;
370 		mutex_enter(&nce->nce_lock);
371 		if (nce->nce_flags & NCE_F_CONDEMNED) {
372 			ptpn = nce->nce_ptpn;
373 			nce1 = nce->nce_next;
374 			if (nce1 != NULL)
375 				nce1->nce_ptpn = ptpn;
376 			*ptpn = nce1;
377 			nce->nce_ptpn = NULL;
378 			nce->nce_next = NULL;
379 			nce->nce_next = *free_nce_list;
380 			*free_nce_list = nce;
381 		}
382 		mutex_exit(&nce->nce_lock);
383 	}
384 }
385 
386 /*
387  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
388  *    will return this NCE. Also no new IREs will be created that
389  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
390  *    be started (See NDP_RESTART_TIMER).
391  * 2. Cancel any currently running timeouts.
392  * 3. If there is an ndp walker, return. The walker will do the cleanup.
393  *    This ensures that walkers see a consistent list of NCEs while walking.
394  * 4. Otherwise remove the NCE from the list of NCEs
395  * 5. Delete all IREs pointing to this NCE.
396  */
397 void
398 ndp_delete(nce_t *nce)
399 {
400 	nce_t	**ptpn;
401 	nce_t	*nce1;
402 	int	ipversion = nce->nce_ipversion;
403 	ndp_g_t *ndp;
404 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
405 
406 	if (ipversion == IPV4_VERSION)
407 		ndp = ipst->ips_ndp4;
408 	else
409 		ndp = ipst->ips_ndp6;
410 
411 	/* Serialize deletes */
412 	mutex_enter(&nce->nce_lock);
413 	if (nce->nce_flags & NCE_F_CONDEMNED) {
414 		/* Some other thread is doing the delete */
415 		mutex_exit(&nce->nce_lock);
416 		return;
417 	}
418 	/*
419 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
420 	 * refcnt has to be >= 2
421 	 */
422 	ASSERT(nce->nce_refcnt >= 2);
423 	nce->nce_flags |= NCE_F_CONDEMNED;
424 	mutex_exit(&nce->nce_lock);
425 
426 	nce_fastpath_list_delete(nce);
427 
428 	/*
429 	 * Cancel any running timer. Timeout can't be restarted
430 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
431 	 * Passing invalid timeout id is fine.
432 	 */
433 	if (nce->nce_timeout_id != 0) {
434 		(void) untimeout(nce->nce_timeout_id);
435 		nce->nce_timeout_id = 0;
436 	}
437 
438 	mutex_enter(&ndp->ndp_g_lock);
439 	if (nce->nce_ptpn == NULL) {
440 		/*
441 		 * The last ndp walker has already removed this nce from
442 		 * the list after we marked the nce CONDEMNED and before
443 		 * we grabbed the global lock.
444 		 */
445 		mutex_exit(&ndp->ndp_g_lock);
446 		return;
447 	}
448 	if (ndp->ndp_g_walker > 0) {
449 		/*
450 		 * Can't unlink. The walker will clean up
451 		 */
452 		ndp->ndp_g_walker_cleanup = B_TRUE;
453 		mutex_exit(&ndp->ndp_g_lock);
454 		return;
455 	}
456 
457 	/*
458 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
459 	 * the timer since it is marked CONDEMNED.
460 	 */
461 	ptpn = nce->nce_ptpn;
462 	nce1 = nce->nce_next;
463 	if (nce1 != NULL)
464 		nce1->nce_ptpn = ptpn;
465 	*ptpn = nce1;
466 	nce->nce_ptpn = NULL;
467 	nce->nce_next = NULL;
468 	mutex_exit(&ndp->ndp_g_lock);
469 
470 	nce_ire_delete(nce);
471 }
472 
473 void
474 ndp_inactive(nce_t *nce)
475 {
476 	mblk_t		**mpp;
477 	ill_t		*ill;
478 
479 	ASSERT(nce->nce_refcnt == 0);
480 	ASSERT(MUTEX_HELD(&nce->nce_lock));
481 	ASSERT(nce->nce_fastpath == NULL);
482 
483 	/* Free all nce allocated messages */
484 	mpp = &nce->nce_first_mp_to_free;
485 	do {
486 		while (*mpp != NULL) {
487 			mblk_t  *mp;
488 
489 			mp = *mpp;
490 			*mpp = mp->b_next;
491 
492 			inet_freemsg(mp);
493 		}
494 	} while (mpp++ != &nce->nce_last_mp_to_free);
495 
496 #ifdef DEBUG
497 	nce_trace_cleanup(nce);
498 #endif
499 
500 	ill = nce->nce_ill;
501 	mutex_enter(&ill->ill_lock);
502 	ill->ill_nce_cnt--;
503 	/*
504 	 * If the number of nce's associated with this ill have dropped
505 	 * to zero, check whether we need to restart any operation that
506 	 * is waiting for this to happen.
507 	 */
508 	if (ill->ill_nce_cnt == 0) {
509 		/* ipif_ill_refrele_tail drops the ill_lock */
510 		ipif_ill_refrele_tail(ill);
511 	} else {
512 		mutex_exit(&ill->ill_lock);
513 	}
514 	mutex_destroy(&nce->nce_lock);
515 	if (nce->nce_mp != NULL)
516 		inet_freemsg(nce->nce_mp);
517 }
518 
519 /*
520  * ndp_walk routine.  Delete the nce if it is associated with the ill
521  * that is going away.  Always called as a writer.
522  */
523 void
524 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
525 {
526 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
527 		ndp_delete(nce);
528 	}
529 }
530 
531 /*
532  * Walk a list of to be inactive NCEs and blow away all the ires.
533  */
534 static void
535 nce_ire_delete_list(nce_t *nce)
536 {
537 	nce_t *nce_next;
538 
539 	ASSERT(nce != NULL);
540 	while (nce != NULL) {
541 		nce_next = nce->nce_next;
542 		nce->nce_next = NULL;
543 
544 		/*
545 		 * It is possible for the last ndp walker (this thread)
546 		 * to come here after ndp_delete has marked the nce CONDEMNED
547 		 * and before it has removed the nce from the fastpath list
548 		 * or called untimeout. So we need to do it here. It is safe
549 		 * for both ndp_delete and this thread to do it twice or
550 		 * even simultaneously since each of the threads has a
551 		 * reference on the nce.
552 		 */
553 		nce_fastpath_list_delete(nce);
554 		/*
555 		 * Cancel any running timer. Timeout can't be restarted
556 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
557 		 * Passing invalid timeout id is fine.
558 		 */
559 		if (nce->nce_timeout_id != 0) {
560 			(void) untimeout(nce->nce_timeout_id);
561 			nce->nce_timeout_id = 0;
562 		}
563 		/*
564 		 * We might hit this func thus in the v4 case:
565 		 * ipif_down->ipif_ndp_down->ndp_walk
566 		 */
567 
568 		if (nce->nce_ipversion == IPV4_VERSION) {
569 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
570 			    IRE_CACHE, nce_ire_delete1,
571 			    (char *)nce, nce->nce_ill);
572 		} else {
573 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
574 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
575 			    IRE_CACHE, nce_ire_delete1,
576 			    (char *)nce, nce->nce_ill);
577 		}
578 		NCE_REFRELE_NOTR(nce);
579 		nce = nce_next;
580 	}
581 }
582 
583 /*
584  * Delete an ire when the nce goes away.
585  */
586 /* ARGSUSED */
587 static void
588 nce_ire_delete(nce_t *nce)
589 {
590 	if (nce->nce_ipversion == IPV6_VERSION) {
591 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
592 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
593 		NCE_REFRELE_NOTR(nce);
594 	} else {
595 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
596 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
597 		NCE_REFRELE_NOTR(nce);
598 	}
599 }
600 
601 /*
602  * ire_walk routine used to delete every IRE that shares this nce
603  */
604 static void
605 nce_ire_delete1(ire_t *ire, char *nce_arg)
606 {
607 	nce_t	*nce = (nce_t *)nce_arg;
608 
609 	ASSERT(ire->ire_type == IRE_CACHE);
610 
611 	if (ire->ire_nce == nce) {
612 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
613 		ire_delete(ire);
614 	}
615 }
616 
617 /*
618  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
619  */
620 boolean_t
621 ndp_restart_dad(nce_t *nce)
622 {
623 	boolean_t started;
624 	boolean_t dropped;
625 
626 	if (nce == NULL)
627 		return (B_FALSE);
628 	mutex_enter(&nce->nce_lock);
629 	if (nce->nce_state == ND_PROBE) {
630 		mutex_exit(&nce->nce_lock);
631 		started = B_TRUE;
632 	} else if (nce->nce_state == ND_REACHABLE) {
633 		nce->nce_state = ND_PROBE;
634 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
635 		mutex_exit(&nce->nce_lock);
636 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
637 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
638 		if (dropped) {
639 			mutex_enter(&nce->nce_lock);
640 			nce->nce_pcnt++;
641 			mutex_exit(&nce->nce_lock);
642 		}
643 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
644 		started = B_TRUE;
645 	} else {
646 		mutex_exit(&nce->nce_lock);
647 		started = B_FALSE;
648 	}
649 	return (started);
650 }
651 
652 /*
653  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
654  * If one is found, the refcnt on the nce will be incremented.
655  */
656 nce_t *
657 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
658 {
659 	nce_t	*nce;
660 	ip_stack_t	*ipst;
661 
662 	ASSERT(ill != NULL);
663 	ipst = ill->ill_ipst;
664 
665 	ASSERT(ill != NULL && ill->ill_isv6);
666 	if (!caller_holds_lock) {
667 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
668 	}
669 
670 	/* Get head of v6 hash table */
671 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
672 	nce = nce_lookup_addr(ill, addr, nce);
673 	if (nce == NULL)
674 		nce = nce_lookup_mapping(ill, addr);
675 	if (!caller_holds_lock)
676 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
677 	return (nce);
678 }
679 /*
680  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
681  * If one is found, the refcnt on the nce will be incremented.
682  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
683  * so we skip the nce_lookup_mapping call.
684  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
685  */
686 nce_t *
687 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
688 {
689 	nce_t	*nce;
690 	in6_addr_t addr6;
691 	ip_stack_t *ipst = ill->ill_ipst;
692 
693 	if (!caller_holds_lock) {
694 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
695 	}
696 
697 	/* Get head of v4 hash table */
698 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
699 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
700 	nce = nce_lookup_addr(ill, &addr6, nce);
701 	if (!caller_holds_lock)
702 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
703 	return (nce);
704 }
705 
706 /*
707  * Cache entry lookup.  Try to find an nce matching the parameters passed.
708  * Look only for exact entries (no mappings).  If an nce is found, increment
709  * the hold count on that nce. The caller passes in the start of the
710  * appropriate hash table, and must be holding the appropriate global
711  * lock (ndp_g_lock).
712  */
713 static nce_t *
714 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
715 {
716 	ndp_g_t		*ndp;
717 	ip_stack_t	*ipst = ill->ill_ipst;
718 
719 	if (ill->ill_isv6)
720 		ndp = ipst->ips_ndp6;
721 	else
722 		ndp = ipst->ips_ndp4;
723 
724 	ASSERT(ill != NULL);
725 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
726 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
727 		return (NULL);
728 	for (; nce != NULL; nce = nce->nce_next) {
729 		if (nce->nce_ill == ill) {
730 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
731 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
732 			    &ipv6_all_ones)) {
733 				mutex_enter(&nce->nce_lock);
734 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
735 					NCE_REFHOLD_LOCKED(nce);
736 					mutex_exit(&nce->nce_lock);
737 					break;
738 				}
739 				mutex_exit(&nce->nce_lock);
740 			}
741 		}
742 	}
743 	return (nce);
744 }
745 
746 /*
747  * Cache entry lookup.  Try to find an nce matching the parameters passed.
748  * Look only for mappings.
749  */
750 static nce_t *
751 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
752 {
753 	nce_t	*nce;
754 	ip_stack_t	*ipst = ill->ill_ipst;
755 
756 	ASSERT(ill != NULL && ill->ill_isv6);
757 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
758 	if (!IN6_IS_ADDR_MULTICAST(addr))
759 		return (NULL);
760 	nce = ipst->ips_ndp6->nce_mask_entries;
761 	for (; nce != NULL; nce = nce->nce_next)
762 		if (nce->nce_ill == ill &&
763 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
764 			mutex_enter(&nce->nce_lock);
765 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
766 				NCE_REFHOLD_LOCKED(nce);
767 				mutex_exit(&nce->nce_lock);
768 				break;
769 			}
770 			mutex_exit(&nce->nce_lock);
771 		}
772 	return (nce);
773 }
774 
775 /*
776  * Process passed in parameters either from an incoming packet or via
777  * user ioctl.
778  */
779 void
780 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
781 {
782 	ill_t	*ill = nce->nce_ill;
783 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
784 	mblk_t	*mp;
785 	boolean_t ll_updated = B_FALSE;
786 	boolean_t ll_changed;
787 	ip_stack_t	*ipst = ill->ill_ipst;
788 
789 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
790 	/*
791 	 * No updates of link layer address or the neighbor state is
792 	 * allowed, when the cache is in NONUD state.  This still
793 	 * allows for responding to reachability solicitation.
794 	 */
795 	mutex_enter(&nce->nce_lock);
796 	if (nce->nce_state == ND_INCOMPLETE) {
797 		if (hw_addr == NULL) {
798 			mutex_exit(&nce->nce_lock);
799 			return;
800 		}
801 		nce_set_ll(nce, hw_addr);
802 		/*
803 		 * Update nce state and send the queued packets
804 		 * back to ip this time ire will be added.
805 		 */
806 		if (flag & ND_NA_FLAG_SOLICITED) {
807 			nce_update(nce, ND_REACHABLE, NULL);
808 		} else {
809 			nce_update(nce, ND_STALE, NULL);
810 		}
811 		mutex_exit(&nce->nce_lock);
812 		nce_fastpath(nce);
813 		mutex_enter(&nce->nce_lock);
814 		mp = nce->nce_qd_mp;
815 		nce->nce_qd_mp = NULL;
816 		mutex_exit(&nce->nce_lock);
817 		while (mp != NULL) {
818 			mblk_t *nxt_mp, *data_mp;
819 
820 			nxt_mp = mp->b_next;
821 			mp->b_next = NULL;
822 
823 			if (mp->b_datap->db_type == M_CTL)
824 				data_mp = mp->b_cont;
825 			else
826 				data_mp = mp;
827 			if (data_mp->b_prev != NULL) {
828 				ill_t   *inbound_ill;
829 				queue_t *fwdq = NULL;
830 				uint_t ifindex;
831 
832 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
833 				inbound_ill = ill_lookup_on_ifindex(ifindex,
834 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
835 				if (inbound_ill == NULL) {
836 					data_mp->b_prev = NULL;
837 					freemsg(mp);
838 					return;
839 				} else {
840 					fwdq = inbound_ill->ill_rq;
841 				}
842 				data_mp->b_prev = NULL;
843 				/*
844 				 * Send a forwarded packet back into ip_rput_v6
845 				 * just as in ire_send_v6().
846 				 * Extract the queue from b_prev (set in
847 				 * ip_rput_data_v6).
848 				 */
849 				if (fwdq != NULL) {
850 					/*
851 					 * Forwarded packets hop count will
852 					 * get decremented in ip_rput_data_v6
853 					 */
854 					if (data_mp != mp)
855 						freeb(mp);
856 					put(fwdq, data_mp);
857 				} else {
858 					/*
859 					 * Send locally originated packets back
860 					 * into * ip_wput_v6.
861 					 */
862 					put(ill->ill_wq, mp);
863 				}
864 				ill_refrele(inbound_ill);
865 			} else {
866 				put(ill->ill_wq, mp);
867 			}
868 			mp = nxt_mp;
869 		}
870 		return;
871 	}
872 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
873 	if (!is_adv) {
874 		/* If this is a SOLICITATION request only */
875 		if (ll_changed)
876 			nce_update(nce, ND_STALE, hw_addr);
877 		mutex_exit(&nce->nce_lock);
878 		return;
879 	}
880 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
881 		/* If in any other state than REACHABLE, ignore */
882 		if (nce->nce_state == ND_REACHABLE) {
883 			nce_update(nce, ND_STALE, NULL);
884 		}
885 		mutex_exit(&nce->nce_lock);
886 		return;
887 	} else {
888 		if (ll_changed) {
889 			nce_update(nce, ND_UNCHANGED, hw_addr);
890 			ll_updated = B_TRUE;
891 		}
892 		if (flag & ND_NA_FLAG_SOLICITED) {
893 			nce_update(nce, ND_REACHABLE, NULL);
894 		} else {
895 			if (ll_updated) {
896 				nce_update(nce, ND_STALE, NULL);
897 			}
898 		}
899 		mutex_exit(&nce->nce_lock);
900 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
901 		    NCE_F_ISROUTER)) {
902 			ire_t *ire;
903 
904 			/*
905 			 * Router turned to host.  We need to remove the
906 			 * entry as well as any default route that may be
907 			 * using this as a next hop.  This is required by
908 			 * section 7.2.5 of RFC 2461.
909 			 */
910 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
911 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
912 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
913 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
914 			    MATCH_IRE_DEFAULT, ipst);
915 			if (ire != NULL) {
916 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
917 				ire_delete(ire);
918 				ire_refrele(ire);
919 			}
920 			ndp_delete(nce);
921 		}
922 	}
923 }
924 
925 /*
926  * Pass arg1 to the pfi supplied, along with each nce in existence.
927  * ndp_walk() places a REFHOLD on the nce and drops the lock when
928  * walking the hash list.
929  */
930 void
931 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
932     boolean_t trace)
933 {
934 
935 	nce_t	*nce;
936 	nce_t	*nce1;
937 	nce_t	**ncep;
938 	nce_t	*free_nce_list = NULL;
939 
940 	mutex_enter(&ndp->ndp_g_lock);
941 	/* Prevent ndp_delete from unlink and free of NCE */
942 	ndp->ndp_g_walker++;
943 	mutex_exit(&ndp->ndp_g_lock);
944 	for (ncep = ndp->nce_hash_tbl;
945 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
946 		for (nce = *ncep; nce != NULL; nce = nce1) {
947 			nce1 = nce->nce_next;
948 			if (ill == NULL || nce->nce_ill == ill) {
949 				if (trace) {
950 					NCE_REFHOLD(nce);
951 					(*pfi)(nce, arg1);
952 					NCE_REFRELE(nce);
953 				} else {
954 					NCE_REFHOLD_NOTR(nce);
955 					(*pfi)(nce, arg1);
956 					NCE_REFRELE_NOTR(nce);
957 				}
958 			}
959 		}
960 	}
961 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
962 		nce1 = nce->nce_next;
963 		if (ill == NULL || nce->nce_ill == ill) {
964 			if (trace) {
965 				NCE_REFHOLD(nce);
966 				(*pfi)(nce, arg1);
967 				NCE_REFRELE(nce);
968 			} else {
969 				NCE_REFHOLD_NOTR(nce);
970 				(*pfi)(nce, arg1);
971 				NCE_REFRELE_NOTR(nce);
972 			}
973 		}
974 	}
975 	mutex_enter(&ndp->ndp_g_lock);
976 	ndp->ndp_g_walker--;
977 	/*
978 	 * While NCE's are removed from global list they are placed
979 	 * in a private list, to be passed to nce_ire_delete_list().
980 	 * The reason is, there may be ires pointing to this nce
981 	 * which needs to cleaned up.
982 	 */
983 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
984 		/* Time to delete condemned entries */
985 		for (ncep = ndp->nce_hash_tbl;
986 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
987 			nce = *ncep;
988 			if (nce != NULL) {
989 				nce_remove(ndp, nce, &free_nce_list);
990 			}
991 		}
992 		nce = ndp->nce_mask_entries;
993 		if (nce != NULL) {
994 			nce_remove(ndp, nce, &free_nce_list);
995 		}
996 		ndp->ndp_g_walker_cleanup = B_FALSE;
997 	}
998 
999 	mutex_exit(&ndp->ndp_g_lock);
1000 
1001 	if (free_nce_list != NULL) {
1002 		nce_ire_delete_list(free_nce_list);
1003 	}
1004 }
1005 
1006 /*
1007  * Walk everything.
1008  * Note that ill can be NULL hence can't derive the ipst from it.
1009  */
1010 void
1011 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1012 {
1013 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1014 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1015 }
1016 
1017 /*
1018  * Process resolve requests.  Handles both mapped entries
1019  * as well as cases that needs to be send out on the wire.
1020  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1021  * or one is created, we defer making ire point to nce until the
1022  * ire is actually added at which point the nce_refcnt on the nce is
1023  * incremented.  This is done primarily to have symmetry between ire_add()
1024  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1025  */
1026 int
1027 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1028 {
1029 	nce_t		*nce;
1030 	int		err = 0;
1031 	uint32_t	ms;
1032 	mblk_t		*mp_nce = NULL;
1033 	ip_stack_t	*ipst = ill->ill_ipst;
1034 
1035 	ASSERT(ill->ill_isv6);
1036 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1037 		err = nce_set_multicast(ill, dst);
1038 		return (err);
1039 	}
1040 	err = ndp_lookup_then_add_v6(ill,
1041 	    NULL,	/* No hardware address */
1042 	    dst,
1043 	    &ipv6_all_ones,
1044 	    &ipv6_all_zeros,
1045 	    0,
1046 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1047 	    ND_INCOMPLETE,
1048 	    &nce);
1049 
1050 	switch (err) {
1051 	case 0:
1052 		/*
1053 		 * New cache entry was created. Make sure that the state
1054 		 * is not ND_INCOMPLETE. It can be in some other state
1055 		 * even before we send out the solicitation as we could
1056 		 * get un-solicited advertisements.
1057 		 *
1058 		 * If this is an XRESOLV interface, simply return 0,
1059 		 * since we don't want to solicit just yet.
1060 		 */
1061 		if (ill->ill_flags & ILLF_XRESOLV) {
1062 			NCE_REFRELE(nce);
1063 			return (0);
1064 		}
1065 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1066 		mutex_enter(&nce->nce_lock);
1067 		if (nce->nce_state != ND_INCOMPLETE) {
1068 			mutex_exit(&nce->nce_lock);
1069 			rw_exit(&ipst->ips_ill_g_lock);
1070 			NCE_REFRELE(nce);
1071 			return (0);
1072 		}
1073 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1074 		if (mp_nce == NULL) {
1075 			/* The caller will free mp */
1076 			mutex_exit(&nce->nce_lock);
1077 			rw_exit(&ipst->ips_ill_g_lock);
1078 			ndp_delete(nce);
1079 			NCE_REFRELE(nce);
1080 			return (ENOMEM);
1081 		}
1082 		ms = nce_solicit(nce, mp_nce);
1083 		rw_exit(&ipst->ips_ill_g_lock);
1084 		if (ms == 0) {
1085 			/* The caller will free mp */
1086 			if (mp_nce != mp)
1087 				freeb(mp_nce);
1088 			mutex_exit(&nce->nce_lock);
1089 			ndp_delete(nce);
1090 			NCE_REFRELE(nce);
1091 			return (EBUSY);
1092 		}
1093 		mutex_exit(&nce->nce_lock);
1094 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1095 		NCE_REFRELE(nce);
1096 		return (EINPROGRESS);
1097 	case EEXIST:
1098 		/* Resolution in progress just queue the packet */
1099 		mutex_enter(&nce->nce_lock);
1100 		if (nce->nce_state == ND_INCOMPLETE) {
1101 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1102 			if (mp_nce == NULL) {
1103 				err = ENOMEM;
1104 			} else {
1105 				nce_queue_mp(nce, mp_nce);
1106 				err = EINPROGRESS;
1107 			}
1108 		} else {
1109 			/*
1110 			 * Any other state implies we have
1111 			 * a nce but IRE needs to be added ...
1112 			 * ire_add_v6() will take care of the
1113 			 * the case when the nce becomes CONDEMNED
1114 			 * before the ire is added to the table.
1115 			 */
1116 			err = 0;
1117 		}
1118 		mutex_exit(&nce->nce_lock);
1119 		NCE_REFRELE(nce);
1120 		break;
1121 	default:
1122 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1123 		break;
1124 	}
1125 	return (err);
1126 }
1127 
1128 /*
1129  * When there is no resolver, the link layer template is passed in
1130  * the IRE.
1131  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1132  * or one is created, we defer making ire point to nce until the
1133  * ire is actually added at which point the nce_refcnt on the nce is
1134  * incremented.  This is done primarily to have symmetry between ire_add()
1135  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1136  */
1137 int
1138 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1139 {
1140 	nce_t		*nce;
1141 	int		err = 0;
1142 
1143 	ASSERT(ill != NULL);
1144 	ASSERT(ill->ill_isv6);
1145 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1146 		err = nce_set_multicast(ill, dst);
1147 		return (err);
1148 	}
1149 
1150 	err = ndp_lookup_then_add_v6(ill,
1151 	    NULL,	/* hardware address */
1152 	    dst,
1153 	    &ipv6_all_ones,
1154 	    &ipv6_all_zeros,
1155 	    0,
1156 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1157 	    ND_REACHABLE,
1158 	    &nce);
1159 
1160 	switch (err) {
1161 	case 0:
1162 		/*
1163 		 * Cache entry with a proper resolver cookie was
1164 		 * created.
1165 		 */
1166 		NCE_REFRELE(nce);
1167 		break;
1168 	case EEXIST:
1169 		err = 0;
1170 		NCE_REFRELE(nce);
1171 		break;
1172 	default:
1173 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1174 		break;
1175 	}
1176 	return (err);
1177 }
1178 
1179 /*
1180  * For each interface an entry is added for the unspecified multicast group.
1181  * Here that mapping is used to form the multicast cache entry for a particular
1182  * multicast destination.
1183  */
1184 static int
1185 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1186 {
1187 	nce_t		*mnce;	/* Multicast mapping entry */
1188 	nce_t		*nce;
1189 	uchar_t		*hw_addr = NULL;
1190 	int		err = 0;
1191 	ip_stack_t	*ipst = ill->ill_ipst;
1192 
1193 	ASSERT(ill != NULL);
1194 	ASSERT(ill->ill_isv6);
1195 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1196 
1197 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1198 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1199 	nce = nce_lookup_addr(ill, dst, nce);
1200 	if (nce != NULL) {
1201 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1202 		NCE_REFRELE(nce);
1203 		return (0);
1204 	}
1205 	/* No entry, now lookup for a mapping this should never fail */
1206 	mnce = nce_lookup_mapping(ill, dst);
1207 	if (mnce == NULL) {
1208 		/* Something broken for the interface. */
1209 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1210 		return (ESRCH);
1211 	}
1212 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1213 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1214 		/*
1215 		 * For IRE_IF_RESOLVER a hardware mapping can be
1216 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1217 		 * in the ill is copied in ndp_add_v6().
1218 		 */
1219 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1220 		if (hw_addr == NULL) {
1221 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1222 			NCE_REFRELE(mnce);
1223 			return (ENOMEM);
1224 		}
1225 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1226 	}
1227 	NCE_REFRELE(mnce);
1228 	/*
1229 	 * IRE_IF_NORESOLVER type simply copies the resolution
1230 	 * cookie passed in.  So no hw_addr is needed.
1231 	 */
1232 	err = ndp_add_v6(ill,
1233 	    hw_addr,
1234 	    dst,
1235 	    &ipv6_all_ones,
1236 	    &ipv6_all_zeros,
1237 	    0,
1238 	    NCE_F_NONUD,
1239 	    ND_REACHABLE,
1240 	    &nce);
1241 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1242 	if (hw_addr != NULL)
1243 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1244 	if (err != 0) {
1245 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1246 		return (err);
1247 	}
1248 	NCE_REFRELE(nce);
1249 	return (0);
1250 }
1251 
1252 /*
1253  * Return the link layer address, and any flags of a nce.
1254  */
1255 int
1256 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1257 {
1258 	nce_t		*nce;
1259 	in6_addr_t	*addr;
1260 	sin6_t		*sin6;
1261 	dl_unitdata_req_t	*dl;
1262 
1263 	ASSERT(ill != NULL && ill->ill_isv6);
1264 	sin6 = (sin6_t *)&lnr->lnr_addr;
1265 	addr =  &sin6->sin6_addr;
1266 
1267 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1268 	if (nce == NULL)
1269 		return (ESRCH);
1270 	/* If in INCOMPLETE state, no link layer address is available yet */
1271 	if (nce->nce_state == ND_INCOMPLETE)
1272 		goto done;
1273 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1274 	if (ill->ill_flags & ILLF_XRESOLV)
1275 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1276 	else
1277 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1278 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1279 	    sizeof (lnr->lnr_hdw_addr));
1280 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1281 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1282 	if (nce->nce_flags & NCE_F_ISROUTER)
1283 		lnr->lnr_flags = NDF_ISROUTER_ON;
1284 	if (nce->nce_flags & NCE_F_ANYCAST)
1285 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1286 done:
1287 	NCE_REFRELE(nce);
1288 	return (0);
1289 }
1290 
1291 /*
1292  * Send Enable/Disable multicast reqs to driver.
1293  */
1294 int
1295 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1296     uint32_t hw_addr_offset, mblk_t *mp)
1297 {
1298 	nce_t		*nce;
1299 	uchar_t		*hw_addr;
1300 	ip_stack_t	*ipst = ill->ill_ipst;
1301 
1302 	ASSERT(ill != NULL && ill->ill_isv6);
1303 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1304 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1305 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1306 		freemsg(mp);
1307 		return (EINVAL);
1308 	}
1309 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1310 	nce = nce_lookup_mapping(ill, addr);
1311 	if (nce == NULL) {
1312 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1313 		freemsg(mp);
1314 		return (ESRCH);
1315 	}
1316 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1317 	/*
1318 	 * Update dl_addr_length and dl_addr_offset for primitives that
1319 	 * have physical addresses as opposed to full saps
1320 	 */
1321 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1322 	case DL_ENABMULTI_REQ:
1323 		/* Track the state if this is the first enabmulti */
1324 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1325 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1326 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1327 		break;
1328 	case DL_DISABMULTI_REQ:
1329 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1330 		break;
1331 	default:
1332 		NCE_REFRELE(nce);
1333 		ip1dbg(("ndp_mcastreq: default\n"));
1334 		return (EINVAL);
1335 	}
1336 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1337 	NCE_REFRELE(nce);
1338 	ill_dlpi_send(ill, mp);
1339 	return (0);
1340 }
1341 
/*
 * Send a neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
 * The caller must also hold ips_ill_g_lock as reader (asserted below).
 *
 * If mp is NULL, the packet at the head of nce_qd_mp is used to work out
 * the source address; otherwise mp is first queued on the nce.
 *
 * NOTE: This routine drops nce_lock (and later reacquires it) when sending
 * the packet.  ips_ill_g_lock is dropped and reacquired around the send as
 * well.
 * NOTE: This routine does not consume mp.
 */
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
	ill_t		*ill;
	ill_t		*src_ill;
	ip6_t		*ip6h;
	in6_addr_t	src;
	in6_addr_t	dst;
	ipif_t		*ipif;
	ip6i_t		*ip6i;
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;

	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ill = nce->nce_ill;
	ASSERT(ill != NULL);

	/* No retransmissions left: tell the caller to abort. */
	if (nce->nce_rcnt == 0) {
		return (0);
	}

	if (mp == NULL) {
		ASSERT(nce->nce_qd_mp != NULL);
		mp = nce->nce_qd_mp;
	} else {
		nce_queue_mp(nce, mp);
	}

	/* Handle ip_newroute_v6 giving us IPSEC packets */
	if (mp->b_datap->db_type == M_CTL)
		mp = mp->b_cont;

	ip6h = (ip6_t *)mp->b_rptr;
	if (ip6h->ip6_nxt == IPPROTO_RAW) {
		/*
		 * This message should have been pulled up already in
		 * ip_wput_v6. We can't do pullups here because the message
		 * could be from the nce_qd_mp which could have b_next/b_prev
		 * non-NULL.
		 */
		ip6i = (ip6i_t *)ip6h;
		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
		    sizeof (ip6i_t) + IPV6_HDR_LEN);
		/* The real IPv6 header follows the ip6i_t. */
		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	}
	src = ip6h->ip6_src;
	/*
	 * If the src of outgoing packet is one of the assigned interface
	 * addresses use it, otherwise we will pick the source address below.
	 */
	src_ill = ill;
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* Search every ill in the group, if the ill belongs to one. */
		if (ill->ill_group != NULL)
			src_ill = ill->ill_group->illgrp_ill;
		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
			for (ipif = src_ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				if (IN6_ARE_ADDR_EQUAL(&src,
				    &ipif->ipif_v6lcl_addr)) {
					break;
				}
			}
			/* A non-NULL ipif here means the address matched. */
			if (ipif != NULL)
				break;
		}
		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and let nce_xmit pick one.  If an
		 * ipif can be found, but it's not yet done with DAD
		 * verification, then just postpone this transmission until
		 * later.
		 */
		if (src_ill == NULL)
			src = ipv6_all_zeros;
		else if (!ipif->ipif_addr_ready)
			return (ill->ill_reachable_retrans_time);
	}
	dst = nce->nce_addr;
	/*
	 * If source address is unspecified, nce_xmit will choose
	 * one for us and initialize the hardware address also
	 * appropriately.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&src))
		src_ill = NULL;
	/* Charge this attempt now; it is refunded below if the send fails. */
	nce->nce_rcnt--;
	mutex_exit(&nce->nce_lock);
	rw_exit(&ipst->ips_ill_g_lock);
	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
	    &dst, 0);
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	if (dropped)
		nce->nce_rcnt++;
	return (ill->ill_reachable_retrans_time);
}
1448 
1449 /*
1450  * Attempt to recover an address on an interface that's been marked as a
1451  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1452  * no easy way to just probe the address and have the right thing happen if
1453  * it's no longer in use.  Instead, we just bring it up normally and allow the
1454  * regular interface start-up logic to probe for a remaining duplicate and take
1455  * us back down if necessary.
1456  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1457  * ip_ndp_excl.
1458  */
1459 /* ARGSUSED */
1460 static void
1461 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1462 {
1463 	ill_t	*ill = rq->q_ptr;
1464 	ipif_t	*ipif;
1465 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1466 
1467 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1468 		/*
1469 		 * We do not support recovery of proxy ARP'd interfaces,
1470 		 * because the system lacks a complete proxy ARP mechanism.
1471 		 */
1472 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1473 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1474 			continue;
1475 		}
1476 
1477 		/*
1478 		 * If we have already recovered or if the interface is going
1479 		 * away, then ignore.
1480 		 */
1481 		mutex_enter(&ill->ill_lock);
1482 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1483 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1484 			mutex_exit(&ill->ill_lock);
1485 			continue;
1486 		}
1487 
1488 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1489 		ill->ill_ipif_dup_count--;
1490 		mutex_exit(&ill->ill_lock);
1491 		ipif->ipif_was_dup = B_TRUE;
1492 
1493 		if (ipif_ndp_up(ipif) != EINPROGRESS)
1494 			(void) ipif_up_done_v6(ipif);
1495 	}
1496 	freeb(mp);
1497 }
1498 
1499 /*
1500  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1501  * As long as someone else holds the address, the interface will stay down.
1502  * When that conflict goes away, the interface is brought back up.  This is
1503  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1504  * server will recover from a failure.
1505  *
1506  * For DHCP and temporary addresses, recovery is not done in the kernel.
1507  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1508  *
1509  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1510  */
1511 static void
1512 ipif6_dup_recovery(void *arg)
1513 {
1514 	ipif_t *ipif = arg;
1515 
1516 	ipif->ipif_recovery_id = 0;
1517 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1518 		return;
1519 
1520 	/*
1521 	 * No lock, because this is just an optimization.
1522 	 */
1523 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1524 		return;
1525 
1526 	/* If the link is down, we'll retry this later */
1527 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1528 		return;
1529 
1530 	ndp_do_recovery(ipif);
1531 }
1532 
1533 /*
1534  * Perform interface recovery by forcing the duplicate interfaces up and
1535  * allowing the system to determine which ones should stay up.
1536  *
1537  * Called both by recovery timer expiry and link-up notification.
1538  */
1539 void
1540 ndp_do_recovery(ipif_t *ipif)
1541 {
1542 	ill_t *ill = ipif->ipif_ill;
1543 	mblk_t *mp;
1544 	ip_stack_t *ipst = ill->ill_ipst;
1545 
1546 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1547 	if (mp == NULL) {
1548 		mutex_enter(&ill->ill_lock);
1549 		if (ipif->ipif_recovery_id == 0 &&
1550 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1551 		    IPIF_CONDEMNED))) {
1552 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1553 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1554 		}
1555 		mutex_exit(&ill->ill_lock);
1556 	} else {
1557 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1558 		    sizeof (ipif->ipif_v6lcl_addr));
1559 		ill_refhold(ill);
1560 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1561 		    B_FALSE);
1562 	}
1563 }
1564 
1565 /*
1566  * Find the solicitation in the given message, and extract printable details
1567  * (MAC and IP addresses) from it.
1568  */
1569 static nd_neighbor_solicit_t *
1570 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1571     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1572 {
1573 	nd_neighbor_solicit_t *ns;
1574 	ip6_t *ip6h;
1575 	uchar_t *addr;
1576 	int alen;
1577 
1578 	alen = 0;
1579 	ip6h = (ip6_t *)mp->b_rptr;
1580 	if (dl_mp == NULL) {
1581 		nd_opt_hdr_t *opt;
1582 		int nslen;
1583 
1584 		/*
1585 		 * If it's from the fast-path, then it can't be a probe
1586 		 * message, and thus must include the source linkaddr option.
1587 		 * Extract that here.
1588 		 */
1589 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1590 		nslen = mp->b_wptr - (uchar_t *)ns;
1591 		if ((nslen -= sizeof (*ns)) > 0) {
1592 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1593 			    ND_OPT_SOURCE_LINKADDR);
1594 			if (opt != NULL &&
1595 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1596 			    ill->ill_nd_lla_len) {
1597 				addr = (uchar_t *)(opt + 1);
1598 				alen = ill->ill_nd_lla_len;
1599 			}
1600 		}
1601 		/*
1602 		 * We cheat a bit here for the sake of printing usable log
1603 		 * messages in the rare case where the reply we got was unicast
1604 		 * without a source linkaddr option, and the interface is in
1605 		 * fastpath mode.  (Sigh.)
1606 		 */
1607 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1608 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1609 			struct ether_header *pether;
1610 
1611 			pether = (struct ether_header *)((char *)ip6h -
1612 			    sizeof (*pether));
1613 			addr = pether->ether_shost.ether_addr_octet;
1614 			alen = ETHERADDRL;
1615 		}
1616 	} else {
1617 		dl_unitdata_ind_t *dlu;
1618 
1619 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1620 		alen = dlu->dl_src_addr_length;
1621 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1622 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1623 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1624 			if (ill->ill_sap_length < 0) {
1625 				alen += ill->ill_sap_length;
1626 			} else {
1627 				addr += ill->ill_sap_length;
1628 				alen -= ill->ill_sap_length;
1629 			}
1630 		}
1631 	}
1632 	if (alen > 0) {
1633 		*haddr = addr;
1634 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1635 	} else {
1636 		*haddr = NULL;
1637 		(void) strcpy(hbuf, "?");
1638 	}
1639 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1640 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1641 	return (ns);
1642 }
1643 
1644 /*
1645  * This is for exclusive changes due to NDP duplicate address detection
1646  * failure.
1647  */
1648 /* ARGSUSED */
1649 static void
1650 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1651 {
1652 	ill_t	*ill = rq->q_ptr;
1653 	ipif_t	*ipif;
1654 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1655 	char hbuf[MAC_STR_LEN];
1656 	char sbuf[INET6_ADDRSTRLEN];
1657 	nd_neighbor_solicit_t *ns;
1658 	mblk_t *dl_mp = NULL;
1659 	uchar_t *haddr;
1660 	ip_stack_t *ipst = ill->ill_ipst;
1661 
1662 	if (DB_TYPE(mp) != M_DATA) {
1663 		dl_mp = mp;
1664 		mp = mp->b_cont;
1665 	}
1666 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1667 	    sizeof (sbuf), &haddr);
1668 	if (haddr != NULL &&
1669 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1670 		/*
1671 		 * Ignore conflicts generated by misbehaving switches that just
1672 		 * reflect our own messages back to us.
1673 		 */
1674 		goto ignore_conflict;
1675 	}
1676 
1677 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1678 
1679 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1680 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1681 		    &ns->nd_ns_target)) {
1682 			continue;
1683 		}
1684 
1685 		/* If it's already marked, then don't do anything. */
1686 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1687 			continue;
1688 
1689 		/*
1690 		 * If this is a failure during duplicate recovery, then don't
1691 		 * complain.  It may take a long time to recover.
1692 		 */
1693 		if (!ipif->ipif_was_dup) {
1694 			ipif_get_name(ipif, ibuf, sizeof (ibuf));
1695 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1696 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1697 		}
1698 		mutex_enter(&ill->ill_lock);
1699 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1700 		ipif->ipif_flags |= IPIF_DUPLICATE;
1701 		ill->ill_ipif_dup_count++;
1702 		mutex_exit(&ill->ill_lock);
1703 		(void) ipif_down(ipif, NULL, NULL);
1704 		ipif_down_tail(ipif);
1705 		mutex_enter(&ill->ill_lock);
1706 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1707 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1708 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1709 		    IPIF_CONDEMNED)) &&
1710 		    ipst->ips_ip_dup_recovery > 0) {
1711 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1712 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1713 		}
1714 		mutex_exit(&ill->ill_lock);
1715 	}
1716 ignore_conflict:
1717 	if (dl_mp != NULL)
1718 		freeb(dl_mp);
1719 	freemsg(mp);
1720 }
1721 
1722 /*
1723  * Handle failure by tearing down the ipifs with the specified address.  Note
1724  * that tearing down the ipif also means deleting the nce through ipif_down, so
1725  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1726  * we start a timer on the ipif.
1727  */
1728 static void
1729 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1730 {
1731 	if ((mp = copymsg(mp)) != NULL) {
1732 		if (dl_mp == NULL)
1733 			dl_mp = mp;
1734 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1735 			dl_mp->b_cont = mp;
1736 		if (dl_mp == NULL) {
1737 			freemsg(mp);
1738 		} else {
1739 			ill_refhold(ill);
1740 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1741 			    B_FALSE);
1742 		}
1743 	}
1744 	ndp_delete(nce);
1745 }
1746 
1747 /*
1748  * Handle a discovered conflict: some other system is advertising that it owns
1749  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1750  * interface.
1751  */
1752 static void
1753 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1754 {
1755 	ipif_t *ipif;
1756 	uint32_t now;
1757 	uint_t maxdefense;
1758 	uint_t defs;
1759 	ip_stack_t *ipst = ill->ill_ipst;
1760 
1761 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1762 	    NULL, NULL, ipst);
1763 	if (ipif == NULL)
1764 		return;
1765 	/*
1766 	 * First, figure out if this address is disposable.
1767 	 */
1768 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1769 		maxdefense = ipst->ips_ip_max_temp_defend;
1770 	else
1771 		maxdefense = ipst->ips_ip_max_defend;
1772 
1773 	/*
1774 	 * Now figure out how many times we've defended ourselves.  Ignore
1775 	 * defenses that happened long in the past.
1776 	 */
1777 	now = gethrestime_sec();
1778 	mutex_enter(&nce->nce_lock);
1779 	if ((defs = nce->nce_defense_count) > 0 &&
1780 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1781 		nce->nce_defense_count = defs = 0;
1782 	}
1783 	nce->nce_defense_count++;
1784 	nce->nce_defense_time = now;
1785 	mutex_exit(&nce->nce_lock);
1786 	ipif_refrele(ipif);
1787 
1788 	/*
1789 	 * If we've defended ourselves too many times already, then give up and
1790 	 * tear down the interface(s) using this address.  Otherwise, defend by
1791 	 * sending out an unsolicited Neighbor Advertisement.
1792 	 */
1793 	if (defs >= maxdefense) {
1794 		ip_ndp_failure(ill, mp, dl_mp, nce);
1795 	} else {
1796 		char hbuf[MAC_STR_LEN];
1797 		char sbuf[INET6_ADDRSTRLEN];
1798 		uchar_t *haddr;
1799 
1800 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1801 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1802 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1803 		    hbuf, sbuf, ill->ill_name);
1804 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1805 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1806 		    nce_advert_flags(nce));
1807 	}
1808 }
1809 
1810 static void
1811 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1812 {
1813 	nd_neighbor_solicit_t *ns;
1814 	uint32_t	hlen = ill->ill_nd_lla_len;
1815 	uchar_t		*haddr = NULL;
1816 	icmp6_t		*icmp_nd;
1817 	ip6_t		*ip6h;
1818 	nce_t		*our_nce = NULL;
1819 	in6_addr_t	target;
1820 	in6_addr_t	src;
1821 	int		len;
1822 	int		flag = 0;
1823 	nd_opt_hdr_t	*opt = NULL;
1824 	boolean_t	bad_solicit = B_FALSE;
1825 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1826 
1827 	ip6h = (ip6_t *)mp->b_rptr;
1828 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1829 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1830 	src = ip6h->ip6_src;
1831 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1832 	target = ns->nd_ns_target;
1833 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1834 		if (ip_debug > 2) {
1835 			/* ip1dbg */
1836 			pr_addr_dbg("ndp_input_solicit: Target is"
1837 			    " multicast! %s\n", AF_INET6, &target);
1838 		}
1839 		bad_solicit = B_TRUE;
1840 		goto done;
1841 	}
1842 	if (len > sizeof (nd_neighbor_solicit_t)) {
1843 		/* Options present */
1844 		opt = (nd_opt_hdr_t *)&ns[1];
1845 		len -= sizeof (nd_neighbor_solicit_t);
1846 		if (!ndp_verify_optlen(opt, len)) {
1847 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1848 			bad_solicit = B_TRUE;
1849 			goto done;
1850 		}
1851 	}
1852 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1853 		/* Check to see if this is a valid DAD solicitation */
1854 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1855 			if (ip_debug > 2) {
1856 				/* ip1dbg */
1857 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1858 				    "Destination is not solicited node "
1859 				    "multicast %s\n", AF_INET6,
1860 				    &ip6h->ip6_dst);
1861 			}
1862 			bad_solicit = B_TRUE;
1863 			goto done;
1864 		}
1865 	}
1866 
1867 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1868 	/*
1869 	 * If this is a valid Solicitation, a permanent
1870 	 * entry should exist in the cache
1871 	 */
1872 	if (our_nce == NULL ||
1873 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1874 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1875 		    "ifname=%s ", ill->ill_name));
1876 		if (ip_debug > 2) {
1877 			/* ip1dbg */
1878 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1879 		}
1880 		bad_solicit = B_TRUE;
1881 		goto done;
1882 	}
1883 
1884 	/* At this point we should have a verified NS per spec */
1885 	if (opt != NULL) {
1886 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1887 		if (opt != NULL) {
1888 			haddr = (uchar_t *)&opt[1];
1889 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1890 			    hlen == 0) {
1891 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1892 				bad_solicit = B_TRUE;
1893 				goto done;
1894 			}
1895 		}
1896 	}
1897 
1898 	/* If sending directly to peer, set the unicast flag */
1899 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1900 		flag |= NDP_UNICAST;
1901 
1902 	/*
1903 	 * Create/update the entry for the soliciting node.
1904 	 * or respond to outstanding queries, don't if
1905 	 * the source is unspecified address.
1906 	 */
1907 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1908 		int	err;
1909 		nce_t	*nnce;
1910 
1911 		ASSERT(ill->ill_isv6);
1912 		/*
1913 		 * Regular solicitations *must* include the Source Link-Layer
1914 		 * Address option.  Ignore messages that do not.
1915 		 */
1916 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1917 			ip1dbg(("ndp_input_solicit: source link-layer address "
1918 			    "option missing with a specified source.\n"));
1919 			bad_solicit = B_TRUE;
1920 			goto done;
1921 		}
1922 
1923 		/*
1924 		 * This is a regular solicitation.  If we're still in the
1925 		 * process of verifying the address, then don't respond at all
1926 		 * and don't keep track of the sender.
1927 		 */
1928 		if (our_nce->nce_state == ND_PROBE)
1929 			goto done;
1930 
1931 		/*
1932 		 * If the solicitation doesn't have sender hardware address
1933 		 * (legal for unicast solicitation), then process without
1934 		 * installing the return NCE.  Either we already know it, or
1935 		 * we'll be forced to look it up when (and if) we reply to the
1936 		 * packet.
1937 		 */
1938 		if (haddr == NULL)
1939 			goto no_source;
1940 
1941 		err = ndp_lookup_then_add_v6(ill,
1942 		    haddr,
1943 		    &src,	/* Soliciting nodes address */
1944 		    &ipv6_all_ones,
1945 		    &ipv6_all_zeros,
1946 		    0,
1947 		    0,
1948 		    ND_STALE,
1949 		    &nnce);
1950 		switch (err) {
1951 		case 0:
1952 			/* done with this entry */
1953 			NCE_REFRELE(nnce);
1954 			break;
1955 		case EEXIST:
1956 			/*
1957 			 * B_FALSE indicates this is not an
1958 			 * an advertisement.
1959 			 */
1960 			ndp_process(nnce, haddr, 0, B_FALSE);
1961 			NCE_REFRELE(nnce);
1962 			break;
1963 		default:
1964 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1965 			    err));
1966 			goto done;
1967 		}
1968 no_source:
1969 		flag |= NDP_SOLICITED;
1970 	} else {
1971 		/*
1972 		 * No source link layer address option should be present in a
1973 		 * valid DAD request.
1974 		 */
1975 		if (haddr != NULL) {
1976 			ip1dbg(("ndp_input_solicit: source link-layer address "
1977 			    "option present with an unspecified source.\n"));
1978 			bad_solicit = B_TRUE;
1979 			goto done;
1980 		}
1981 		if (our_nce->nce_state == ND_PROBE) {
1982 			/*
1983 			 * Internally looped-back probes won't have DLPI
1984 			 * attached to them.  External ones (which are sent by
1985 			 * multicast) always will.  Just ignore our own
1986 			 * transmissions.
1987 			 */
1988 			if (dl_mp != NULL) {
1989 				/*
1990 				 * If someone else is probing our address, then
1991 				 * we've crossed wires.  Declare failure.
1992 				 */
1993 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
1994 			}
1995 			goto done;
1996 		}
1997 		/*
1998 		 * This is a DAD probe.  Multicast the advertisement to the
1999 		 * all-nodes address.
2000 		 */
2001 		src = ipv6_all_hosts_mcast;
2002 	}
2003 	flag |= nce_advert_flags(our_nce);
2004 	/* Response to a solicitation */
2005 	(void) nce_xmit(ill,
2006 	    ND_NEIGHBOR_ADVERT,
2007 	    ill,	/* ill to be used for extracting ill_nd_lla */
2008 	    B_TRUE,	/* use ill_nd_lla */
2009 	    &target,	/* Source and target of the advertisement pkt */
2010 	    &src,	/* IP Destination (source of original pkt) */
2011 	    flag);
2012 done:
2013 	if (bad_solicit)
2014 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2015 	if (our_nce != NULL)
2016 		NCE_REFRELE(our_nce);
2017 }
2018 
2019 void
2020 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2021 {
2022 	nd_neighbor_advert_t *na;
2023 	uint32_t	hlen = ill->ill_nd_lla_len;
2024 	uchar_t		*haddr = NULL;
2025 	icmp6_t		*icmp_nd;
2026 	ip6_t		*ip6h;
2027 	nce_t		*dst_nce = NULL;
2028 	in6_addr_t	target;
2029 	nd_opt_hdr_t	*opt = NULL;
2030 	int		len;
2031 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2032 	ip_stack_t	*ipst = ill->ill_ipst;
2033 
2034 	ip6h = (ip6_t *)mp->b_rptr;
2035 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2036 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2037 	na = (nd_neighbor_advert_t *)icmp_nd;
2038 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2039 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2040 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2041 		    "solicited flag is not zero\n"));
2042 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2043 		return;
2044 	}
2045 	target = na->nd_na_target;
2046 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2047 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2048 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2049 		return;
2050 	}
2051 	if (len > sizeof (nd_neighbor_advert_t)) {
2052 		opt = (nd_opt_hdr_t *)&na[1];
2053 		if (!ndp_verify_optlen(opt,
2054 		    len - sizeof (nd_neighbor_advert_t))) {
2055 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2056 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2057 			return;
2058 		}
2059 		/* At this point we have a verified NA per spec */
2060 		len -= sizeof (nd_neighbor_advert_t);
2061 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2062 		if (opt != NULL) {
2063 			haddr = (uchar_t *)&opt[1];
2064 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2065 			    hlen == 0) {
2066 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2067 				BUMP_MIB(mib,
2068 				    ipv6IfIcmpInBadNeighborAdvertisements);
2069 				return;
2070 			}
2071 		}
2072 	}
2073 
2074 	/*
2075 	 * If this interface is part of the group look at all the
2076 	 * ills in the group.
2077 	 */
2078 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2079 	if (ill->ill_group != NULL)
2080 		ill = ill->ill_group->illgrp_ill;
2081 
2082 	for (; ill != NULL; ill = ill->ill_group_next) {
2083 		mutex_enter(&ill->ill_lock);
2084 		if (!ILL_CAN_LOOKUP(ill)) {
2085 			mutex_exit(&ill->ill_lock);
2086 			continue;
2087 		}
2088 		ill_refhold_locked(ill);
2089 		mutex_exit(&ill->ill_lock);
2090 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2091 		/* We have to drop the lock since ndp_process calls put* */
2092 		rw_exit(&ipst->ips_ill_g_lock);
2093 		if (dst_nce != NULL) {
2094 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2095 			    dst_nce->nce_state == ND_PROBE) {
2096 				/*
2097 				 * Someone else sent an advertisement for an
2098 				 * address that we're trying to configure.
2099 				 * Tear it down.  Note that dl_mp might be NULL
2100 				 * if we're getting a unicast reply.  This
2101 				 * isn't typically done (multicast is the norm
2102 				 * in response to a probe), but ip_ndp_failure
2103 				 * will handle the dl_mp == NULL case as well.
2104 				 */
2105 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2106 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2107 				/*
2108 				 * Someone just announced one of our local
2109 				 * addresses.  If it wasn't us, then this is a
2110 				 * conflict.  Defend the address or shut it
2111 				 * down.
2112 				 */
2113 				if (dl_mp != NULL &&
2114 				    (haddr == NULL ||
2115 				    nce_cmp_ll_addr(dst_nce, haddr,
2116 				    ill->ill_nd_lla_len))) {
2117 					ip_ndp_conflict(ill, mp, dl_mp,
2118 					    dst_nce);
2119 				}
2120 			} else {
2121 				if (na->nd_na_flags_reserved &
2122 				    ND_NA_FLAG_ROUTER) {
2123 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2124 				}
2125 				/* B_TRUE indicates this an advertisement */
2126 				ndp_process(dst_nce, haddr,
2127 				    na->nd_na_flags_reserved, B_TRUE);
2128 			}
2129 			NCE_REFRELE(dst_nce);
2130 		}
2131 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2132 		ill_refrele(ill);
2133 	}
2134 	rw_exit(&ipst->ips_ill_g_lock);
2135 }
2136 
2137 /*
2138  * Process NDP neighbor solicitation/advertisement messages.
2139  * The checksum has already checked o.k before reaching here.
2140  */
2141 void
2142 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2143 {
2144 	icmp6_t		*icmp_nd;
2145 	ip6_t		*ip6h;
2146 	int		len;
2147 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2148 
2149 
2150 	if (!pullupmsg(mp, -1)) {
2151 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2152 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2153 		goto done;
2154 	}
2155 	ip6h = (ip6_t *)mp->b_rptr;
2156 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2157 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2158 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2159 		goto done;
2160 	}
2161 	/*
2162 	 * NDP does not accept any extension headers between the
2163 	 * IP header and the ICMP header since e.g. a routing
2164 	 * header could be dangerous.
2165 	 * This assumes that any AH or ESP headers are removed
2166 	 * by ip prior to passing the packet to ndp_input.
2167 	 */
2168 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2169 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2170 		    ip6h->ip6_nxt));
2171 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2172 		goto done;
2173 	}
2174 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2175 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2176 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2177 	if (icmp_nd->icmp6_code != 0) {
2178 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2179 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2180 		goto done;
2181 	}
2182 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2183 	/*
2184 	 * Make sure packet length is large enough for either
2185 	 * a NS or a NA icmp packet.
2186 	 */
2187 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2188 		ip1dbg(("ndp_input: packet too short\n"));
2189 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2190 		goto done;
2191 	}
2192 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2193 		ndp_input_solicit(ill, mp, dl_mp);
2194 	} else {
2195 		ndp_input_advert(ill, mp, dl_mp);
2196 	}
2197 done:
2198 	freemsg(mp);
2199 }
2200 
2201 /*
2202  * nce_xmit is called to form and transmit a ND solicitation or
2203  * advertisement ICMP packet.
2204  *
2205  * If the source address is unspecified and this isn't a probe (used for
2206  * duplicate address detection), an appropriate source address and link layer
2207  * address will be chosen here.  The link layer address option is included if
2208  * the source is specified (i.e., all non-probe packets), and omitted (per the
2209  * specification) otherwise.
2210  *
2211  * It returns B_FALSE only if it does a successful put() to the
2212  * corresponding ill's ill_wq otherwise returns B_TRUE.
2213  */
2214 static boolean_t
2215 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2216     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2217     int flag)
2218 {
2219 	uint32_t	len;
2220 	icmp6_t 	*icmp6;
2221 	mblk_t		*mp;
2222 	ip6_t		*ip6h;
2223 	nd_opt_hdr_t	*opt;
2224 	uint_t		plen;
2225 	ip6i_t		*ip6i;
2226 	ipif_t		*src_ipif = NULL;
2227 	uint8_t		*hw_addr;
2228 	zoneid_t	zoneid = GLOBAL_ZONEID;
2229 
2230 	/*
2231 	 * If we have a unspecified source(sender) address, select a
2232 	 * proper source address for the solicitation here itself so
2233 	 * that we can initialize the h/w address correctly. This is
2234 	 * needed for interface groups as source address can come from
2235 	 * the whole group and the h/w address initialized from ill will
2236 	 * be wrong if the source address comes from a different ill.
2237 	 *
2238 	 * If the sender is specified then we use this address in order
2239 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2240 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2241 	 * by IP (we cannot guarantee that the global zone has an interface
2242 	 * route to the destination).
2243 	 *
2244 	 * Note that the NA never comes here with the unspecified source
2245 	 * address. The following asserts that whenever the source
2246 	 * address is specified, the haddr also should be specified.
2247 	 */
2248 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2249 
2250 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2251 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2252 		/*
2253 		 * Pick a source address for this solicitation, but
2254 		 * restrict the selection to addresses assigned to the
2255 		 * output interface (or interface group).  We do this
2256 		 * because the destination will create a neighbor cache
2257 		 * entry for the source address of this packet, so the
2258 		 * source address had better be a valid neighbor.
2259 		 */
2260 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2261 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2262 		if (src_ipif == NULL) {
2263 			char buf[INET6_ADDRSTRLEN];
2264 
2265 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2266 			    inet_ntop(AF_INET6, (char *)target, buf,
2267 			    sizeof (buf))));
2268 			return (B_TRUE);
2269 		}
2270 		sender = &src_ipif->ipif_v6src_addr;
2271 		hwaddr_ill = src_ipif->ipif_ill;
2272 	} else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2273 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
2274 		/*
2275 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2276 		 * ALL_ZONES if it cannot find a matching ipif for the address
2277 		 * we are trying to use. In this case we err on the side of
2278 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2279 		 */
2280 		if (zoneid == ALL_ZONES)
2281 			zoneid = GLOBAL_ZONEID;
2282 	}
2283 
2284 	/*
2285 	 * Always make sure that the NS/NA packets don't get load
2286 	 * spread. This is needed so that the probe packets sent
2287 	 * by the in.mpathd daemon can really go out on the desired
2288 	 * interface. Probe packets are made to go out on a desired
2289 	 * interface by including a ip6i with ATTACH_IF flag. As these
2290 	 * packets indirectly end up sending/receiving NS/NA packets
2291 	 * (neighbor doing NUD), we have to make sure that NA
2292 	 * also go out on the same interface.
2293 	 */
2294 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2295 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2296 	    plen * 8;
2297 	mp = allocb(len,  BPRI_LO);
2298 	if (mp == NULL) {
2299 		if (src_ipif != NULL)
2300 			ipif_refrele(src_ipif);
2301 		return (B_TRUE);
2302 	}
2303 	bzero((char *)mp->b_rptr, len);
2304 	mp->b_wptr = mp->b_rptr + len;
2305 
2306 	ip6i = (ip6i_t *)mp->b_rptr;
2307 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2308 	ip6i->ip6i_nxt = IPPROTO_RAW;
2309 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2310 	if (flag & NDP_PROBE)
2311 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2312 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2313 
2314 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2315 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2316 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2317 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2318 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2319 	ip6h->ip6_dst = *target;
2320 	icmp6 = (icmp6_t *)&ip6h[1];
2321 
2322 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2323 	    sizeof (nd_neighbor_advert_t));
2324 
2325 	if (operation == ND_NEIGHBOR_SOLICIT) {
2326 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2327 
2328 		if (!(flag & NDP_PROBE))
2329 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2330 		ip6h->ip6_src = *sender;
2331 		ns->nd_ns_target = *target;
2332 		if (!(flag & NDP_UNICAST)) {
2333 			/* Form multicast address of the target */
2334 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2335 			ip6h->ip6_dst.s6_addr32[3] |=
2336 			    ns->nd_ns_target.s6_addr32[3];
2337 		}
2338 	} else {
2339 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2340 
2341 		ASSERT(!(flag & NDP_PROBE));
2342 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2343 		ip6h->ip6_src = *sender;
2344 		na->nd_na_target = *sender;
2345 		if (flag & NDP_ISROUTER)
2346 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2347 		if (flag & NDP_SOLICITED)
2348 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2349 		if (flag & NDP_ORIDE)
2350 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2351 	}
2352 
2353 	hw_addr = NULL;
2354 	if (!(flag & NDP_PROBE)) {
2355 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2356 		    hwaddr_ill->ill_phys_addr;
2357 		if (hw_addr != NULL) {
2358 			/* Fill in link layer address and option len */
2359 			opt->nd_opt_len = (uint8_t)plen;
2360 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2361 		}
2362 	}
2363 	if (hw_addr == NULL) {
2364 		/* If there's no link layer address option, then strip it. */
2365 		len -= plen * 8;
2366 		mp->b_wptr = mp->b_rptr + len;
2367 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2368 	}
2369 
2370 	icmp6->icmp6_type = (uint8_t)operation;
2371 	icmp6->icmp6_code = 0;
2372 	/*
2373 	 * Prepare for checksum by putting icmp length in the icmp
2374 	 * checksum field. The checksum is calculated in ip_wput_v6.
2375 	 */
2376 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2377 
2378 	if (src_ipif != NULL)
2379 		ipif_refrele(src_ipif);
2380 
2381 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2382 	return (B_FALSE);
2383 }
2384 
2385 /*
2386  * Make a link layer address (does not include the SAP) from an nce.
2387  * To form the link layer address, use the last four bytes of ipv6
2388  * address passed in and the fixed offset stored in nce.
2389  */
2390 static void
2391 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2392 {
2393 	uchar_t *mask, *to;
2394 	ill_t	*ill = nce->nce_ill;
2395 	int 	len;
2396 
2397 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2398 		return;
2399 	ASSERT(nce->nce_res_mp != NULL);
2400 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2401 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2402 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2403 	ASSERT(addr != NULL);
2404 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2405 	    addrpos, ill->ill_nd_lla_len);
2406 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2407 	    IPV6_ADDR_LEN);
2408 	mask = (uchar_t *)&nce->nce_extract_mask;
2409 	mask += (IPV6_ADDR_LEN - len);
2410 	addr += (IPV6_ADDR_LEN - len);
2411 	to = addrpos + nce->nce_ll_extract_start;
2412 	while (len-- > 0)
2413 		*to++ |= *mask++ & *addr++;
2414 }
2415 
2416 /*
2417  * Pass a cache report back out via NDD.
2418  */
2419 /* ARGSUSED */
2420 int
2421 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
2422 {
2423 	ip_stack_t	*ipst;
2424 
2425 	if (CONN_Q(q))
2426 		ipst = CONNQ_TO_IPST(q);
2427 	else
2428 		ipst = ILLQ_TO_IPST(q);
2429 
2430 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
2431 	    "     proto addr/mask");
2432 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp, ipst);
2433 	return (0);
2434 }
2435 
2436 /*
2437  * Add a single line to the NDP Cache Entry Report.
2438  */
2439 static void
2440 nce_report1(nce_t *nce, uchar_t *mp_arg)
2441 {
2442 	ill_t		*ill = nce->nce_ill;
2443 	char		local_buf[INET6_ADDRSTRLEN];
2444 	uchar_t		flags_buf[10];
2445 	uint32_t	flags = nce->nce_flags;
2446 	mblk_t		*mp = (mblk_t *)mp_arg;
2447 	uchar_t		*h;
2448 	uchar_t		*m = flags_buf;
2449 	in6_addr_t	v6addr;
2450 	uint64_t	now;
2451 
2452 	/*
2453 	 * Lock the nce to protect nce_res_mp from being changed
2454 	 * if an external resolver address resolution completes
2455 	 * while nce_res_mp is being accessed here.
2456 	 *
2457 	 * Deal with all address formats, not just Ethernet-specific
2458 	 * In addition, make sure that the mblk has enough space
2459 	 * before writing to it. If is doesn't, allocate a new one.
2460 	 */
2461 	if (nce->nce_ipversion == IPV4_VERSION) {
2462 		/*
2463 		 * Don't include v4 NCEs in NDP cache entry report.
2464 		 * But sanity check for lingering ND_INITIAL entries
2465 		 * when we do 'ndd -get /dev/ip ip_ndp_cache_report'
2466 		 */
2467 		if (nce->nce_state == ND_INITIAL) {
2468 
2469 			now = TICK_TO_MSEC(lbolt64);
2470 			if (now - nce->nce_init_time > NCE_STUCK_TIMEOUT) {
2471 				DTRACE_PROBE1(nce__stuck, nce_t *, nce);
2472 			}
2473 		}
2474 		return;
2475 	}
2476 
2477 	ASSERT(ill != NULL);
2478 	v6addr = nce->nce_mask;
2479 	if (flags & NCE_F_PERMANENT)
2480 		*m++ = 'P';
2481 	if (flags & NCE_F_ISROUTER)
2482 		*m++ = 'R';
2483 	if (flags & NCE_F_MAPPING)
2484 		*m++ = 'M';
2485 	*m = '\0';
2486 
2487 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
2488 		size_t		addrlen;
2489 		char		*addr_buf;
2490 		dl_unitdata_req_t	*dl;
2491 
2492 		mutex_enter(&nce->nce_lock);
2493 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2494 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
2495 		if (ill->ill_flags & ILLF_XRESOLV)
2496 			addrlen = (3 * (dl->dl_dest_addr_length));
2497 		else
2498 			addrlen = (3 * (ill->ill_nd_lla_len));
2499 		if (addrlen <= 0) {
2500 			mutex_exit(&nce->nce_lock);
2501 			(void) mi_mpprintf(mp,
2502 			    "%8s %9s %5s %s/%d",
2503 			    ill->ill_name,
2504 			    "None",
2505 			    (uchar_t *)&flags_buf,
2506 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2507 			    (char *)local_buf, sizeof (local_buf)),
2508 			    ip_mask_to_plen_v6(&v6addr));
2509 		} else {
2510 			/*
2511 			 * Convert the hardware/lla address to ascii
2512 			 */
2513 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
2514 			if (addr_buf == NULL) {
2515 				mutex_exit(&nce->nce_lock);
2516 				return;
2517 			}
2518 			(void) mac_colon_addr((uint8_t *)h,
2519 			    (ill->ill_flags & ILLF_XRESOLV) ?
2520 			    dl->dl_dest_addr_length : ill->ill_nd_lla_len,
2521 			    addr_buf, addrlen);
2522 			mutex_exit(&nce->nce_lock);
2523 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
2524 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
2525 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2526 			    (char *)local_buf, sizeof (local_buf)),
2527 			    ip_mask_to_plen_v6(&v6addr));
2528 			kmem_free(addr_buf, addrlen);
2529 		}
2530 	} else {
2531 		(void) mi_mpprintf(mp,
2532 		    "%8s %9s %5s %s/%d",
2533 		    ill->ill_name,
2534 		    "None",
2535 		    (uchar_t *)&flags_buf,
2536 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2537 		    (char *)local_buf, sizeof (local_buf)),
2538 		    ip_mask_to_plen_v6(&v6addr));
2539 	}
2540 }
2541 
2542 mblk_t *
2543 nce_udreq_alloc(ill_t *ill)
2544 {
2545 	mblk_t	*template_mp = NULL;
2546 	dl_unitdata_req_t *dlur;
2547 	int	sap_length;
2548 
2549 	ASSERT(ill->ill_isv6);
2550 
2551 	sap_length = ill->ill_sap_length;
2552 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2553 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2554 	if (template_mp == NULL)
2555 		return (NULL);
2556 
2557 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2558 	dlur->dl_priority.dl_min = 0;
2559 	dlur->dl_priority.dl_max = 0;
2560 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2561 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2562 
2563 	/* Copy in the SAP value. */
2564 	NCE_LL_SAP_COPY(ill, template_mp);
2565 
2566 	return (template_mp);
2567 }
2568 
2569 /*
2570  * NDP retransmit timer.
2571  * This timer goes off when:
2572  * a. It is time to retransmit NS for resolver.
2573  * b. It is time to send reachability probes.
2574  */
/*
 * arg is the nce_t whose timer fired.  The function takes its own
 * reference on the nce, clears nce_timeout_id and then acts on nce_state:
 *   ND_DELAY      - move to ND_PROBE and send a unicast solicitation.
 *   ND_PROBE      - count down nce_pcnt and, depending on the result,
 *                   resend, wait one more interval, delete the entry, or
 *                   (for an NCE_F_PERMANENT entry) mark the address
 *                   reachable and start advertising it.
 *   ND_INCOMPLETE - flag queued ip6i_t packets as ND-delayed and
 *                   re-solicit through nce_solicit().
 *   ND_REACHABLE  - send an advertisement to the all-hosts multicast
 *                   address when unsolicited advertisements are pending
 *                   or a permanent entry is being defended.
 * Every path releases ips_ill_g_lock, nce_lock and the reference taken
 * here before returning.
 */
2575 void
2576 ndp_timer(void *arg)
2577 {
2578 	nce_t		*nce = arg;
2579 	ill_t		*ill = nce->nce_ill;
2580 	uint32_t	ms;
2581 	char		addrbuf[INET6_ADDRSTRLEN];
2582 	mblk_t		*mp;
2583 	boolean_t	dropped = B_FALSE;
2584 	ip_stack_t	*ipst = ill->ill_ipst;
2585 
2586 	/*
2587 	 * The timer has to be cancelled by ndp_delete before doing the final
2588 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2589 	 * until it clears the timeout_id. Before clearing the timeout_id
2590 	 * bump up the refcnt so that we can continue to use the nce
2591 	 */
2592 	ASSERT(nce != NULL);
2593 
2594 	/*
2595 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2596 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2597 	 */
2598 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2599 	mutex_enter(&nce->nce_lock);
2600 	NCE_REFHOLD_LOCKED(nce);
2601 	nce->nce_timeout_id = 0;
2602 
2603 	/*
2604 	 * Check the reachability state first.
2605 	 */
2606 	switch (nce->nce_state) {
2607 	case ND_DELAY:
		/*
		 * The delay period is over: switch to ND_PROBE and send a
		 * unicast solicitation.  The sender is passed as unspecified
		 * so nce_xmit() selects the source address itself.
		 */
2608 		rw_exit(&ipst->ips_ill_g_lock);
2609 		nce->nce_state = ND_PROBE;
2610 		mutex_exit(&nce->nce_lock);
2611 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2612 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2613 		if (ip_debug > 3) {
2614 			/* ip2dbg */
2615 			pr_addr_dbg("ndp_timer: state for %s changed "
2616 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2617 		}
2618 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2619 		NCE_REFRELE(nce);
2620 		return;
2621 	case ND_PROBE:
2622 		/* must be retransmit timer */
2623 		rw_exit(&ipst->ips_ill_g_lock);
		/* One probe interval has elapsed; nce_lock is still held. */
2624 		nce->nce_pcnt--;
2625 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2626 		    nce->nce_pcnt >= -1);
2627 		if (nce->nce_pcnt > 0) {
2628 			/*
2629 			 * As per RFC2461, the nce gets deleted after
2630 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2631 			 * Note that the first unicast solicitation is sent
2632 			 * during the DELAY state.
2633 			 */
2634 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2635 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2636 			    addrbuf, sizeof (addrbuf))));
2637 			mutex_exit(&nce->nce_lock);
			/*
			 * Permanent (local) entries are sent with NDP_PROBE,
			 * everything else as a unicast solicitation.
			 */
2638 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2639 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2640 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2641 			    NDP_UNICAST);
			/* Nothing was sent: give the attempt back. */
2642 			if (dropped) {
2643 				mutex_enter(&nce->nce_lock);
2644 				nce->nce_pcnt++;
2645 				mutex_exit(&nce->nce_lock);
2646 			}
2647 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2648 		} else if (nce->nce_pcnt < 0) {
2649 			/* No hope, delete the nce */
2650 			nce->nce_state = ND_UNREACHABLE;
2651 			mutex_exit(&nce->nce_lock);
2652 			if (ip_debug > 2) {
2653 				/* ip1dbg */
2654 				pr_addr_dbg("ndp_timer: Delete IRE for"
2655 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2656 			}
2657 			ndp_delete(nce);
2658 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
			/* nce_pcnt is exactly 0 for a non-local entry. */
2659 			/* Wait RetransTimer, before deleting the entry */
2660 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2661 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2662 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2663 			mutex_exit(&nce->nce_lock);
2664 			/* Wait one interval before killing */
2665 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2666 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2667 			ipif_t *ipif;
2668 
2669 			/*
2670 			 * We're done probing, and we can now declare this
2671 			 * address to be usable.  Let IP know that it's ok to
2672 			 * use.
2673 			 */
2674 			nce->nce_state = ND_REACHABLE;
2675 			mutex_exit(&nce->nce_lock);
2676 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2677 			    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
2678 			if (ipif != NULL) {
				/*
				 * Log the recovery if the address had been
				 * marked as a duplicate earlier.
				 */
2679 				if (ipif->ipif_was_dup) {
2680 					char ibuf[LIFNAMSIZ + 10];
2681 					char sbuf[INET6_ADDRSTRLEN];
2682 
2683 					ipif->ipif_was_dup = B_FALSE;
2684 					(void) inet_ntop(AF_INET6,
2685 					    &ipif->ipif_v6lcl_addr,
2686 					    sbuf, sizeof (sbuf));
2687 					ipif_get_name(ipif, ibuf,
2688 					    sizeof (ibuf));
2689 					cmn_err(CE_NOTE, "recovered address "
2690 					    "%s on %s", sbuf, ibuf);
2691 				}
				/*
				 * Send the notifications only on the first
				 * transition to "address ready".
				 */
2692 				if ((ipif->ipif_flags & IPIF_UP) &&
2693 				    !ipif->ipif_addr_ready) {
2694 					ip_rts_ifmsg(ipif);
2695 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2696 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2697 				}
2698 				ipif->ipif_addr_ready = 1;
2699 				ipif_refrele(ipif);
2700 			}
2701 			/* Begin defending our new address */
2702 			nce->nce_unsolicit_count = 0;
2703 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2704 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2705 			    nce_advert_flags(nce));
			/*
			 * If the advertisement was dropped, leave one
			 * unsolicited advertisement pending and retry after
			 * the unsolicit interval; otherwise re-arm for the
			 * defense interval if one is configured.
			 */
2706 			if (dropped) {
2707 				nce->nce_unsolicit_count = 1;
2708 				NDP_RESTART_TIMER(nce,
2709 				    ipst->ips_ip_ndp_unsolicit_interval);
2710 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2711 				NDP_RESTART_TIMER(nce,
2712 				    ipst->ips_ip_ndp_defense_interval);
2713 			}
2714 		} else {
2715 			/*
2716 			 * This is an address we're probing to be our own, but
2717 			 * the ill is down.  Wait until it comes back before
2718 			 * doing anything, but switch to reachable state so
2719 			 * that the restart will work.
2720 			 */
2721 			nce->nce_state = ND_REACHABLE;
2722 			mutex_exit(&nce->nce_lock);
2723 		}
2724 		NCE_REFRELE(nce);
2725 		return;
2726 	case ND_INCOMPLETE:
2727 		/*
2728 		 * Must be resolvers retransmit timer.
2729 		 */
		/* Both ips_ill_g_lock and nce_lock are still held here. */
2730 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2731 			ip6i_t	*ip6i;
2732 			ip6_t	*ip6h;
2733 			mblk_t *data_mp;
2734 
2735 			/*
2736 			 * Walk the list of packets queued, and see if there
2737 			 * are any multipathing probe packets. Such packets
2738 			 * are always queued at the head. Since this is a
2739 			 * retransmit timer firing, mark such packets as
2740 			 * delayed in ND resolution. This info will be used
2741 			 * in ip_wput_v6(). Multipathing probe packets will
2742 			 * always have an ip6i_t. Once we hit a packet without
2743 			 * it, we can break out of this loop.
2744 			 */
			/* Skip a leading M_CTL block to reach the data. */
2745 			if (mp->b_datap->db_type == M_CTL)
2746 				data_mp = mp->b_cont;
2747 			else
2748 				data_mp = mp;
2749 
			/* A next-header of IPPROTO_RAW marks an ip6i_t. */
2750 			ip6h = (ip6_t *)data_mp->b_rptr;
2751 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2752 				break;
2753 
2754 			/*
2755 			 * This message should have been pulled up already in
2756 			 * ip_wput_v6. We can't do pullups here because the
2757 			 * b_next/b_prev is non-NULL.
2758 			 */
2759 			ip6i = (ip6i_t *)ip6h;
2760 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2761 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2762 
2763 			/* Mark this packet as delayed due to ND resolution */
2764 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2765 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2766 		}
2767 		if (nce->nce_qd_mp != NULL) {
			/*
			 * Packets are still waiting on resolution.  A zero
			 * return from nce_solicit() is treated as "give up"
			 * unless the entry became reachable meanwhile;
			 * otherwise the value is the next timer interval.
			 */
2768 			ms = nce_solicit(nce, NULL);
2769 			rw_exit(&ipst->ips_ill_g_lock);
2770 			if (ms == 0) {
2771 				if (nce->nce_state != ND_REACHABLE) {
2772 					mutex_exit(&nce->nce_lock);
2773 					nce_resolv_failed(nce);
2774 					ndp_delete(nce);
2775 				} else {
2776 					mutex_exit(&nce->nce_lock);
2777 				}
2778 			} else {
2779 				mutex_exit(&nce->nce_lock);
2780 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2781 			}
2782 			NCE_REFRELE(nce);
2783 			return;
2784 		}
		/* Nothing queued: just release everything. */
2785 		mutex_exit(&nce->nce_lock);
2786 		rw_exit(&ipst->ips_ill_g_lock);
2787 		NCE_REFRELE(nce);
2788 		break;
2789 	case ND_REACHABLE :
2790 		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * Advertise if unsolicited advertisements remain to be sent
		 * for this entry, or if it is a permanent entry and a
		 * defense interval is configured.
		 */
2791 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2792 		    nce->nce_unsolicit_count != 0) ||
2793 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2794 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2795 			if (nce->nce_unsolicit_count > 0)
2796 				nce->nce_unsolicit_count--;
2797 			mutex_exit(&nce->nce_lock);
2798 			dropped = nce_xmit(ill,
2799 			    ND_NEIGHBOR_ADVERT,
2800 			    ill,	/* ill to be used for hw addr */
2801 			    B_FALSE,	/* use ill_phys_addr */
2802 			    &nce->nce_addr,
2803 			    &ipv6_all_hosts_mcast,
2804 			    nce_advert_flags(nce));
			/* Nothing was sent: restore the pending count. */
2805 			if (dropped) {
2806 				mutex_enter(&nce->nce_lock);
2807 				nce->nce_unsolicit_count++;
2808 				mutex_exit(&nce->nce_lock);
2809 			}
			/*
			 * Use the unsolicit interval while advertisements are
			 * still pending, the defense interval afterwards.
			 */
2810 			if (nce->nce_unsolicit_count != 0) {
2811 				NDP_RESTART_TIMER(nce,
2812 				    ipst->ips_ip_ndp_unsolicit_interval);
2813 			} else {
2814 				NDP_RESTART_TIMER(nce,
2815 				    ipst->ips_ip_ndp_defense_interval);
2816 			}
2817 		} else {
2818 			mutex_exit(&nce->nce_lock);
2819 		}
2820 		NCE_REFRELE(nce);
2821 		break;
2822 	default:
		/* No timer work for any other state. */
2823 		rw_exit(&ipst->ips_ill_g_lock);
2824 		mutex_exit(&nce->nce_lock);
2825 		NCE_REFRELE(nce);
2826 		break;
2827 	}
2828 }
2829 
2830 /*
2831  * Set a link layer address from the ll_addr passed in.
2832  * Copy SAP from ill.
2833  */
2834 static void
2835 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2836 {
2837 	ill_t	*ill = nce->nce_ill;
2838 	uchar_t	*woffset;
2839 
2840 	ASSERT(ll_addr != NULL);
2841 	/* Always called before fast_path_probe */
2842 	ASSERT(nce->nce_fp_mp == NULL);
2843 	if (ill->ill_sap_length != 0) {
2844 		/*
2845 		 * Copy the SAP type specified in the
2846 		 * request into the xmit template.
2847 		 */
2848 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2849 	}
2850 	if (ill->ill_phys_addr_length > 0) {
2851 		/*
2852 		 * The bcopy() below used to be called for the physical address
2853 		 * length rather than the link layer address length. For
2854 		 * ethernet and many other media, the phys_addr and lla are
2855 		 * identical.
2856 		 * However, with xresolv interfaces being introduced, the
2857 		 * phys_addr and lla are no longer the same, and the physical
2858 		 * address may not have any useful meaning, so we use the lla
2859 		 * for IPv6 address resolution and destination addressing.
2860 		 *
2861 		 * For PPP or other interfaces with a zero length
2862 		 * physical address, don't do anything here.
2863 		 * The bcopy() with a zero phys_addr length was previously
2864 		 * a no-op for interfaces with a zero-length physical address.
2865 		 * Using the lla for them would change the way they operate.
2866 		 * Doing nothing in such cases preserves expected behavior.
2867 		 */
2868 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2869 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2870 	}
2871 }
2872 
2873 static boolean_t
2874 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2875 {
2876 	ill_t	*ill = nce->nce_ill;
2877 	uchar_t	*ll_offset;
2878 
2879 	ASSERT(nce->nce_res_mp != NULL);
2880 	if (ll_addr == NULL)
2881 		return (B_FALSE);
2882 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2883 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2884 		return (B_TRUE);
2885 	return (B_FALSE);
2886 }
2887 
2888 /*
2889  * Updates the link layer address or the reachability state of
2890  * a cache entry.  Reset probe counter if needed.
2891  */
2892 static void
2893 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2894 {
2895 	ill_t	*ill = nce->nce_ill;
2896 	boolean_t need_stop_timer = B_FALSE;
2897 	boolean_t need_fastpath_update = B_FALSE;
2898 
2899 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2900 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2901 	/*
2902 	 * If this interface does not do NUD, there is no point
2903 	 * in allowing an update to the cache entry.  Although
2904 	 * we will respond to NS.
2905 	 * The only time we accept an update for a resolver when
2906 	 * NUD is turned off is when it has just been created.
2907 	 * Non-Resolvers will always be created as REACHABLE.
2908 	 */
2909 	if (new_state != ND_UNCHANGED) {
2910 		if ((nce->nce_flags & NCE_F_NONUD) &&
2911 		    (nce->nce_state != ND_INCOMPLETE))
2912 			return;
2913 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2914 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2915 		need_stop_timer = B_TRUE;
2916 		if (new_state == ND_REACHABLE)
2917 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2918 		else {
2919 			/* We force NUD in this case */
2920 			nce->nce_last = 0;
2921 		}
2922 		nce->nce_state = new_state;
2923 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2924 	}
2925 	/*
2926 	 * In case of fast path we need to free the the fastpath
2927 	 * M_DATA and do another probe.  Otherwise we can just
2928 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2929 	 * whatever packets that happens to be transmitting at the time.
2930 	 */
2931 	if (new_ll_addr != NULL) {
2932 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2933 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2934 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2935 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2936 		if (nce->nce_fp_mp != NULL) {
2937 			freemsg(nce->nce_fp_mp);
2938 			nce->nce_fp_mp = NULL;
2939 		}
2940 		need_fastpath_update = B_TRUE;
2941 	}
2942 	mutex_exit(&nce->nce_lock);
2943 	if (need_stop_timer) {
2944 		(void) untimeout(nce->nce_timeout_id);
2945 		nce->nce_timeout_id = 0;
2946 	}
2947 	if (need_fastpath_update)
2948 		nce_fastpath(nce);
2949 	mutex_enter(&nce->nce_lock);
2950 }
2951 
2952 void
2953 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2954 {
2955 	uint_t	count = 0;
2956 	mblk_t  **mpp;
2957 
2958 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2959 
2960 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2961 	    mpp = &(*mpp)->b_next) {
2962 		if (++count >
2963 		    nce->nce_ill->ill_max_buf) {
2964 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2965 
2966 			nce->nce_qd_mp->b_next = NULL;
2967 			nce->nce_qd_mp->b_prev = NULL;
2968 			freemsg(nce->nce_qd_mp);
2969 			nce->nce_qd_mp = tmp;
2970 		}
2971 	}
2972 	/* put this on the list */
2973 	if (head_insert) {
2974 		mp->b_next = nce->nce_qd_mp;
2975 		nce->nce_qd_mp = mp;
2976 	} else {
2977 		*mpp = mp;
2978 	}
2979 }
2980 
2981 static void
2982 nce_queue_mp(nce_t *nce, mblk_t *mp)
2983 {
2984 	boolean_t head_insert = B_FALSE;
2985 	ip6_t	*ip6h;
2986 	ip6i_t	*ip6i;
2987 	mblk_t *data_mp;
2988 
2989 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2990 
2991 	if (mp->b_datap->db_type == M_CTL)
2992 		data_mp = mp->b_cont;
2993 	else
2994 		data_mp = mp;
2995 	ip6h = (ip6_t *)data_mp->b_rptr;
2996 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2997 		/*
2998 		 * This message should have been pulled up already in
2999 		 * ip_wput_v6. We can't do pullups here because the message
3000 		 * could be from the nce_qd_mp which could have b_next/b_prev
3001 		 * non-NULL.
3002 		 */
3003 		ip6i = (ip6i_t *)ip6h;
3004 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
3005 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
3006 		/*
3007 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
3008 		 * This has 2 aspects mentioned below.
3009 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
3010 		 * This ensures that next retransmit of ND solicitation
3011 		 * will use the interface specified by the probe packet,
3012 		 * for both NS and NA. This corresponds to the src address
3013 		 * in the IPv6 packet. If we insert at tail, we will be
3014 		 * depending on the packet at the head for successful
3015 		 * ND resolution. This is not reliable, because the interface
3016 		 * on which the NA arrives could be different from the interface
3017 		 * on which the NS was sent, and if the receiving interface is
3018 		 * failed, it will appear that the sending interface is also
3019 		 * failed, causing in.mpathd to misdiagnose this as link
3020 		 * failure.
3021 		 * 2. Drop the original packet, if the ND resolution did not
3022 		 * succeed in the first attempt. However we will create the
3023 		 * nce and the ire, as soon as the ND resolution succeeds.
3024 		 * We don't gain anything by queueing multiple probe packets
3025 		 * and sending them back-to-back once resolution succeeds.
3026 		 * It is sufficient to send just 1 packet after ND resolution
3027 		 * succeeds. Since mpathd is sending down probe packets at a
3028 		 * constant rate, we don't need to send the queued packet. We
3029 		 * need to queue it only for NDP resolution. The benefit of
3030 		 * dropping the probe packets that were delayed in ND
3031 		 * resolution, is that in.mpathd will not see inflated
3032 		 * RTT. If the ND resolution does not succeed within
3033 		 * in.mpathd's failure detection time, mpathd may detect
3034 		 * a failure, and it does not matter whether the packet
3035 		 * was queued or dropped.
3036 		 */
3037 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
3038 			head_insert = B_TRUE;
3039 	}
3040 
3041 	nce_queue_mp_common(nce, mp, head_insert);
3042 }
3043 
3044 /*
3045  * Called when address resolution failed due to a timeout.
3046  * Send an ICMP unreachable in response to all queued packets.
3047  */
3048 void
3049 nce_resolv_failed(nce_t *nce)
3050 {
3051 	mblk_t	*mp, *nxt_mp, *first_mp;
3052 	char	buf[INET6_ADDRSTRLEN];
3053 	ip6_t *ip6h;
3054 	zoneid_t zoneid = GLOBAL_ZONEID;
3055 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
3056 
3057 	ip1dbg(("nce_resolv_failed: dst %s\n",
3058 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3059 	mutex_enter(&nce->nce_lock);
3060 	mp = nce->nce_qd_mp;
3061 	nce->nce_qd_mp = NULL;
3062 	mutex_exit(&nce->nce_lock);
3063 	while (mp != NULL) {
3064 		nxt_mp = mp->b_next;
3065 		mp->b_next = NULL;
3066 		mp->b_prev = NULL;
3067 
3068 		first_mp = mp;
3069 		if (mp->b_datap->db_type == M_CTL) {
3070 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3071 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3072 			zoneid = io->ipsec_out_zoneid;
3073 			ASSERT(zoneid != ALL_ZONES);
3074 			mp = mp->b_cont;
3075 		}
3076 
3077 		ip6h = (ip6_t *)mp->b_rptr;
3078 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3079 			ip6i_t *ip6i;
3080 			/*
3081 			 * This message should have been pulled up already
3082 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3083 			 * the header is pulled up.
3084 			 */
3085 			ip6i = (ip6i_t *)ip6h;
3086 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3087 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3088 			mp->b_rptr += sizeof (ip6i_t);
3089 		}
3090 		/*
3091 		 * Ignore failure since icmp_unreachable_v6 will silently
3092 		 * drop packets with an unspecified source address.
3093 		 */
3094 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
3095 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3096 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
3097 		mp = nxt_mp;
3098 	}
3099 }
3100 
3101 /*
3102  * Called by SIOCSNDP* ioctl to add/change an nce entry
3103  * and the corresponding attributes.
3104  * Disallow states other than ND_REACHABLE or ND_STALE.
3105  */
3106 int
3107 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3108 {
3109 	sin6_t		*sin6;
3110 	in6_addr_t	*addr;
3111 	nce_t		*nce;
3112 	int		err;
3113 	uint16_t	new_flags = 0;
3114 	uint16_t	old_flags = 0;
3115 	int		inflags = lnr->lnr_flags;
3116 	ip_stack_t	*ipst = ill->ill_ipst;
3117 
3118 	ASSERT(ill->ill_isv6);
3119 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3120 	    (lnr->lnr_state_create != ND_STALE))
3121 		return (EINVAL);
3122 
3123 	sin6 = (sin6_t *)&lnr->lnr_addr;
3124 	addr = &sin6->sin6_addr;
3125 
3126 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3127 	/* We know it can not be mapping so just look in the hash table */
3128 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
3129 	nce = nce_lookup_addr(ill, addr, nce);
3130 	if (nce != NULL)
3131 		new_flags = nce->nce_flags;
3132 
3133 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3134 	case NDF_ISROUTER_ON:
3135 		new_flags |= NCE_F_ISROUTER;
3136 		break;
3137 	case NDF_ISROUTER_OFF:
3138 		new_flags &= ~NCE_F_ISROUTER;
3139 		break;
3140 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3141 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3142 		if (nce != NULL)
3143 			NCE_REFRELE(nce);
3144 		return (EINVAL);
3145 	}
3146 
3147 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3148 	case NDF_ANYCAST_ON:
3149 		new_flags |= NCE_F_ANYCAST;
3150 		break;
3151 	case NDF_ANYCAST_OFF:
3152 		new_flags &= ~NCE_F_ANYCAST;
3153 		break;
3154 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3155 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3156 		if (nce != NULL)
3157 			NCE_REFRELE(nce);
3158 		return (EINVAL);
3159 	}
3160 
3161 	if (nce == NULL) {
3162 		err = ndp_add_v6(ill,
3163 		    (uchar_t *)lnr->lnr_hdw_addr,
3164 		    addr,
3165 		    &ipv6_all_ones,
3166 		    &ipv6_all_zeros,
3167 		    0,
3168 		    new_flags,
3169 		    lnr->lnr_state_create,
3170 		    &nce);
3171 		if (err != 0) {
3172 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3173 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3174 			return (err);
3175 		}
3176 	}
3177 	old_flags = nce->nce_flags;
3178 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3179 		/*
3180 		 * Router turned to host, delete all ires.
3181 		 * XXX Just delete the entry, but we need to add too.
3182 		 */
3183 		nce->nce_flags &= ~NCE_F_ISROUTER;
3184 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3185 		ndp_delete(nce);
3186 		NCE_REFRELE(nce);
3187 		return (0);
3188 	}
3189 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3190 
3191 	mutex_enter(&nce->nce_lock);
3192 	nce->nce_flags = new_flags;
3193 	mutex_exit(&nce->nce_lock);
3194 	/*
3195 	 * Note that we ignore the state at this point, which
3196 	 * should be either STALE or REACHABLE.  Instead we let
3197 	 * the link layer address passed in to determine the state
3198 	 * much like incoming packets.
3199 	 */
3200 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3201 	NCE_REFRELE(nce);
3202 	return (0);
3203 }
3204 
3205 /*
3206  * If the device driver supports it, we make nce_fp_mp to have
3207  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3208  * The caller ensures there is hold on nce for this function.
3209  * Note that since ill_fastpath_probe() copies the mblk there is
3210  * no need for the hold beyond this function.
3211  */
3212 void
3213 nce_fastpath(nce_t *nce)
3214 {
3215 	ill_t	*ill = nce->nce_ill;
3216 	int res;
3217 
3218 	ASSERT(ill != NULL);
3219 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3220 
3221 	if (nce->nce_fp_mp != NULL) {
3222 		/* Already contains fastpath info */
3223 		return;
3224 	}
3225 	if (nce->nce_res_mp != NULL) {
3226 		nce_fastpath_list_add(nce);
3227 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3228 		/*
3229 		 * EAGAIN is an indication of a transient error
3230 		 * i.e. allocation failure etc. leave the nce in the list it
3231 		 * will be updated when another probe happens for another ire
3232 		 * if not it will be taken out of the list when the ire is
3233 		 * deleted.
3234 		 */
3235 
3236 		if (res != 0 && res != EAGAIN)
3237 			nce_fastpath_list_delete(nce);
3238 	}
3239 }
3240 
3241 /*
3242  * Drain the list of nce's waiting for fastpath response.
3243  */
3244 void
3245 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3246     void *arg)
3247 {
3248 
3249 	nce_t *next_nce;
3250 	nce_t *current_nce;
3251 	nce_t *first_nce;
3252 	nce_t *prev_nce = NULL;
3253 
3254 	mutex_enter(&ill->ill_lock);
3255 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3256 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3257 		next_nce = current_nce->nce_fastpath;
3258 		/*
3259 		 * Take it off the list if we're flushing, or if the callback
3260 		 * routine tells us to do so.  Otherwise, leave the nce in the
3261 		 * fastpath list to handle any pending response from the lower
3262 		 * layer.  We can't drain the list when the callback routine
3263 		 * comparison failed, because the response is asynchronous in
3264 		 * nature, and may not arrive in the same order as the list
3265 		 * insertion.
3266 		 */
3267 		if (func == NULL || func(current_nce, arg)) {
3268 			current_nce->nce_fastpath = NULL;
3269 			if (current_nce == first_nce)
3270 				ill->ill_fastpath_list = first_nce = next_nce;
3271 			else
3272 				prev_nce->nce_fastpath = next_nce;
3273 		} else {
3274 			/* previous element that is still in the list */
3275 			prev_nce = current_nce;
3276 		}
3277 		current_nce = next_nce;
3278 	}
3279 	mutex_exit(&ill->ill_lock);
3280 }
3281 
3282 /*
3283  * Add nce to the nce fastpath list.
3284  */
3285 void
3286 nce_fastpath_list_add(nce_t *nce)
3287 {
3288 	ill_t *ill;
3289 
3290 	ill = nce->nce_ill;
3291 
3292 	mutex_enter(&ill->ill_lock);
3293 	mutex_enter(&nce->nce_lock);
3294 
3295 	/*
3296 	 * if nce has not been deleted and
3297 	 * is not already in the list add it.
3298 	 */
3299 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3300 	    (nce->nce_fastpath == NULL)) {
3301 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3302 		ill->ill_fastpath_list = nce;
3303 	}
3304 
3305 	mutex_exit(&nce->nce_lock);
3306 	mutex_exit(&ill->ill_lock);
3307 }
3308 
3309 /*
3310  * remove nce from the nce fastpath list.
3311  */
3312 void
3313 nce_fastpath_list_delete(nce_t *nce)
3314 {
3315 	nce_t *nce_ptr;
3316 
3317 	ill_t *ill;
3318 
3319 	ill = nce->nce_ill;
3320 	ASSERT(ill != NULL);
3321 
3322 	mutex_enter(&ill->ill_lock);
3323 	if (nce->nce_fastpath == NULL)
3324 		goto done;
3325 
3326 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3327 
3328 	if (ill->ill_fastpath_list == nce) {
3329 		ill->ill_fastpath_list = nce->nce_fastpath;
3330 	} else {
3331 		nce_ptr = ill->ill_fastpath_list;
3332 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3333 			if (nce_ptr->nce_fastpath == nce) {
3334 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3335 				break;
3336 			}
3337 			nce_ptr = nce_ptr->nce_fastpath;
3338 		}
3339 	}
3340 
3341 	nce->nce_fastpath = NULL;
3342 done:
3343 	mutex_exit(&ill->ill_lock);
3344 }
3345 
3346 /*
3347  * Update all NCE's that are not in fastpath mode and
3348  * have an nce_fp_mp that matches mp. mp->b_cont contains
3349  * the fastpath header.
3350  *
3351  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3352  */
3353 boolean_t
3354 ndp_fastpath_update(nce_t *nce, void *arg)
3355 {
3356 	mblk_t 	*mp, *fp_mp;
3357 	uchar_t	*mp_rptr, *ud_mp_rptr;
3358 	mblk_t	*ud_mp = nce->nce_res_mp;
3359 	ptrdiff_t	cmplen;
3360 
3361 	if (nce->nce_flags & NCE_F_MAPPING)
3362 		return (B_TRUE);
3363 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3364 		return (B_TRUE);
3365 
3366 	ip2dbg(("ndp_fastpath_update: trying\n"));
3367 	mp = (mblk_t *)arg;
3368 	mp_rptr = mp->b_rptr;
3369 	cmplen = mp->b_wptr - mp_rptr;
3370 	ASSERT(cmplen >= 0);
3371 	ud_mp_rptr = ud_mp->b_rptr;
3372 	/*
3373 	 * The nce is locked here to prevent any other threads
3374 	 * from accessing and changing nce_res_mp when the IPv6 address
3375 	 * becomes resolved to an lla while we're in the middle
3376 	 * of looking at and comparing the hardware address (lla).
3377 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3378 	 * from examining nce_res_mp atthe same time.
3379 	 */
3380 	mutex_enter(&nce->nce_lock);
3381 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3382 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3383 		mutex_exit(&nce->nce_lock);
3384 		/*
3385 		 * Don't take the ire off the fastpath list yet,
3386 		 * since the response may come later.
3387 		 */
3388 		return (B_FALSE);
3389 	}
3390 	/* Matched - install mp as the fastpath mp */
3391 	ip1dbg(("ndp_fastpath_update: match\n"));
3392 	fp_mp = dupb(mp->b_cont);
3393 	if (fp_mp != NULL) {
3394 		nce->nce_fp_mp = fp_mp;
3395 	}
3396 	mutex_exit(&nce->nce_lock);
3397 	return (B_TRUE);
3398 }
3399 
3400 /*
3401  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3402  * driver.  Note that it assumes IP is exclusive...
3403  */
3404 /* ARGSUSED */
3405 void
3406 ndp_fastpath_flush(nce_t *nce, char *arg)
3407 {
3408 	if (nce->nce_flags & NCE_F_MAPPING)
3409 		return;
3410 	/* No fastpath info? */
3411 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3412 		return;
3413 
3414 	if (nce->nce_ipversion == IPV4_VERSION &&
3415 	    nce->nce_flags & NCE_F_BCAST) {
3416 		/*
3417 		 * IPv4 BROADCAST entries:
3418 		 * We can't delete the nce since it is difficult to
3419 		 * recreate these without going through the
3420 		 * ipif down/up dance.
3421 		 *
3422 		 * All access to nce->nce_fp_mp in the case of these
3423 		 * is protected by nce_lock.
3424 		 */
3425 		mutex_enter(&nce->nce_lock);
3426 		if (nce->nce_fp_mp != NULL) {
3427 			freeb(nce->nce_fp_mp);
3428 			nce->nce_fp_mp = NULL;
3429 			mutex_exit(&nce->nce_lock);
3430 			nce_fastpath(nce);
3431 		} else {
3432 			mutex_exit(&nce->nce_lock);
3433 		}
3434 	} else {
3435 		/* Just delete the NCE... */
3436 		ndp_delete(nce);
3437 	}
3438 }
3439 
3440 /*
3441  * Return a pointer to a given option in the packet.
3442  * Assumes that option part of the packet have already been validated.
3443  */
3444 nd_opt_hdr_t *
3445 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3446 {
3447 	while (optlen > 0) {
3448 		if (opt->nd_opt_type == opt_type)
3449 			return (opt);
3450 		optlen -= 8 * opt->nd_opt_len;
3451 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3452 	}
3453 	return (NULL);
3454 }
3455 
3456 /*
3457  * Verify all option lengths present are > 0, also check to see
3458  * if the option lengths and packet length are consistent.
3459  */
3460 boolean_t
3461 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3462 {
3463 	ASSERT(opt != NULL);
3464 	while (optlen > 0) {
3465 		if (opt->nd_opt_len == 0)
3466 			return (B_FALSE);
3467 		optlen -= 8 * opt->nd_opt_len;
3468 		if (optlen < 0)
3469 			return (B_FALSE);
3470 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3471 	}
3472 	return (B_TRUE);
3473 }
3474 
3475 /*
3476  * ndp_walk function.
3477  * Free a fraction of the NCE cache entries.
3478  * A fraction of zero means to not free any in that category.
3479  */
3480 void
3481 ndp_cache_reclaim(nce_t *nce, char *arg)
3482 {
3483 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3484 	uint_t	rand;
3485 
3486 	if (nce->nce_flags & NCE_F_PERMANENT)
3487 		return;
3488 
3489 	rand = (uint_t)lbolt +
3490 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3491 	if (ncr->ncr_host != 0 &&
3492 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3493 		ndp_delete(nce);
3494 		return;
3495 	}
3496 }
3497 
3498 /*
3499  * ndp_walk function.
3500  * Count the number of NCEs that can be deleted.
3501  * These would be hosts but not routers.
3502  */
3503 void
3504 ndp_cache_count(nce_t *nce, char *arg)
3505 {
3506 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3507 
3508 	if (nce->nce_flags & NCE_F_PERMANENT)
3509 		return;
3510 
3511 	ncc->ncc_total++;
3512 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3513 		ncc->ncc_host++;
3514 }
3515 
3516 #ifdef DEBUG
3517 void
3518 nce_trace_ref(nce_t *nce)
3519 {
3520 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3521 
3522 	if (nce->nce_trace_disable)
3523 		return;
3524 
3525 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3526 		nce->nce_trace_disable = B_TRUE;
3527 		nce_trace_cleanup(nce);
3528 	}
3529 }
3530 
3531 void
3532 nce_untrace_ref(nce_t *nce)
3533 {
3534 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3535 
3536 	if (!nce->nce_trace_disable)
3537 		th_trace_unref(nce);
3538 }
3539 
3540 static void
3541 nce_trace_cleanup(const nce_t *nce)
3542 {
3543 	th_trace_cleanup(nce, nce->nce_trace_disable);
3544 }
3545 #endif
3546 
3547 /*
3548  * Called when address resolution fails due to a timeout.
3549  * Send an ICMP unreachable in response to all queued packets.
3550  */
3551 void
3552 arp_resolv_failed(nce_t *nce)
3553 {
3554 	mblk_t	*mp, *nxt_mp, *first_mp;
3555 	char	buf[INET6_ADDRSTRLEN];
3556 	zoneid_t zoneid = GLOBAL_ZONEID;
3557 	struct in_addr ipv4addr;
3558 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3559 
3560 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3561 	ip3dbg(("arp_resolv_failed: dst %s\n",
3562 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3563 	mutex_enter(&nce->nce_lock);
3564 	mp = nce->nce_qd_mp;
3565 	nce->nce_qd_mp = NULL;
3566 	mutex_exit(&nce->nce_lock);
3567 
3568 	while (mp != NULL) {
3569 		nxt_mp = mp->b_next;
3570 		mp->b_next = NULL;
3571 		mp->b_prev = NULL;
3572 
3573 		first_mp = mp;
3574 		/*
3575 		 * Send icmp unreachable messages
3576 		 * to the hosts.
3577 		 */
3578 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3579 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3580 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3581 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3582 		mp = nxt_mp;
3583 	}
3584 }
3585 
3586 int
3587 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3588     nce_t **newnce, nce_t *src_nce)
3589 {
3590 	int	err;
3591 	nce_t	*nce;
3592 	in6_addr_t addr6;
3593 	ip_stack_t *ipst = ill->ill_ipst;
3594 
3595 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3596 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3597 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3598 	nce = nce_lookup_addr(ill, &addr6, nce);
3599 	if (nce == NULL) {
3600 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3601 	} else {
3602 		*newnce = nce;
3603 		err = EEXIST;
3604 	}
3605 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3606 	return (err);
3607 }
3608 
3609 /*
3610  * NDP Cache Entry creation routine for IPv4.
3611  * Mapped entries are handled in arp.
3612  * This routine must always be called with ndp4->ndp_g_lock held.
3613  * Prior to return, nce_refcnt is incremented.
3614  */
3615 static int
3616 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3617     nce_t **newnce, nce_t *src_nce)
3618 {
3619 	static	nce_t		nce_nil;
3620 	nce_t		*nce;
3621 	mblk_t		*mp;
3622 	mblk_t		*template = NULL;
3623 	nce_t		**ncep;
3624 	ip_stack_t	*ipst = ill->ill_ipst;
3625 	uint16_t	state = ND_INITIAL;
3626 	int		err;
3627 
3628 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3629 	ASSERT(!ill->ill_isv6);
3630 	ASSERT((flags & NCE_F_MAPPING) == 0);
3631 
3632 	if (ill->ill_resolver_mp == NULL)
3633 		return (EINVAL);
3634 	/*
3635 	 * Allocate the mblk to hold the nce.
3636 	 */
3637 	mp = allocb(sizeof (nce_t), BPRI_MED);
3638 	if (mp == NULL)
3639 		return (ENOMEM);
3640 
3641 	nce = (nce_t *)mp->b_rptr;
3642 	mp->b_wptr = (uchar_t *)&nce[1];
3643 	*nce = nce_nil;
3644 	nce->nce_ill = ill;
3645 	nce->nce_ipversion = IPV4_VERSION;
3646 	nce->nce_flags = flags;
3647 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3648 	nce->nce_rcnt = ill->ill_xmit_count;
3649 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3650 	nce->nce_mask = ipv6_all_ones;
3651 	nce->nce_extract_mask = ipv6_all_zeros;
3652 	nce->nce_ll_extract_start = 0;
3653 	nce->nce_qd_mp = NULL;
3654 	nce->nce_mp = mp;
3655 	/* This one is for nce getting created */
3656 	nce->nce_refcnt = 1;
3657 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3658 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3659 
3660 	nce->nce_trace_disable = B_FALSE;
3661 
3662 	if (src_nce != NULL) {
3663 		/*
3664 		 * src_nce has been provided by the caller. The only
3665 		 * caller who provides a non-null, non-broadcast
3666 		 * src_nce is from ip_newroute() which must pass in
3667 		 * a ND_REACHABLE src_nce (this condition is verified
3668 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3669 		 */
3670 		mutex_enter(&src_nce->nce_lock);
3671 		state = src_nce->nce_state;
3672 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3673 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3674 			/*
3675 			 * src_nce has been deleted, or
3676 			 * ip_arp_news is in the middle of
3677 			 * flushing entries in the the nce.
3678 			 * Fail the add, since we don't know
3679 			 * if it is safe to copy the contents of
3680 			 * src_nce
3681 			 */
3682 			DTRACE_PROBE2(nce__bad__src__nce,
3683 			    nce_t *, src_nce, ill_t *, ill);
3684 			mutex_exit(&src_nce->nce_lock);
3685 			err = EINVAL;
3686 			goto err_ret;
3687 		}
3688 		template = copyb(src_nce->nce_res_mp);
3689 		mutex_exit(&src_nce->nce_lock);
3690 		if (template == NULL) {
3691 			err = ENOMEM;
3692 			goto err_ret;
3693 		}
3694 	} else if (flags & NCE_F_BCAST) {
3695 		/*
3696 		 * broadcast nce.
3697 		 */
3698 		template = copyb(ill->ill_bcast_mp);
3699 		if (template == NULL) {
3700 			err = ENOMEM;
3701 			goto err_ret;
3702 		}
3703 		state = ND_REACHABLE;
3704 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3705 		/*
3706 		 * NORESOLVER entries are always created in the REACHABLE
3707 		 * state. We create a nce_res_mp with the IP nexthop address
3708 		 * in the destination address in the DLPI hdr if the
3709 		 * physical length is exactly 4 bytes.
3710 		 *
3711 		 * XXX not clear which drivers set ill_phys_addr_length to
3712 		 * IP_ADDR_LEN.
3713 		 */
3714 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3715 			template = ill_dlur_gen((uchar_t *)addr,
3716 			    ill->ill_phys_addr_length,
3717 			    ill->ill_sap, ill->ill_sap_length);
3718 		} else {
3719 			template = copyb(ill->ill_resolver_mp);
3720 		}
3721 		if (template == NULL) {
3722 			err = ENOMEM;
3723 			goto err_ret;
3724 		}
3725 		state = ND_REACHABLE;
3726 	}
3727 	nce->nce_fp_mp = NULL;
3728 	nce->nce_res_mp = template;
3729 	nce->nce_state = state;
3730 	if (state == ND_REACHABLE) {
3731 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3732 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3733 	} else {
3734 		nce->nce_last = 0;
3735 		if (state == ND_INITIAL)
3736 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3737 	}
3738 
3739 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3740 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3741 	/*
3742 	 * Atomically ensure that the ill is not CONDEMNED, before
3743 	 * adding the NCE.
3744 	 */
3745 	mutex_enter(&ill->ill_lock);
3746 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3747 		mutex_exit(&ill->ill_lock);
3748 		err = EINVAL;
3749 		goto err_ret;
3750 	}
3751 	if ((nce->nce_next = *ncep) != NULL)
3752 		nce->nce_next->nce_ptpn = &nce->nce_next;
3753 	*ncep = nce;
3754 	nce->nce_ptpn = ncep;
3755 	*newnce = nce;
3756 	/* This one is for nce being used by an active thread */
3757 	NCE_REFHOLD(*newnce);
3758 
3759 	/* Bump up the number of nce's referencing this ill */
3760 	ill->ill_nce_cnt++;
3761 	mutex_exit(&ill->ill_lock);
3762 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3763 	return (0);
3764 err_ret:
3765 	freeb(mp);
3766 	freemsg(template);
3767 	return (err);
3768 }
3769 
3770 void
3771 ndp_flush_qd_mp(nce_t *nce)
3772 {
3773 	mblk_t *qd_mp, *qd_next;
3774 
3775 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3776 	qd_mp = nce->nce_qd_mp;
3777 	nce->nce_qd_mp = NULL;
3778 	while (qd_mp != NULL) {
3779 		qd_next = qd_mp->b_next;
3780 		qd_mp->b_next = NULL;
3781 		qd_mp->b_prev = NULL;
3782 		freemsg(qd_mp);
3783 		qd_mp = qd_next;
3784 	}
3785 }
3786 
3787 
3788 /*
3789  * ndp_walk routine to delete all entries that have a given destination or
3790  * gateway address and cached link layer (MAC) address.  This is used when ARP
3791  * informs us that a network-to-link-layer mapping may have changed.
3792  */
3793 void
3794 nce_delete_hw_changed(nce_t *nce, void *arg)
3795 {
3796 	nce_hw_map_t *hwm = arg;
3797 	mblk_t *mp;
3798 	dl_unitdata_req_t *dlu;
3799 	uchar_t *macaddr;
3800 	ill_t *ill;
3801 	int saplen;
3802 	ipaddr_t nce_addr;
3803 
3804 	if (nce->nce_state != ND_REACHABLE)
3805 		return;
3806 
3807 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3808 	if (nce_addr != hwm->hwm_addr)
3809 		return;
3810 
3811 	mutex_enter(&nce->nce_lock);
3812 	if ((mp = nce->nce_res_mp) == NULL) {
3813 		mutex_exit(&nce->nce_lock);
3814 		return;
3815 	}
3816 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3817 	macaddr = (uchar_t *)(dlu + 1);
3818 	ill = nce->nce_ill;
3819 	if ((saplen = ill->ill_sap_length) > 0)
3820 		macaddr += saplen;
3821 	else
3822 		saplen = -saplen;
3823 
3824 	/*
3825 	 * If the hardware address is unchanged, then leave this one alone.
3826 	 * Note that saplen == abs(saplen) now.
3827 	 */
3828 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3829 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3830 		mutex_exit(&nce->nce_lock);
3831 		return;
3832 	}
3833 	mutex_exit(&nce->nce_lock);
3834 
3835 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3836 	ndp_delete(nce);
3837 }
3838 
3839 /*
3840  * This function verifies whether a given IPv4 address is potentially known to
3841  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3842  * so that it can continue to look for hardware changes on that address.
3843  */
3844 boolean_t
3845 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3846 {
3847 	nce_t		*nce;
3848 	struct in_addr	nceaddr;
3849 	ip_stack_t	*ipst = ns->netstack_ip;
3850 
3851 	if (addr == INADDR_ANY)
3852 		return (B_FALSE);
3853 
3854 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3855 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3856 	for (; nce != NULL; nce = nce->nce_next) {
3857 		/* Note that only v4 mapped entries are in the table. */
3858 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3859 		if (addr == nceaddr.s_addr &&
3860 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3861 			/* Single flag check; no lock needed */
3862 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3863 				break;
3864 		}
3865 	}
3866 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3867 	return (nce != NULL);
3868 }
3869