xref: /titanic_41/usr/src/uts/common/inet/ip/ip_ndp.c (revision 60471b7bbfab236de7d8776aed871d919c5f81c3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ipsec_info.h>
66 #include <inet/sctp_ip.h>
67 #include <inet/ip2mac_impl.h>
68 
69 /*
70  * Function names with nce_ prefix are static while function
71  * names with ndp_ prefix are used by rest of the IP.
72  *
73  * Lock ordering:
74  *
75  *	ndp_g_lock -> ill_lock -> nce_lock
76  *
77  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
78  * nce_next.  Nce_lock protects the contents of the NCE (particularly
79  * nce_refcnt).
80  */
81 
82 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
83     uint32_t ll_addr_len);
84 static	void	nce_ire_delete(nce_t *nce);
85 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
86 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
87 static	nce_t	*nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *,
88     nce_t *);
89 static	nce_t	*nce_lookup_mapping(ill_t *, const in6_addr_t *);
90 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
91     uchar_t *addr);
92 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
93 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
94 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
95 static	void	nce_update(nce_t *nce, uint16_t new_state,
96     uchar_t *new_ll_addr);
97 static	uint32_t	nce_solicit(nce_t *nce, in6_addr_t src);
98 static	boolean_t	nce_xmit(ill_t *ill, uint8_t type,
99     boolean_t use_lla_addr, const in6_addr_t *sender,
100     const in6_addr_t *target, int flag);
101 static boolean_t	nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla,
102     const in6_addr_t *target, uint_t flags);
103 static boolean_t	nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla,
104     const in6_addr_t *src, uint_t flags);
105 static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
106     nce_t **, nce_t *);
107 static ipif_t	*ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill);
108 
109 #ifdef DEBUG
110 static void	nce_trace_cleanup(const nce_t *);
111 #endif
112 
/*
 * Address of the IPv4 NCE hash bucket for `addr': the slot of
 * ips_ndp4->nce_hash_tbl selected by IRE_ADDR_HASH(addr, NCE_TABLE_SIZE).
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Address of the IPv6 NCE hash bucket for `addr': the slot of
 * ips_ndp6->nce_hash_tbl selected by NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE).
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))

/*
 * Non-tunable probe interval, based on link capabilities: 150 when the ill
 * has ill_note_link set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER(); presumably it is in milliseconds -- TODO confirm
 * against that macro's definition.
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
122 
123 /*
124  * NDP Cache Entry creation routine.
125  * Mapped entries will never do NUD .
126  * This routine must always be called with ndp6->ndp_g_lock held.
127  * Prior to return, nce_refcnt is incremented.
128  */
129 int
130 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
131     const in6_addr_t *mask, const in6_addr_t *extract_mask,
132     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
133     nce_t **newnce)
134 {
135 	static	nce_t		nce_nil;
136 	nce_t		*nce;
137 	mblk_t		*mp;
138 	mblk_t		*template;
139 	nce_t		**ncep;
140 	int		err;
141 	boolean_t	dropped = B_FALSE;
142 	ip_stack_t	*ipst = ill->ill_ipst;
143 
144 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
145 	ASSERT(ill != NULL && ill->ill_isv6);
146 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
147 		ip0dbg(("ndp_add_v6: no addr\n"));
148 		return (EINVAL);
149 	}
150 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
151 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
152 		return (EINVAL);
153 	}
154 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
155 	    (flags & NCE_F_MAPPING)) {
156 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
157 		return (EINVAL);
158 	}
159 	/*
160 	 * Allocate the mblk to hold the nce.
161 	 *
162 	 * XXX This can come out of a separate cache - nce_cache.
163 	 * We don't need the mp anymore as there are no more
164 	 * "qwriter"s
165 	 */
166 	mp = allocb(sizeof (nce_t), BPRI_MED);
167 	if (mp == NULL)
168 		return (ENOMEM);
169 
170 	nce = (nce_t *)mp->b_rptr;
171 	mp->b_wptr = (uchar_t *)&nce[1];
172 	*nce = nce_nil;
173 
174 	/*
175 	 * This one holds link layer address
176 	 */
177 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
178 		template = nce_udreq_alloc(ill);
179 	} else {
180 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
181 		    ill->ill_mactype != DL_IPV6) {
182 			/*
183 			 * We create a nce_res_mp with the IP nexthop address
184 			 * as the destination address if the physical length
185 			 * is exactly 16 bytes for point-to-multipoint links
186 			 * that do their own resolution from IP to link-layer
187 			 * address.
188 			 */
189 			template = ill_dlur_gen((uchar_t *)addr,
190 			    ill->ill_phys_addr_length, ill->ill_sap,
191 			    ill->ill_sap_length);
192 		} else {
193 			if (ill->ill_resolver_mp == NULL) {
194 				freeb(mp);
195 				return (EINVAL);
196 			}
197 			ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
198 			template = copyb(ill->ill_resolver_mp);
199 		}
200 	}
201 	if (template == NULL) {
202 		freeb(mp);
203 		return (ENOMEM);
204 	}
205 	nce->nce_ill = ill;
206 	nce->nce_ipversion = IPV6_VERSION;
207 	nce->nce_flags = flags;
208 	nce->nce_state = state;
209 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
210 	nce->nce_rcnt = ill->ill_xmit_count;
211 	nce->nce_addr = *addr;
212 	nce->nce_mask = *mask;
213 	nce->nce_extract_mask = *extract_mask;
214 	nce->nce_ll_extract_start = hw_extract_start;
215 	nce->nce_fp_mp = NULL;
216 	nce->nce_res_mp = template;
217 	if (state == ND_REACHABLE)
218 		nce->nce_last = TICK_TO_MSEC(lbolt64);
219 	else
220 		nce->nce_last = 0;
221 	nce->nce_qd_mp = NULL;
222 	nce->nce_mp = mp;
223 	if (hw_addr != NULL)
224 		nce_set_ll(nce, hw_addr);
225 	/* This one is for nce getting created */
226 	nce->nce_refcnt = 1;
227 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
228 	if (nce->nce_flags & NCE_F_MAPPING) {
229 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
230 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
231 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
232 		ncep = &ipst->ips_ndp6->nce_mask_entries;
233 	} else {
234 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
235 	}
236 
237 	nce->nce_trace_disable = B_FALSE;
238 
239 	list_create(&nce->nce_cb, sizeof (nce_cb_t),
240 	    offsetof(nce_cb_t, nce_cb_node));
241 	/*
242 	 * Atomically ensure that the ill is not CONDEMNED, before
243 	 * adding the NCE.
244 	 */
245 	mutex_enter(&ill->ill_lock);
246 	if (ill->ill_state_flags & ILL_CONDEMNED) {
247 		mutex_exit(&ill->ill_lock);
248 		freeb(mp);
249 		freeb(template);
250 		return (EINVAL);
251 	}
252 	if ((nce->nce_next = *ncep) != NULL)
253 		nce->nce_next->nce_ptpn = &nce->nce_next;
254 	*ncep = nce;
255 	nce->nce_ptpn = ncep;
256 	*newnce = nce;
257 	/* This one is for nce being used by an active thread */
258 	NCE_REFHOLD(*newnce);
259 
260 	/* Bump up the number of nce's referencing this ill */
261 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
262 	    (char *), "nce", (void *), nce);
263 	ill->ill_nce_cnt++;
264 	mutex_exit(&ill->ill_lock);
265 
266 	err = 0;
267 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
268 		mutex_enter(&nce->nce_lock);
269 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
270 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
271 		mutex_exit(&nce->nce_lock);
272 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
273 		if (dropped) {
274 			mutex_enter(&nce->nce_lock);
275 			nce->nce_pcnt++;
276 			mutex_exit(&nce->nce_lock);
277 		}
278 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
279 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
280 		err = EINPROGRESS;
281 	} else if (flags & NCE_F_UNSOL_ADV) {
282 		/*
283 		 * We account for the transmit below by assigning one
284 		 * less than the ndd variable. Subsequent decrements
285 		 * are done in ndp_timer.
286 		 */
287 		mutex_enter(&nce->nce_lock);
288 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
289 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
290 		mutex_exit(&nce->nce_lock);
291 		dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast,
292 		    0);
293 		mutex_enter(&nce->nce_lock);
294 		if (dropped)
295 			nce->nce_unsolicit_count++;
296 		if (nce->nce_unsolicit_count != 0) {
297 			ASSERT(nce->nce_timeout_id == 0);
298 			nce->nce_timeout_id = timeout(ndp_timer, nce,
299 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
300 		}
301 		mutex_exit(&nce->nce_lock);
302 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
303 	}
304 
305 	/*
306 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
307 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
308 	 * We call nce_fastpath from nce_update if the link layer address of
309 	 * the peer changes from nce_update
310 	 */
311 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
312 		nce_fastpath(nce);
313 	return (err);
314 }
315 
316 int
317 ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
318     const in6_addr_t *addr, const in6_addr_t *mask,
319     const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags,
320     uint16_t state, nce_t **newnce)
321 {
322 	int	err = 0;
323 	nce_t	*nce;
324 	ip_stack_t	*ipst = ill->ill_ipst;
325 
326 	ASSERT(ill->ill_isv6);
327 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
328 
329 	/* Get head of v6 hash table */
330 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
331 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
332 	if (nce == NULL) {
333 		err = ndp_add_v6(ill,
334 		    hw_addr,
335 		    addr,
336 		    mask,
337 		    extract_mask,
338 		    hw_extract_start,
339 		    flags,
340 		    state,
341 		    newnce);
342 	} else {
343 		*newnce = nce;
344 		err = EEXIST;
345 	}
346 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
347 	return (err);
348 }
349 
350 /*
351  * Remove all the CONDEMNED nces from the appropriate hash table.
352  * We create a private list of NCEs, these may have ires pointing
353  * to them, so the list will be passed through to clean up dependent
354  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
355  */
356 static void
357 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
358 {
359 	nce_t *nce1;
360 	nce_t **ptpn;
361 
362 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
363 	ASSERT(ndp->ndp_g_walker == 0);
364 	for (; nce; nce = nce1) {
365 		nce1 = nce->nce_next;
366 		mutex_enter(&nce->nce_lock);
367 		if (nce->nce_flags & NCE_F_CONDEMNED) {
368 			ptpn = nce->nce_ptpn;
369 			nce1 = nce->nce_next;
370 			if (nce1 != NULL)
371 				nce1->nce_ptpn = ptpn;
372 			*ptpn = nce1;
373 			nce->nce_ptpn = NULL;
374 			nce->nce_next = NULL;
375 			nce->nce_next = *free_nce_list;
376 			*free_nce_list = nce;
377 		}
378 		mutex_exit(&nce->nce_lock);
379 	}
380 }
381 
382 /*
383  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
384  *    will return this NCE. Also no new IREs will be created that
385  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
386  *    be started (See NDP_RESTART_TIMER).
387  * 2. Cancel any currently running timeouts.
388  * 3. If there is an ndp walker, return. The walker will do the cleanup.
389  *    This ensures that walkers see a consistent list of NCEs while walking.
390  * 4. Otherwise remove the NCE from the list of NCEs
391  * 5. Delete all IREs pointing to this NCE.
392  */
393 void
394 ndp_delete(nce_t *nce)
395 {
396 	nce_t	**ptpn;
397 	nce_t	*nce1;
398 	int	ipversion = nce->nce_ipversion;
399 	ndp_g_t *ndp;
400 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
401 
402 	if (ipversion == IPV4_VERSION)
403 		ndp = ipst->ips_ndp4;
404 	else
405 		ndp = ipst->ips_ndp6;
406 
407 	/* Serialize deletes */
408 	mutex_enter(&nce->nce_lock);
409 	if (nce->nce_flags & NCE_F_CONDEMNED) {
410 		/* Some other thread is doing the delete */
411 		mutex_exit(&nce->nce_lock);
412 		return;
413 	}
414 	/*
415 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
416 	 * refcnt has to be >= 2
417 	 */
418 	ASSERT(nce->nce_refcnt >= 2);
419 	nce->nce_flags |= NCE_F_CONDEMNED;
420 	mutex_exit(&nce->nce_lock);
421 
422 	nce_fastpath_list_delete(nce);
423 
424 	/* Complete any waiting callbacks */
425 	nce_cb_dispatch(nce);
426 
427 	/*
428 	 * Cancel any running timer. Timeout can't be restarted
429 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
430 	 * Passing invalid timeout id is fine.
431 	 */
432 	if (nce->nce_timeout_id != 0) {
433 		(void) untimeout(nce->nce_timeout_id);
434 		nce->nce_timeout_id = 0;
435 	}
436 
437 	mutex_enter(&ndp->ndp_g_lock);
438 	if (nce->nce_ptpn == NULL) {
439 		/*
440 		 * The last ndp walker has already removed this nce from
441 		 * the list after we marked the nce CONDEMNED and before
442 		 * we grabbed the global lock.
443 		 */
444 		mutex_exit(&ndp->ndp_g_lock);
445 		return;
446 	}
447 	if (ndp->ndp_g_walker > 0) {
448 		/*
449 		 * Can't unlink. The walker will clean up
450 		 */
451 		ndp->ndp_g_walker_cleanup = B_TRUE;
452 		mutex_exit(&ndp->ndp_g_lock);
453 		return;
454 	}
455 
456 	/*
457 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
458 	 * the timer since it is marked CONDEMNED.
459 	 */
460 	ptpn = nce->nce_ptpn;
461 	nce1 = nce->nce_next;
462 	if (nce1 != NULL)
463 		nce1->nce_ptpn = ptpn;
464 	*ptpn = nce1;
465 	nce->nce_ptpn = NULL;
466 	nce->nce_next = NULL;
467 	mutex_exit(&ndp->ndp_g_lock);
468 
469 	nce_ire_delete(nce);
470 }
471 
472 void
473 ndp_inactive(nce_t *nce)
474 {
475 	mblk_t		**mpp;
476 	ill_t		*ill;
477 
478 	ASSERT(nce->nce_refcnt == 0);
479 	ASSERT(MUTEX_HELD(&nce->nce_lock));
480 	ASSERT(nce->nce_fastpath == NULL);
481 
482 	/* Free all nce allocated messages */
483 	mpp = &nce->nce_first_mp_to_free;
484 	do {
485 		while (*mpp != NULL) {
486 			mblk_t  *mp;
487 
488 			mp = *mpp;
489 			*mpp = mp->b_next;
490 
491 			inet_freemsg(mp);
492 		}
493 	} while (mpp++ != &nce->nce_last_mp_to_free);
494 
495 	if (nce->nce_ipversion == IPV6_VERSION) {
496 		/*
497 		 * must have been cleaned up in nce_delete
498 		 */
499 		ASSERT(list_is_empty(&nce->nce_cb));
500 		list_destroy(&nce->nce_cb);
501 	}
502 #ifdef DEBUG
503 	nce_trace_cleanup(nce);
504 #endif
505 
506 	ill = nce->nce_ill;
507 	mutex_enter(&ill->ill_lock);
508 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
509 	    (char *), "nce", (void *), nce);
510 	ill->ill_nce_cnt--;
511 	/*
512 	 * If the number of nce's associated with this ill have dropped
513 	 * to zero, check whether we need to restart any operation that
514 	 * is waiting for this to happen.
515 	 */
516 	if (ILL_DOWN_OK(ill)) {
517 		/* ipif_ill_refrele_tail drops the ill_lock */
518 		ipif_ill_refrele_tail(ill);
519 	} else {
520 		mutex_exit(&ill->ill_lock);
521 	}
522 	mutex_destroy(&nce->nce_lock);
523 	if (nce->nce_mp != NULL)
524 		inet_freemsg(nce->nce_mp);
525 }
526 
527 /*
528  * ndp_walk routine.  Delete the nce if it is associated with the ill
529  * that is going away.  Always called as a writer.
530  */
531 void
532 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
533 {
534 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
535 		ndp_delete(nce);
536 	}
537 }
538 
539 /*
540  * Walk a list of to be inactive NCEs and blow away all the ires.
541  */
542 static void
543 nce_ire_delete_list(nce_t *nce)
544 {
545 	nce_t *nce_next;
546 
547 	ASSERT(nce != NULL);
548 	while (nce != NULL) {
549 		nce_next = nce->nce_next;
550 		nce->nce_next = NULL;
551 
552 		/*
553 		 * It is possible for the last ndp walker (this thread)
554 		 * to come here after ndp_delete has marked the nce CONDEMNED
555 		 * and before it has removed the nce from the fastpath list
556 		 * or called untimeout. So we need to do it here. It is safe
557 		 * for both ndp_delete and this thread to do it twice or
558 		 * even simultaneously since each of the threads has a
559 		 * reference on the nce.
560 		 */
561 		nce_fastpath_list_delete(nce);
562 		/*
563 		 * Cancel any running timer. Timeout can't be restarted
564 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
565 		 * Passing invalid timeout id is fine.
566 		 */
567 		if (nce->nce_timeout_id != 0) {
568 			(void) untimeout(nce->nce_timeout_id);
569 			nce->nce_timeout_id = 0;
570 		}
571 		/*
572 		 * We might hit this func thus in the v4 case:
573 		 * ipif_down->ipif_ndp_down->ndp_walk
574 		 */
575 
576 		if (nce->nce_ipversion == IPV4_VERSION) {
577 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
578 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
579 		} else {
580 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
581 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
582 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
583 		}
584 		NCE_REFRELE_NOTR(nce);
585 		nce = nce_next;
586 	}
587 }
588 
589 /*
590  * Delete an ire when the nce goes away.
591  */
592 /* ARGSUSED */
593 static void
594 nce_ire_delete(nce_t *nce)
595 {
596 	if (nce->nce_ipversion == IPV6_VERSION) {
597 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
598 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
599 		NCE_REFRELE_NOTR(nce);
600 	} else {
601 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
602 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
603 		NCE_REFRELE_NOTR(nce);
604 	}
605 }
606 
607 /*
608  * ire_walk routine used to delete every IRE that shares this nce
609  */
610 static void
611 nce_ire_delete1(ire_t *ire, char *nce_arg)
612 {
613 	nce_t	*nce = (nce_t *)nce_arg;
614 
615 	ASSERT(ire->ire_type == IRE_CACHE);
616 
617 	if (ire->ire_nce == nce) {
618 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
619 		ire_delete(ire);
620 	}
621 }
622 
623 /*
624  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
625  */
626 boolean_t
627 ndp_restart_dad(nce_t *nce)
628 {
629 	boolean_t started;
630 	boolean_t dropped;
631 
632 	if (nce == NULL)
633 		return (B_FALSE);
634 	mutex_enter(&nce->nce_lock);
635 	if (nce->nce_state == ND_PROBE) {
636 		mutex_exit(&nce->nce_lock);
637 		started = B_TRUE;
638 	} else if (nce->nce_state == ND_REACHABLE) {
639 		nce->nce_state = ND_PROBE;
640 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
641 		mutex_exit(&nce->nce_lock);
642 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
643 		if (dropped) {
644 			mutex_enter(&nce->nce_lock);
645 			nce->nce_pcnt++;
646 			mutex_exit(&nce->nce_lock);
647 		}
648 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
649 		started = B_TRUE;
650 	} else {
651 		mutex_exit(&nce->nce_lock);
652 		started = B_FALSE;
653 	}
654 	return (started);
655 }
656 
657 /*
658  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
659  * If one is found, the refcnt on the nce will be incremented.
660  */
661 nce_t *
662 ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
663     boolean_t caller_holds_lock)
664 {
665 	nce_t	*nce;
666 	ip_stack_t *ipst = ill->ill_ipst;
667 
668 	ASSERT(ill->ill_isv6);
669 	if (!caller_holds_lock)
670 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
671 
672 	/* Get head of v6 hash table */
673 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
674 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
675 	if (nce == NULL)
676 		nce = nce_lookup_mapping(ill, addr);
677 	if (!caller_holds_lock)
678 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
679 	return (nce);
680 }
681 /*
682  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
683  * If one is found, the refcnt on the nce will be incremented.
684  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
685  * so we skip the nce_lookup_mapping call.
686  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
687  */
688 nce_t *
689 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
690 {
691 	nce_t	*nce;
692 	in6_addr_t addr6;
693 	ip_stack_t *ipst = ill->ill_ipst;
694 
695 	if (!caller_holds_lock)
696 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
697 
698 	/* Get head of v4 hash table */
699 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
700 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
701 	/*
702 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
703 	 * looking up have fastpath headers that are inherently per-ill.
704 	 */
705 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
706 	if (!caller_holds_lock)
707 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
708 	return (nce);
709 }
710 
711 /*
712  * Cache entry lookup.  Try to find an nce matching the parameters passed.
713  * Look only for exact entries (no mappings).  If an nce is found, increment
714  * the hold count on that nce. The caller passes in the start of the
715  * appropriate hash table, and must be holding the appropriate global
716  * lock (ndp_g_lock).
717  */
718 static nce_t *
719 nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
720     nce_t *nce)
721 {
722 	ndp_g_t		*ndp;
723 	ip_stack_t	*ipst = ill->ill_ipst;
724 
725 	if (ill->ill_isv6)
726 		ndp = ipst->ips_ndp6;
727 	else
728 		ndp = ipst->ips_ndp4;
729 
730 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
731 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
732 		return (NULL);
733 	for (; nce != NULL; nce = nce->nce_next) {
734 		if (nce->nce_ill == ill ||
735 		    match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) {
736 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
737 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
738 			    &ipv6_all_ones)) {
739 				mutex_enter(&nce->nce_lock);
740 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
741 					NCE_REFHOLD_LOCKED(nce);
742 					mutex_exit(&nce->nce_lock);
743 					break;
744 				}
745 				mutex_exit(&nce->nce_lock);
746 			}
747 		}
748 	}
749 	return (nce);
750 }
751 
752 /*
753  * Cache entry lookup.  Try to find an nce matching the parameters passed.
754  * Look only for mappings.
755  */
756 static nce_t *
757 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
758 {
759 	nce_t	*nce;
760 	ip_stack_t	*ipst = ill->ill_ipst;
761 
762 	ASSERT(ill != NULL && ill->ill_isv6);
763 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
764 	if (!IN6_IS_ADDR_MULTICAST(addr))
765 		return (NULL);
766 	nce = ipst->ips_ndp6->nce_mask_entries;
767 	for (; nce != NULL; nce = nce->nce_next)
768 		if (nce->nce_ill == ill &&
769 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
770 			mutex_enter(&nce->nce_lock);
771 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
772 				NCE_REFHOLD_LOCKED(nce);
773 				mutex_exit(&nce->nce_lock);
774 				break;
775 			}
776 			mutex_exit(&nce->nce_lock);
777 		}
778 	return (nce);
779 }
780 
781 /*
782  * Process passed in parameters either from an incoming packet or via
783  * user ioctl.
784  */
785 static void
786 nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
787 {
788 	ill_t	*ill = nce->nce_ill;
789 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
790 	mblk_t	*mp;
791 	boolean_t ll_updated = B_FALSE;
792 	boolean_t ll_changed;
793 	ip_stack_t	*ipst = ill->ill_ipst;
794 
795 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
796 	/*
797 	 * No updates of link layer address or the neighbor state is
798 	 * allowed, when the cache is in NONUD state.  This still
799 	 * allows for responding to reachability solicitation.
800 	 */
801 	mutex_enter(&nce->nce_lock);
802 	if (nce->nce_state == ND_INCOMPLETE) {
803 		if (hw_addr == NULL) {
804 			mutex_exit(&nce->nce_lock);
805 			return;
806 		}
807 		nce_set_ll(nce, hw_addr);
808 		/*
809 		 * Update nce state and send the queued packets
810 		 * back to ip this time ire will be added.
811 		 */
812 		if (flag & ND_NA_FLAG_SOLICITED) {
813 			nce_update(nce, ND_REACHABLE, NULL);
814 		} else {
815 			nce_update(nce, ND_STALE, NULL);
816 		}
817 		mutex_exit(&nce->nce_lock);
818 		nce_fastpath(nce);
819 		nce_cb_dispatch(nce); /* complete callbacks */
820 		mutex_enter(&nce->nce_lock);
821 		mp = nce->nce_qd_mp;
822 		nce->nce_qd_mp = NULL;
823 		mutex_exit(&nce->nce_lock);
824 		while (mp != NULL) {
825 			mblk_t *nxt_mp, *data_mp;
826 
827 			nxt_mp = mp->b_next;
828 			mp->b_next = NULL;
829 
830 			if (mp->b_datap->db_type == M_CTL)
831 				data_mp = mp->b_cont;
832 			else
833 				data_mp = mp;
834 			if (data_mp->b_prev != NULL) {
835 				ill_t   *inbound_ill;
836 				queue_t *fwdq = NULL;
837 				uint_t ifindex;
838 
839 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
840 				inbound_ill = ill_lookup_on_ifindex(ifindex,
841 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
842 				if (inbound_ill == NULL) {
843 					data_mp->b_prev = NULL;
844 					freemsg(mp);
845 					return;
846 				} else {
847 					fwdq = inbound_ill->ill_rq;
848 				}
849 				data_mp->b_prev = NULL;
850 				/*
851 				 * Send a forwarded packet back into ip_rput_v6
852 				 * just as in ire_send_v6().
853 				 * Extract the queue from b_prev (set in
854 				 * ip_rput_data_v6).
855 				 */
856 				if (fwdq != NULL) {
857 					/*
858 					 * Forwarded packets hop count will
859 					 * get decremented in ip_rput_data_v6
860 					 */
861 					if (data_mp != mp)
862 						freeb(mp);
863 					put(fwdq, data_mp);
864 				} else {
865 					/*
866 					 * Send locally originated packets back
867 					 * into ip_wput_v6.
868 					 */
869 					put(ill->ill_wq, mp);
870 				}
871 				ill_refrele(inbound_ill);
872 			} else {
873 				put(ill->ill_wq, mp);
874 			}
875 			mp = nxt_mp;
876 		}
877 		return;
878 	}
879 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
880 	if (!is_adv) {
881 		/* If this is a SOLICITATION request only */
882 		if (ll_changed)
883 			nce_update(nce, ND_STALE, hw_addr);
884 		mutex_exit(&nce->nce_lock);
885 		nce_cb_dispatch(nce);
886 		return;
887 	}
888 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
889 		/* If in any other state than REACHABLE, ignore */
890 		if (nce->nce_state == ND_REACHABLE) {
891 			nce_update(nce, ND_STALE, NULL);
892 		}
893 		mutex_exit(&nce->nce_lock);
894 		nce_cb_dispatch(nce);
895 		return;
896 	} else {
897 		if (ll_changed) {
898 			nce_update(nce, ND_UNCHANGED, hw_addr);
899 			ll_updated = B_TRUE;
900 		}
901 		if (flag & ND_NA_FLAG_SOLICITED) {
902 			nce_update(nce, ND_REACHABLE, NULL);
903 		} else {
904 			if (ll_updated) {
905 				nce_update(nce, ND_STALE, NULL);
906 			}
907 		}
908 		mutex_exit(&nce->nce_lock);
909 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
910 		    NCE_F_ISROUTER)) {
911 			ire_t *ire;
912 
913 			/*
914 			 * Router turned to host.  We need to remove the
915 			 * entry as well as any default route that may be
916 			 * using this as a next hop.  This is required by
917 			 * section 7.2.5 of RFC 2461.
918 			 */
919 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
920 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
921 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
922 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
923 			    MATCH_IRE_DEFAULT, ipst);
924 			if (ire != NULL) {
925 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
926 				ire_delete(ire);
927 				ire_refrele(ire);
928 			}
929 			ndp_delete(nce); /* will do nce_cb_dispatch */
930 		} else {
931 			nce_cb_dispatch(nce);
932 		}
933 	}
934 }
935 
/*
 * Walker state structure used by ndp_process() / ndp_process_entry().
 * ndp_process() fills one in on its stack and hands it to the NCE walker
 * as the callback argument; ndp_process_entry() uses np_ill and np_addr to
 * select NCEs and forwards the remaining members to nce_process().
 */
typedef struct ndp_process_data {
	ill_t		*np_ill; 	/* ill/illgrp to match against */
	const in6_addr_t *np_addr; 	/* IPv6 address to match */
	uchar_t		*np_hw_addr; 	/* passed to nce_process() */
	uint32_t	np_flag;	/* passed to nce_process() */
	boolean_t	np_is_adv;	/* passed to nce_process() */
} ndp_process_data_t;
946 
947 /*
948  * Walker callback used by ndp_process() for IPMP groups: calls nce_process()
949  * for each NCE with a matching address that's in the same IPMP group.
950  */
951 static void
952 ndp_process_entry(nce_t *nce, void *arg)
953 {
954 	ndp_process_data_t *npp = arg;
955 
956 	if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) &&
957 	    IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) &&
958 	    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
959 		nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv);
960 	}
961 }
962 
963 /*
964  * Wrapper around nce_process() that handles IPMP.  In particular, for IPMP,
965  * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have
966  * more than one NCE for a given IPv6 address to tend to.  In that case, we
967  * need to walk all NCEs and callback nce_process() for each one.  Since this
968  * is expensive, in the non-IPMP case we just directly call nce_process().
969  * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP
970  * interfaces in an IPMP group share the same NCEs -- at which point this
971  * function can be removed entirely.
972  */
973 void
974 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
975 {
976 	ill_t *ill = nce->nce_ill;
977 	struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6;
978 	ndp_process_data_t np;
979 
980 	if (ill->ill_grp == NULL) {
981 		nce_process(nce, hw_addr, flag, is_adv);
982 		return;
983 	}
984 
985 	/* IPMP case: walk all NCEs */
986 	np.np_ill = ill;
987 	np.np_addr = &nce->nce_addr;
988 	np.np_flag = flag;
989 	np.np_is_adv = is_adv;
990 	np.np_hw_addr = hw_addr;
991 
992 	ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES);
993 }
994 
995 /*
996  * Pass arg1 to the pfi supplied, along with each nce in existence.
997  * ndp_walk() places a REFHOLD on the nce and drops the lock when
998  * walking the hash list.
999  */
1000 void
1001 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
1002     boolean_t trace)
1003 {
1004 	nce_t	*nce;
1005 	nce_t	*nce1;
1006 	nce_t	**ncep;
1007 	nce_t	*free_nce_list = NULL;
1008 
1009 	mutex_enter(&ndp->ndp_g_lock);
1010 	/* Prevent ndp_delete from unlink and free of NCE */
1011 	ndp->ndp_g_walker++;
1012 	mutex_exit(&ndp->ndp_g_lock);
1013 	for (ncep = ndp->nce_hash_tbl;
1014 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1015 		for (nce = *ncep; nce != NULL; nce = nce1) {
1016 			nce1 = nce->nce_next;
1017 			if (ill == NULL || nce->nce_ill == ill) {
1018 				if (trace) {
1019 					NCE_REFHOLD(nce);
1020 					(*pfi)(nce, arg1);
1021 					NCE_REFRELE(nce);
1022 				} else {
1023 					NCE_REFHOLD_NOTR(nce);
1024 					(*pfi)(nce, arg1);
1025 					NCE_REFRELE_NOTR(nce);
1026 				}
1027 			}
1028 		}
1029 	}
1030 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
1031 		nce1 = nce->nce_next;
1032 		if (ill == NULL || nce->nce_ill == ill) {
1033 			if (trace) {
1034 				NCE_REFHOLD(nce);
1035 				(*pfi)(nce, arg1);
1036 				NCE_REFRELE(nce);
1037 			} else {
1038 				NCE_REFHOLD_NOTR(nce);
1039 				(*pfi)(nce, arg1);
1040 				NCE_REFRELE_NOTR(nce);
1041 			}
1042 		}
1043 	}
1044 	mutex_enter(&ndp->ndp_g_lock);
1045 	ndp->ndp_g_walker--;
1046 	/*
1047 	 * While NCE's are removed from global list they are placed
1048 	 * in a private list, to be passed to nce_ire_delete_list().
1049 	 * The reason is, there may be ires pointing to this nce
1050 	 * which needs to cleaned up.
1051 	 */
1052 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
1053 		/* Time to delete condemned entries */
1054 		for (ncep = ndp->nce_hash_tbl;
1055 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1056 			nce = *ncep;
1057 			if (nce != NULL) {
1058 				nce_remove(ndp, nce, &free_nce_list);
1059 			}
1060 		}
1061 		nce = ndp->nce_mask_entries;
1062 		if (nce != NULL) {
1063 			nce_remove(ndp, nce, &free_nce_list);
1064 		}
1065 		ndp->ndp_g_walker_cleanup = B_FALSE;
1066 	}
1067 
1068 	mutex_exit(&ndp->ndp_g_lock);
1069 
1070 	if (free_nce_list != NULL) {
1071 		nce_ire_delete_list(free_nce_list);
1072 	}
1073 }
1074 
1075 /*
1076  * Walk everything.
1077  * Note that ill can be NULL hence can't derive the ipst from it.
1078  */
1079 void
1080 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1081 {
1082 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1083 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1084 }
1085 
1086 /*
1087  * Process resolve requests.  Handles both mapped entries
1088  * as well as cases that needs to be send out on the wire.
1089  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1090  * or one is created, we defer making ire point to nce until the
1091  * ire is actually added at which point the nce_refcnt on the nce is
1092  * incremented.  This is done primarily to have symmetry between ire_add()
1093  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1094  */
1095 int
1096 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1097 {
1098 	nce_t		*nce, *hw_nce = NULL;
1099 	int		err;
1100 	ill_t		*ipmp_ill;
1101 	uint16_t	nce_flags;
1102 	mblk_t		*mp_nce = NULL;
1103 	ip_stack_t	*ipst = ill->ill_ipst;
1104 	uchar_t		*hwaddr = NULL;
1105 
1106 	ASSERT(ill->ill_isv6);
1107 
1108 	if (IN6_IS_ADDR_MULTICAST(dst))
1109 		return (nce_set_multicast(ill, dst));
1110 
1111 	nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
1112 
1113 	/*
1114 	 * If `ill' is under IPMP, then first check to see if there's an NCE
1115 	 * for `dst' on the IPMP meta-interface (e.g., because an application
1116 	 * explicitly did an SIOCLIFSETND to tie a hardware address to `dst').
1117 	 * If so, we use that hardware address when creating the NCE below.
1118 	 * Note that we don't yet have a mechanism to remove these NCEs if the
1119 	 * NCE for `dst' on the IPMP meta-interface is subsequently removed --
1120 	 * but rather than build such a beast, we should fix NCEs so that they
1121 	 * can be properly shared across an IPMP group.
1122 	 */
1123 	if (IS_UNDER_IPMP(ill)) {
1124 		if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
1125 			hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE);
1126 			if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) {
1127 				hwaddr = hw_nce->nce_res_mp->b_rptr +
1128 				    NCE_LL_ADDR_OFFSET(ipmp_ill);
1129 				nce_flags |= hw_nce->nce_flags;
1130 			}
1131 			ill_refrele(ipmp_ill);
1132 		}
1133 	}
1134 
1135 	err = ndp_lookup_then_add_v6(ill,
1136 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1137 	    hwaddr,
1138 	    dst,
1139 	    &ipv6_all_ones,
1140 	    &ipv6_all_zeros,
1141 	    0,
1142 	    nce_flags,
1143 	    hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE,
1144 	    &nce);
1145 
1146 	if (hw_nce != NULL)
1147 		NCE_REFRELE(hw_nce);
1148 
1149 	switch (err) {
1150 	case 0:
1151 		/*
1152 		 * New cache entry was created. Make sure that the state
1153 		 * is not ND_INCOMPLETE. It can be in some other state
1154 		 * even before we send out the solicitation as we could
1155 		 * get un-solicited advertisements.
1156 		 *
1157 		 * If this is an XRESOLV interface, simply return 0,
1158 		 * since we don't want to solicit just yet.
1159 		 */
1160 		if (ill->ill_flags & ILLF_XRESOLV) {
1161 			NCE_REFRELE(nce);
1162 			return (0);
1163 		}
1164 
1165 		mutex_enter(&nce->nce_lock);
1166 		if (nce->nce_state != ND_INCOMPLETE) {
1167 			mutex_exit(&nce->nce_lock);
1168 			NCE_REFRELE(nce);
1169 			return (0);
1170 		}
1171 		if (nce->nce_rcnt == 0) {
1172 			/* The caller will free mp */
1173 			mutex_exit(&nce->nce_lock);
1174 			ndp_delete(nce);
1175 			NCE_REFRELE(nce);
1176 			return (ESRCH);
1177 		}
1178 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1179 		if (mp_nce == NULL) {
1180 			/* The caller will free mp */
1181 			mutex_exit(&nce->nce_lock);
1182 			ndp_delete(nce);
1183 			NCE_REFRELE(nce);
1184 			return (ENOMEM);
1185 		}
1186 		nce_queue_mp(nce, mp_nce);
1187 		ip_ndp_resolve(nce);
1188 		mutex_exit(&nce->nce_lock);
1189 		NCE_REFRELE(nce);
1190 		return (EINPROGRESS);
1191 	case EEXIST:
1192 		/* Resolution in progress just queue the packet */
1193 		mutex_enter(&nce->nce_lock);
1194 		if (nce->nce_state == ND_INCOMPLETE) {
1195 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1196 			if (mp_nce == NULL) {
1197 				err = ENOMEM;
1198 			} else {
1199 				nce_queue_mp(nce, mp_nce);
1200 				err = EINPROGRESS;
1201 			}
1202 		} else {
1203 			/*
1204 			 * Any other state implies we have
1205 			 * a nce but IRE needs to be added ...
1206 			 * ire_add_v6() will take care of the
1207 			 * the case when the nce becomes CONDEMNED
1208 			 * before the ire is added to the table.
1209 			 */
1210 			err = 0;
1211 		}
1212 		mutex_exit(&nce->nce_lock);
1213 		NCE_REFRELE(nce);
1214 		break;
1215 	default:
1216 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1217 		break;
1218 	}
1219 	return (err);
1220 }
1221 
1222 /*
1223  * When there is no resolver, the link layer template is passed in
1224  * the IRE.
1225  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1226  * or one is created, we defer making ire point to nce until the
1227  * ire is actually added at which point the nce_refcnt on the nce is
1228  * incremented.  This is done primarily to have symmetry between ire_add()
1229  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1230  */
1231 int
1232 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1233 {
1234 	nce_t		*nce;
1235 	int		err = 0;
1236 
1237 	ASSERT(ill != NULL);
1238 	ASSERT(ill->ill_isv6);
1239 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1240 		err = nce_set_multicast(ill, dst);
1241 		return (err);
1242 	}
1243 
1244 	err = ndp_lookup_then_add_v6(ill,
1245 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1246 	    ill->ill_dest_addr,	/* hardware address is NULL in most cases */
1247 	    dst,
1248 	    &ipv6_all_ones,
1249 	    &ipv6_all_zeros,
1250 	    0,
1251 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1252 	    ND_REACHABLE,
1253 	    &nce);
1254 
1255 	switch (err) {
1256 	case 0:
1257 		/*
1258 		 * Cache entry with a proper resolver cookie was
1259 		 * created.
1260 		 */
1261 		NCE_REFRELE(nce);
1262 		break;
1263 	case EEXIST:
1264 		err = 0;
1265 		NCE_REFRELE(nce);
1266 		break;
1267 	default:
1268 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1269 		break;
1270 	}
1271 	return (err);
1272 }
1273 
1274 /*
1275  * For each interface an entry is added for the unspecified multicast group.
1276  * Here that mapping is used to form the multicast cache entry for a particular
1277  * multicast destination.
1278  */
1279 static int
1280 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1281 {
1282 	nce_t		*mnce;	/* Multicast mapping entry */
1283 	nce_t		*nce;
1284 	uchar_t		*hw_addr = NULL;
1285 	int		err = 0;
1286 	ip_stack_t	*ipst = ill->ill_ipst;
1287 
1288 	ASSERT(ill != NULL);
1289 	ASSERT(ill->ill_isv6);
1290 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1291 
1292 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1293 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1294 	nce = nce_lookup_addr(ill, B_FALSE, dst, nce);
1295 	if (nce != NULL) {
1296 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1297 		NCE_REFRELE(nce);
1298 		return (0);
1299 	}
1300 	/* No entry, now lookup for a mapping this should never fail */
1301 	mnce = nce_lookup_mapping(ill, dst);
1302 	if (mnce == NULL) {
1303 		/* Something broken for the interface. */
1304 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1305 		return (ESRCH);
1306 	}
1307 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1308 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1309 		/*
1310 		 * For IRE_IF_RESOLVER a hardware mapping can be
1311 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1312 		 * in the ill is copied in ndp_add_v6().
1313 		 */
1314 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1315 		if (hw_addr == NULL) {
1316 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1317 			NCE_REFRELE(mnce);
1318 			return (ENOMEM);
1319 		}
1320 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1321 	}
1322 	NCE_REFRELE(mnce);
1323 	/*
1324 	 * IRE_IF_NORESOLVER type simply copies the resolution
1325 	 * cookie passed in.  So no hw_addr is needed.
1326 	 */
1327 	err = ndp_add_v6(ill,
1328 	    hw_addr,
1329 	    dst,
1330 	    &ipv6_all_ones,
1331 	    &ipv6_all_zeros,
1332 	    0,
1333 	    NCE_F_NONUD,
1334 	    ND_REACHABLE,
1335 	    &nce);
1336 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1337 	if (hw_addr != NULL)
1338 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1339 	if (err != 0) {
1340 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1341 		return (err);
1342 	}
1343 	NCE_REFRELE(nce);
1344 	return (0);
1345 }
1346 
1347 /*
1348  * Return the link layer address, and any flags of a nce.
1349  */
1350 int
1351 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1352 {
1353 	nce_t		*nce;
1354 	in6_addr_t	*addr;
1355 	sin6_t		*sin6;
1356 	dl_unitdata_req_t	*dl;
1357 
1358 	ASSERT(ill != NULL && ill->ill_isv6);
1359 	sin6 = (sin6_t *)&lnr->lnr_addr;
1360 	addr =  &sin6->sin6_addr;
1361 
1362 	/*
1363 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1364 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1365 	 * addresses for the data addresses on an IPMP interface even though
1366 	 * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill.
1367 	 */
1368 	nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE);
1369 	if (nce == NULL)
1370 		return (ESRCH);
1371 	/* If in INCOMPLETE state, no link layer address is available yet */
1372 	if (!NCE_ISREACHABLE(nce)) {
1373 		NCE_REFRELE(nce);
1374 		return (ESRCH);
1375 	}
1376 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1377 	if (ill->ill_flags & ILLF_XRESOLV)
1378 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1379 	else
1380 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1381 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1382 	    sizeof (lnr->lnr_hdw_addr));
1383 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1384 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1385 	if (nce->nce_flags & NCE_F_ISROUTER)
1386 		lnr->lnr_flags = NDF_ISROUTER_ON;
1387 	if (nce->nce_flags & NCE_F_ANYCAST)
1388 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1389 	NCE_REFRELE(nce);
1390 	return (0);
1391 }
1392 
1393 /*
1394  * Send Enable/Disable multicast reqs to driver.
1395  */
1396 int
1397 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1398     uint32_t hw_addr_offset, mblk_t *mp)
1399 {
1400 	nce_t		*nce;
1401 	uchar_t		*hw_addr;
1402 	ip_stack_t	*ipst = ill->ill_ipst;
1403 
1404 	ASSERT(ill != NULL && ill->ill_isv6);
1405 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1406 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1407 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1408 		freemsg(mp);
1409 		return (EINVAL);
1410 	}
1411 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1412 	nce = nce_lookup_mapping(ill, addr);
1413 	if (nce == NULL) {
1414 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1415 		freemsg(mp);
1416 		return (ESRCH);
1417 	}
1418 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1419 	/*
1420 	 * Update dl_addr_length and dl_addr_offset for primitives that
1421 	 * have physical addresses as opposed to full saps
1422 	 */
1423 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1424 	case DL_ENABMULTI_REQ:
1425 		/* Track the state if this is the first enabmulti */
1426 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1427 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1428 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1429 		break;
1430 	case DL_DISABMULTI_REQ:
1431 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1432 		break;
1433 	default:
1434 		NCE_REFRELE(nce);
1435 		ip1dbg(("ndp_mcastreq: default\n"));
1436 		return (EINVAL);
1437 	}
1438 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1439 	NCE_REFRELE(nce);
1440 	ill_dlpi_send(ill, mp);
1441 	return (0);
1442 }
1443 
1444 
1445 /*
1446  * Send out a NS for resolving the ip address in nce.
1447  */
1448 void
1449 ip_ndp_resolve(nce_t *nce)
1450 {
1451 	in6_addr_t	sender6 = ipv6_all_zeros;
1452 	uint32_t	ms;
1453 	mblk_t		*mp;
1454 	ip6_t		*ip6h;
1455 
1456 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1457 	/*
1458 	 * Pick the src from outgoing packet, if one is available.
1459 	 * Otherwise let nce_xmit figure out the src.
1460 	 */
1461 	if ((mp = nce->nce_qd_mp) != NULL) {
1462 		/* Handle ip_newroute_v6 giving us IPSEC packets */
1463 		if (mp->b_datap->db_type == M_CTL)
1464 			mp = mp->b_cont;
1465 		ip6h = (ip6_t *)mp->b_rptr;
1466 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
1467 			/*
1468 			 * This message should have been pulled up already in
1469 			 * ip_wput_v6. We can't do pullups here because
1470 			 * the message could be from the nce_qd_mp which could
1471 			 * have b_next/b_prev non-NULL.
1472 			 */
1473 			ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
1474 			ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1475 		}
1476 		sender6 = ip6h->ip6_src;
1477 	}
1478 	ms = nce_solicit(nce, sender6);
1479 	mutex_exit(&nce->nce_lock);
1480 	if (ms == 0) {
1481 		if (nce->nce_state != ND_REACHABLE) {
1482 			nce_resolv_failed(nce);
1483 			ndp_delete(nce);
1484 		}
1485 	} else {
1486 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1487 	}
1488 	mutex_enter(&nce->nce_lock);
1489 }
1490 
1491 /*
1492  * Send a neighbor solicitation.
1493  * Returns number of milliseconds after which we should either rexmit or abort.
1494  * Return of zero means we should abort.
1495  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1496  *
1497  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1498  * the packet.
1499  */
1500 uint32_t
1501 nce_solicit(nce_t *nce, in6_addr_t sender)
1502 {
1503 	boolean_t	dropped;
1504 
1505 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
1506 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1507 
1508 	if (nce->nce_rcnt == 0)
1509 		return (0);
1510 
1511 	nce->nce_rcnt--;
1512 	mutex_exit(&nce->nce_lock);
1513 	dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0);
1514 	mutex_enter(&nce->nce_lock);
1515 	if (dropped)
1516 		nce->nce_rcnt++;
1517 	return (nce->nce_ill->ill_reachable_retrans_time);
1518 }
1519 
1520 /*
1521  * Attempt to recover an address on an interface that's been marked as a
1522  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1523  * no easy way to just probe the address and have the right thing happen if
1524  * it's no longer in use.  Instead, we just bring it up normally and allow the
1525  * regular interface start-up logic to probe for a remaining duplicate and take
1526  * us back down if necessary.
1527  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1528  * ip_ndp_excl.
1529  */
1530 /* ARGSUSED */
1531 static void
1532 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1533 {
1534 	ill_t	*ill = rq->q_ptr;
1535 	ipif_t	*ipif;
1536 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1537 
1538 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1539 		/*
1540 		 * We do not support recovery of proxy ARP'd interfaces,
1541 		 * because the system lacks a complete proxy ARP mechanism.
1542 		 */
1543 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1544 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1545 			continue;
1546 		}
1547 
1548 		/*
1549 		 * If we have already recovered or if the interface is going
1550 		 * away, then ignore.
1551 		 */
1552 		mutex_enter(&ill->ill_lock);
1553 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1554 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1555 			mutex_exit(&ill->ill_lock);
1556 			continue;
1557 		}
1558 
1559 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1560 		ill->ill_ipif_dup_count--;
1561 		mutex_exit(&ill->ill_lock);
1562 		ipif->ipif_was_dup = B_TRUE;
1563 
1564 		VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1565 		(void) ipif_up_done_v6(ipif);
1566 	}
1567 	freeb(mp);
1568 }
1569 
1570 /*
1571  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1572  * As long as someone else holds the address, the interface will stay down.
1573  * When that conflict goes away, the interface is brought back up.  This is
1574  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1575  * server will recover from a failure.
1576  *
1577  * For DHCP and temporary addresses, recovery is not done in the kernel.
1578  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1579  *
1580  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1581  */
1582 static void
1583 ipif6_dup_recovery(void *arg)
1584 {
1585 	ipif_t *ipif = arg;
1586 
1587 	ipif->ipif_recovery_id = 0;
1588 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1589 		return;
1590 
1591 	/*
1592 	 * No lock, because this is just an optimization.
1593 	 */
1594 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1595 		return;
1596 
1597 	/* If the link is down, we'll retry this later */
1598 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1599 		return;
1600 
1601 	ndp_do_recovery(ipif);
1602 }
1603 
1604 /*
1605  * Perform interface recovery by forcing the duplicate interfaces up and
1606  * allowing the system to determine which ones should stay up.
1607  *
1608  * Called both by recovery timer expiry and link-up notification.
1609  */
1610 void
1611 ndp_do_recovery(ipif_t *ipif)
1612 {
1613 	ill_t *ill = ipif->ipif_ill;
1614 	mblk_t *mp;
1615 	ip_stack_t *ipst = ill->ill_ipst;
1616 
1617 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1618 	if (mp == NULL) {
1619 		mutex_enter(&ill->ill_lock);
1620 		if (ipif->ipif_recovery_id == 0 &&
1621 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1622 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1623 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1624 		}
1625 		mutex_exit(&ill->ill_lock);
1626 	} else {
1627 		/*
1628 		 * A recovery timer may still be running if we got here from
1629 		 * ill_restart_dad(); cancel that timer.
1630 		 */
1631 		if (ipif->ipif_recovery_id != 0)
1632 			(void) untimeout(ipif->ipif_recovery_id);
1633 		ipif->ipif_recovery_id = 0;
1634 
1635 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1636 		    sizeof (ipif->ipif_v6lcl_addr));
1637 		ill_refhold(ill);
1638 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1639 		    B_FALSE);
1640 	}
1641 }
1642 
1643 /*
1644  * Find the MAC and IP addresses in an NA/NS message.
1645  */
1646 static void
1647 ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp,
1648     uchar_t **haddr, uint_t *haddrlenp)
1649 {
1650 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1651 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1652 	nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
1653 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1654 	uchar_t *addr;
1655 	int alen = 0;
1656 
1657 	if (dl_mp == NULL) {
1658 		nd_opt_hdr_t *opt = NULL;
1659 		int len;
1660 
1661 		/*
1662 		 * If it's from the fast-path, then it can't be a probe
1663 		 * message, and thus must include a linkaddr option.
1664 		 * Extract that here.
1665 		 */
1666 		switch (icmp6->icmp6_type) {
1667 		case ND_NEIGHBOR_SOLICIT:
1668 			len = mp->b_wptr - (uchar_t *)ns;
1669 			if ((len -= sizeof (*ns)) > 0) {
1670 				opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1),
1671 				    len, ND_OPT_SOURCE_LINKADDR);
1672 			}
1673 			break;
1674 		case ND_NEIGHBOR_ADVERT:
1675 			len = mp->b_wptr - (uchar_t *)na;
1676 			if ((len -= sizeof (*na)) > 0) {
1677 				opt = ndp_get_option((nd_opt_hdr_t *)(na + 1),
1678 				    len, ND_OPT_TARGET_LINKADDR);
1679 			}
1680 			break;
1681 		}
1682 
1683 		if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >=
1684 		    ill->ill_nd_lla_len) {
1685 			addr = (uchar_t *)(opt + 1);
1686 			alen = ill->ill_nd_lla_len;
1687 		}
1688 
1689 		/*
1690 		 * We cheat a bit here for the sake of printing usable log
1691 		 * messages in the rare case where the reply we got was unicast
1692 		 * without a source linkaddr option, and the interface is in
1693 		 * fastpath mode.  (Sigh.)
1694 		 */
1695 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1696 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1697 			struct ether_header *pether;
1698 
1699 			pether = (struct ether_header *)((char *)ip6h -
1700 			    sizeof (*pether));
1701 			addr = pether->ether_shost.ether_addr_octet;
1702 			alen = ETHERADDRL;
1703 		}
1704 	} else {
1705 		dl_unitdata_ind_t *dlu;
1706 
1707 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1708 		alen = dlu->dl_src_addr_length;
1709 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1710 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1711 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1712 			if (ill->ill_sap_length < 0) {
1713 				alen += ill->ill_sap_length;
1714 			} else {
1715 				addr += ill->ill_sap_length;
1716 				alen -= ill->ill_sap_length;
1717 			}
1718 		}
1719 	}
1720 
1721 	if (alen > 0) {
1722 		*haddr = addr;
1723 		*haddrlenp = alen;
1724 	} else {
1725 		*haddr = NULL;
1726 		*haddrlenp = 0;
1727 	}
1728 
1729 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1730 	*targp = ns->nd_ns_target;
1731 }
1732 
1733 /*
1734  * This is for exclusive changes due to NDP duplicate address detection
1735  * failure.
1736  */
1737 /* ARGSUSED */
1738 static void
1739 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1740 {
1741 	ill_t	*ill = rq->q_ptr;
1742 	ipif_t	*ipif;
1743 	mblk_t	*dl_mp = NULL;
1744 	uchar_t	*haddr;
1745 	uint_t	haddrlen;
1746 	ip_stack_t *ipst = ill->ill_ipst;
1747 	in6_addr_t targ;
1748 
1749 	if (DB_TYPE(mp) != M_DATA) {
1750 		dl_mp = mp;
1751 		mp = mp->b_cont;
1752 	}
1753 
1754 	ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1755 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1756 		/*
1757 		 * Ignore conflicts generated by misbehaving switches that
1758 		 * just reflect our own messages back to us.  For IPMP, we may
1759 		 * see reflections across any ill in the illgrp.
1760 		 */
1761 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1762 		    IS_UNDER_IPMP(ill) &&
1763 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL)
1764 			goto ignore_conflict;
1765 	}
1766 
1767 	/*
1768 	 * Look up the appropriate ipif.
1769 	 */
1770 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL,
1771 	    NULL, ipst);
1772 	if (ipif == NULL)
1773 		goto ignore_conflict;
1774 
1775 	/* Reload the ill to match the ipif */
1776 	ill = ipif->ipif_ill;
1777 
1778 	/* If it's already duplicate or ineligible, then don't do anything. */
1779 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1780 		ipif_refrele(ipif);
1781 		goto ignore_conflict;
1782 	}
1783 
1784 	/*
1785 	 * If this is a failure during duplicate recovery, then don't
1786 	 * complain.  It may take a long time to recover.
1787 	 */
1788 	if (!ipif->ipif_was_dup) {
1789 		char ibuf[LIFNAMSIZ];
1790 		char hbuf[MAC_STR_LEN];
1791 		char sbuf[INET6_ADDRSTRLEN];
1792 
1793 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1794 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1795 		    " disabled", ibuf,
1796 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1797 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1798 	}
1799 	mutex_enter(&ill->ill_lock);
1800 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1801 	ipif->ipif_flags |= IPIF_DUPLICATE;
1802 	ill->ill_ipif_dup_count++;
1803 	mutex_exit(&ill->ill_lock);
1804 	(void) ipif_down(ipif, NULL, NULL);
1805 	ipif_down_tail(ipif);
1806 	mutex_enter(&ill->ill_lock);
1807 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1808 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1809 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1810 	    ipst->ips_ip_dup_recovery > 0) {
1811 		ASSERT(ipif->ipif_recovery_id == 0);
1812 		ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1813 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1814 	}
1815 	mutex_exit(&ill->ill_lock);
1816 	ipif_refrele(ipif);
1817 ignore_conflict:
1818 	if (dl_mp != NULL)
1819 		freeb(dl_mp);
1820 	freemsg(mp);
1821 }
1822 
1823 /*
1824  * Handle failure by tearing down the ipifs with the specified address.  Note
1825  * that tearing down the ipif also means deleting the nce through ipif_down, so
1826  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1827  * we start a timer on the ipif.
1828  */
1829 static void
1830 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1831 {
1832 	if ((mp = copymsg(mp)) != NULL) {
1833 		if (dl_mp == NULL)
1834 			dl_mp = mp;
1835 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1836 			dl_mp->b_cont = mp;
1837 		if (dl_mp == NULL) {
1838 			freemsg(mp);
1839 		} else {
1840 			ill_refhold(ill);
1841 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1842 			    B_FALSE);
1843 		}
1844 	}
1845 }
1846 
1847 /*
1848  * Handle a discovered conflict: some other system is advertising that it owns
1849  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1850  * interface.
1851  */
1852 static void
1853 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1854 {
1855 	ipif_t *ipif;
1856 	uint32_t now;
1857 	uint_t maxdefense;
1858 	uint_t defs;
1859 	ip_stack_t *ipst = ill->ill_ipst;
1860 
1861 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1862 	    NULL, NULL, ipst);
1863 	if (ipif == NULL)
1864 		return;
1865 
1866 	/*
1867 	 * First, figure out if this address is disposable.
1868 	 */
1869 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1870 		maxdefense = ipst->ips_ip_max_temp_defend;
1871 	else
1872 		maxdefense = ipst->ips_ip_max_defend;
1873 
1874 	/*
1875 	 * Now figure out how many times we've defended ourselves.  Ignore
1876 	 * defenses that happened long in the past.
1877 	 */
1878 	now = gethrestime_sec();
1879 	mutex_enter(&nce->nce_lock);
1880 	if ((defs = nce->nce_defense_count) > 0 &&
1881 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1882 		nce->nce_defense_count = defs = 0;
1883 	}
1884 	nce->nce_defense_count++;
1885 	nce->nce_defense_time = now;
1886 	mutex_exit(&nce->nce_lock);
1887 	ipif_refrele(ipif);
1888 
1889 	/*
1890 	 * If we've defended ourselves too many times already, then give up and
1891 	 * tear down the interface(s) using this address.  Otherwise, defend by
1892 	 * sending out an unsolicited Neighbor Advertisement.
1893 	 */
1894 	if (defs >= maxdefense) {
1895 		ip_ndp_failure(ill, mp, dl_mp);
1896 	} else {
1897 		char hbuf[MAC_STR_LEN];
1898 		char sbuf[INET6_ADDRSTRLEN];
1899 		uchar_t *haddr;
1900 		uint_t haddrlen;
1901 		in6_addr_t targ;
1902 
1903 		ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1904 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1905 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)),
1906 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1907 		    ill->ill_name);
1908 
1909 		(void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0);
1910 	}
1911 }
1912 
1913 static void
1914 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1915 {
1916 	nd_neighbor_solicit_t *ns;
1917 	uint32_t	hlen = ill->ill_nd_lla_len;
1918 	uchar_t		*haddr = NULL;
1919 	icmp6_t		*icmp_nd;
1920 	ip6_t		*ip6h;
1921 	nce_t		*our_nce = NULL;
1922 	in6_addr_t	target;
1923 	in6_addr_t	src;
1924 	int		len;
1925 	int		flag = 0;
1926 	nd_opt_hdr_t	*opt = NULL;
1927 	boolean_t	bad_solicit = B_FALSE;
1928 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1929 
1930 	ip6h = (ip6_t *)mp->b_rptr;
1931 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1932 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1933 	src = ip6h->ip6_src;
1934 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1935 	target = ns->nd_ns_target;
1936 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1937 		if (ip_debug > 2) {
1938 			/* ip1dbg */
1939 			pr_addr_dbg("ndp_input_solicit: Target is"
1940 			    " multicast! %s\n", AF_INET6, &target);
1941 		}
1942 		bad_solicit = B_TRUE;
1943 		goto done;
1944 	}
1945 	if (len > sizeof (nd_neighbor_solicit_t)) {
1946 		/* Options present */
1947 		opt = (nd_opt_hdr_t *)&ns[1];
1948 		len -= sizeof (nd_neighbor_solicit_t);
1949 		if (!ndp_verify_optlen(opt, len)) {
1950 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1951 			bad_solicit = B_TRUE;
1952 			goto done;
1953 		}
1954 
1955 	}
1956 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1957 		/* Check to see if this is a valid DAD solicitation */
1958 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1959 			if (ip_debug > 2) {
1960 				/* ip1dbg */
1961 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1962 				    "Destination is not solicited node "
1963 				    "multicast %s\n", AF_INET6,
1964 				    &ip6h->ip6_dst);
1965 			}
1966 			bad_solicit = B_TRUE;
1967 			goto done;
1968 		}
1969 	}
1970 
1971 	/*
1972 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1973 	 * received this packet if it's multicast) is not the ill tied to
1974 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1975 	 * to ensure we find the associated NCE.
1976 	 */
1977 	our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE);
1978 	/*
1979 	 * If this is a valid Solicitation, a permanent
1980 	 * entry should exist in the cache
1981 	 */
1982 	if (our_nce == NULL ||
1983 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1984 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1985 		    "ifname=%s ", ill->ill_name));
1986 		if (ip_debug > 2) {
1987 			/* ip1dbg */
1988 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1989 		}
1990 		bad_solicit = B_TRUE;
1991 		goto done;
1992 	}
1993 
1994 	/* At this point we should have a verified NS per spec */
1995 	if (opt != NULL) {
1996 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1997 		if (opt != NULL) {
1998 			haddr = (uchar_t *)&opt[1];
1999 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2000 			    hlen == 0) {
2001 				ip1dbg(("ndp_input_solicit: bad SLLA\n"));
2002 				bad_solicit = B_TRUE;
2003 				goto done;
2004 			}
2005 		}
2006 	}
2007 
2008 	/* If sending directly to peer, set the unicast flag */
2009 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
2010 		flag |= NDP_UNICAST;
2011 
2012 	/*
2013 	 * Create/update the entry for the soliciting node.
2014 	 * or respond to outstanding queries, don't if
2015 	 * the source is unspecified address.
2016 	 */
2017 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
2018 		int	err;
2019 		nce_t	*nnce;
2020 
2021 		ASSERT(ill->ill_isv6);
2022 		/*
2023 		 * Regular solicitations *must* include the Source Link-Layer
2024 		 * Address option.  Ignore messages that do not.
2025 		 */
2026 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
2027 			ip1dbg(("ndp_input_solicit: source link-layer address "
2028 			    "option missing with a specified source.\n"));
2029 			bad_solicit = B_TRUE;
2030 			goto done;
2031 		}
2032 
2033 		/*
2034 		 * This is a regular solicitation.  If we're still in the
2035 		 * process of verifying the address, then don't respond at all
2036 		 * and don't keep track of the sender.
2037 		 */
2038 		if (our_nce->nce_state == ND_PROBE)
2039 			goto done;
2040 
2041 		/*
2042 		 * If the solicitation doesn't have sender hardware address
2043 		 * (legal for unicast solicitation), then process without
2044 		 * installing the return NCE.  Either we already know it, or
2045 		 * we'll be forced to look it up when (and if) we reply to the
2046 		 * packet.
2047 		 */
2048 		if (haddr == NULL)
2049 			goto no_source;
2050 
2051 		err = ndp_lookup_then_add_v6(ill,
2052 		    B_FALSE,
2053 		    haddr,
2054 		    &src,	/* Soliciting nodes address */
2055 		    &ipv6_all_ones,
2056 		    &ipv6_all_zeros,
2057 		    0,
2058 		    0,
2059 		    ND_STALE,
2060 		    &nnce);
2061 		switch (err) {
2062 		case 0:
2063 			/* done with this entry */
2064 			NCE_REFRELE(nnce);
2065 			break;
2066 		case EEXIST:
2067 			/*
2068 			 * B_FALSE indicates this is not an an advertisement.
2069 			 */
2070 			ndp_process(nnce, haddr, 0, B_FALSE);
2071 			NCE_REFRELE(nnce);
2072 			break;
2073 		default:
2074 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2075 			    err));
2076 			goto done;
2077 		}
2078 no_source:
2079 		flag |= NDP_SOLICITED;
2080 	} else {
2081 		/*
2082 		 * No source link layer address option should be present in a
2083 		 * valid DAD request.
2084 		 */
2085 		if (haddr != NULL) {
2086 			ip1dbg(("ndp_input_solicit: source link-layer address "
2087 			    "option present with an unspecified source.\n"));
2088 			bad_solicit = B_TRUE;
2089 			goto done;
2090 		}
2091 		if (our_nce->nce_state == ND_PROBE) {
2092 			/*
2093 			 * Internally looped-back probes won't have DLPI
2094 			 * attached to them.  External ones (which are sent by
2095 			 * multicast) always will.  Just ignore our own
2096 			 * transmissions.
2097 			 */
2098 			if (dl_mp != NULL) {
2099 				/*
2100 				 * If someone else is probing our address, then
2101 				 * we've crossed wires.  Declare failure.
2102 				 */
2103 				ip_ndp_failure(ill, mp, dl_mp);
2104 			}
2105 			goto done;
2106 		}
2107 		/*
2108 		 * This is a DAD probe.  Multicast the advertisement to the
2109 		 * all-nodes address.
2110 		 */
2111 		src = ipv6_all_hosts_mcast;
2112 	}
2113 	/* Response to a solicitation */
2114 	(void) nce_xmit_advert(our_nce, B_TRUE, &src, flag);
2115 done:
2116 	if (bad_solicit)
2117 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2118 	if (our_nce != NULL)
2119 		NCE_REFRELE(our_nce);
2120 }
2121 
2122 void
2123 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2124 {
2125 	nd_neighbor_advert_t *na;
2126 	uint32_t	hlen = ill->ill_nd_lla_len;
2127 	uchar_t		*haddr = NULL;
2128 	icmp6_t		*icmp_nd;
2129 	ip6_t		*ip6h;
2130 	nce_t		*dst_nce = NULL;
2131 	in6_addr_t	target;
2132 	nd_opt_hdr_t	*opt = NULL;
2133 	int		len;
2134 	ip_stack_t	*ipst = ill->ill_ipst;
2135 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2136 
2137 	ip6h = (ip6_t *)mp->b_rptr;
2138 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2139 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2140 	na = (nd_neighbor_advert_t *)icmp_nd;
2141 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2142 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2143 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2144 		    "solicited flag is not zero\n"));
2145 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2146 		return;
2147 	}
2148 	target = na->nd_na_target;
2149 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2150 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2151 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2152 		return;
2153 	}
2154 	if (len > sizeof (nd_neighbor_advert_t)) {
2155 		opt = (nd_opt_hdr_t *)&na[1];
2156 		if (!ndp_verify_optlen(opt,
2157 		    len - sizeof (nd_neighbor_advert_t))) {
2158 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2159 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2160 			return;
2161 		}
2162 		/* At this point we have a verified NA per spec */
2163 		len -= sizeof (nd_neighbor_advert_t);
2164 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2165 		if (opt != NULL) {
2166 			haddr = (uchar_t *)&opt[1];
2167 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2168 			    hlen == 0) {
2169 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2170 				BUMP_MIB(mib,
2171 				    ipv6IfIcmpInBadNeighborAdvertisements);
2172 				return;
2173 			}
2174 		}
2175 	}
2176 
2177 	/*
2178 	 * NOTE: we match across the illgrp since we need to do DAD for all of
2179 	 * our local addresses, and those are spread across all the active
2180 	 * ills in the group.
2181 	 */
2182 	if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL)
2183 		return;
2184 
2185 	if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2186 		/*
2187 		 * Someone just advertised one of our local addresses.	First,
2188 		 * check it it was us -- if so, we can safely ignore it.
2189 		 */
2190 		if (haddr != NULL) {
2191 			if (!nce_cmp_ll_addr(dst_nce, haddr, hlen))
2192 				goto out;	/* from us -- no conflict */
2193 
2194 			/*
2195 			 * If we're in an IPMP group, check if this is an echo
2196 			 * from another ill in the group.  Use the double-
2197 			 * checked locking pattern to avoid grabbing
2198 			 * ill_g_lock in the non-IPMP case.
2199 			 */
2200 			if (IS_UNDER_IPMP(ill)) {
2201 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2202 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2203 				    ill->ill_grp, haddr, hlen) != NULL) {
2204 					rw_exit(&ipst->ips_ill_g_lock);
2205 					goto out;
2206 				}
2207 				rw_exit(&ipst->ips_ill_g_lock);
2208 			}
2209 		}
2210 
2211 		/*
2212 		 * Our own (looped-back) unsolicited neighbor advertisements
2213 		 * will get here with dl_mp == NULL.  (These will usually be
2214 		 * filtered by the `haddr' checks above, but point-to-point
2215 		 * links have no hardware address and thus make it here.)
2216 		 */
2217 		if (dl_mp == NULL && dst_nce->nce_state != ND_PROBE)
2218 			goto out;
2219 
2220 		/*
2221 		 * This appears to be a real conflict.  If we're trying to
2222 		 * configure this NCE (ND_PROBE), then shut it down.
2223 		 * Otherwise, handle the discovered conflict.
2224 		 *
2225 		 * In the ND_PROBE case, dl_mp might be NULL if we're getting
2226 		 * a unicast reply.  This isn't typically done (multicast is
2227 		 * the norm in response to a probe), but we can handle it.
2228 		 */
2229 		if (dst_nce->nce_state == ND_PROBE)
2230 			ip_ndp_failure(ill, mp, dl_mp);
2231 		else
2232 			ip_ndp_conflict(ill, mp, dl_mp, dst_nce);
2233 	} else {
2234 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2235 			dst_nce->nce_flags |= NCE_F_ISROUTER;
2236 
2237 		/* B_TRUE indicates this an advertisement */
2238 		ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE);
2239 	}
2240 out:
2241 	NCE_REFRELE(dst_nce);
2242 }
2243 
2244 /*
2245  * Process NDP neighbor solicitation/advertisement messages.
2246  * The checksum has already checked o.k before reaching here.
2247  */
2248 void
2249 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2250 {
2251 	icmp6_t		*icmp_nd;
2252 	ip6_t		*ip6h;
2253 	int		len;
2254 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2255 
2256 
2257 	if (!pullupmsg(mp, -1)) {
2258 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2259 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2260 		goto done;
2261 	}
2262 	ip6h = (ip6_t *)mp->b_rptr;
2263 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2264 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2265 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2266 		goto done;
2267 	}
2268 	/*
2269 	 * NDP does not accept any extension headers between the
2270 	 * IP header and the ICMP header since e.g. a routing
2271 	 * header could be dangerous.
2272 	 * This assumes that any AH or ESP headers are removed
2273 	 * by ip prior to passing the packet to ndp_input.
2274 	 */
2275 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2276 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2277 		    ip6h->ip6_nxt));
2278 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2279 		goto done;
2280 	}
2281 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2282 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2283 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2284 	if (icmp_nd->icmp6_code != 0) {
2285 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2286 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2287 		goto done;
2288 	}
2289 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2290 	/*
2291 	 * Make sure packet length is large enough for either
2292 	 * a NS or a NA icmp packet.
2293 	 */
2294 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2295 		ip1dbg(("ndp_input: packet too short\n"));
2296 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2297 		goto done;
2298 	}
2299 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2300 		ndp_input_solicit(ill, mp, dl_mp);
2301 	} else {
2302 		ndp_input_advert(ill, mp, dl_mp);
2303 	}
2304 done:
2305 	freemsg(mp);
2306 }
2307 
2308 /*
2309  * Utility routine to send an advertisement.  Assumes that the NCE cannot
2310  * go away (e.g., because it's refheld).
2311  */
2312 static boolean_t
2313 nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target,
2314     uint_t flags)
2315 {
2316 	ASSERT((flags & NDP_PROBE) == 0);
2317 
2318 	if (nce->nce_flags & NCE_F_ISROUTER)
2319 		flags |= NDP_ISROUTER;
2320 	if (!(nce->nce_flags & NCE_F_ANYCAST))
2321 		flags |= NDP_ORIDE;
2322 
2323 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla,
2324 	    &nce->nce_addr, target, flags));
2325 }
2326 
2327 /*
2328  * Utility routine to send a solicitation.  Assumes that the NCE cannot
2329  * go away (e.g., because it's refheld).
2330  */
2331 static boolean_t
2332 nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
2333     uint_t flags)
2334 {
2335 	if (flags & NDP_PROBE)
2336 		sender = &ipv6_all_zeros;
2337 
2338 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla,
2339 	    sender, &nce->nce_addr, flags));
2340 }
2341 
/*
 * nce_xmit is called to form and transmit a ND solicitation or
 * advertisement ICMP packet.
 *
 * If the source address is unspecified and this isn't a probe (used for
 * duplicate address detection), an appropriate source address and link layer
 * address will be chosen here.  The link layer address option is included if
 * the source is specified (i.e., all non-probe packets), and omitted (per the
 * specification) otherwise.
 *
 *	ill		ill whose ill_wq the packet is sent on; must not be
 *			an IPMP ill.
 *	type		ND_NEIGHBOR_SOLICIT or ND_NEIGHBOR_ADVERT.
 *	use_nd_lla	if true the option carries ill_nd_lla, otherwise
 *			ill_phys_addr.
 *	sender		IPv6 source address (also the NA target for adverts).
 *	target		IPv6 destination address (also the NS target for
 *			solicitations).
 *	flag		NDP_PROBE, NDP_UNICAST, NDP_ISROUTER, NDP_SOLICITED
 *			and NDP_ORIDE are examined here.
 *
 * The packet is built as an ip6i_t, followed by the IPv6 header, the ICMPv6
 * NS/NA and (optionally) one link-layer address option.
 *
 * It returns B_FALSE only if the packet was handed to ip_output_v6() for
 * the corresponding ill's ill_wq; otherwise (no usable source address,
 * allocation failure, no bound ill) it returns B_TRUE and nothing is sent.
 */
static boolean_t
nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
    const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
	ill_t		*hwaddr_ill;
	uint32_t	len;
	icmp6_t 	*icmp6;
	mblk_t		*mp;
	ip6_t		*ip6h;
	nd_opt_hdr_t	*opt;
	uint_t		plen, maxplen;
	ip6i_t		*ip6i;
	ipif_t		*src_ipif = NULL;
	uint8_t		*hw_addr;
	zoneid_t	zoneid = GLOBAL_ZONEID;
	char		buf[INET6_ADDRSTRLEN];

	ASSERT(!IS_IPMP(ill));

	/*
	 * Check that the sender is actually a usable address on `ill', and if
	 * so, track that as the src_ipif.  If not, for solicitations, set the
	 * sender to :: so that a new one will be picked below; for adverts,
	 * drop the packet since we expect nce_xmit_advert() to always provide
	 * a valid sender.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(sender)) {
		if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL ||
		    !src_ipif->ipif_addr_ready) {
			/* Found, but not ready: give the hold back. */
			if (src_ipif != NULL) {
				ipif_refrele(src_ipif);
				src_ipif = NULL;
			}
			if (type == ND_NEIGHBOR_ADVERT) {
				ip1dbg(("nce_xmit: No source ipif for src %s\n",
				    inet_ntop(AF_INET6, sender, buf,
				    sizeof (buf))));
				return (B_TRUE);
			}
			sender = &ipv6_all_zeros;
		}
	}

	/*
	 * If we still have an unspecified source (sender) address and this
	 * isn't a probe, select a source address from `ill'.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
		ASSERT(type != ND_NEIGHBOR_ADVERT);
		/*
		 * Pick a source address for this solicitation, but restrict
		 * the selection to addresses assigned to the output
		 * interface.  We do this because the destination will create
		 * a neighbor cache entry for the source address of this
		 * packet, so the source address needs to be a valid neighbor.
		 */
		src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
		if (src_ipif == NULL) {
			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
			    inet_ntop(AF_INET6, target, buf, sizeof (buf))));
			return (B_TRUE);
		}
		sender = &src_ipif->ipif_v6src_addr;
	}

	/*
	 * We're either sending a probe or we have a source address.
	 */
	ASSERT((flag & NDP_PROBE) || src_ipif != NULL);

	/*
	 * Size the buffer for the largest possible link-layer address
	 * option; the real length is fixed up once the option size (plen)
	 * is known.
	 */
	maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8);
	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
	    maxplen;
	mp = allocb(len,  BPRI_LO);
	if (mp == NULL) {
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);
		return (B_TRUE);
	}
	bzero((char *)mp->b_rptr, len);
	mp->b_wptr = mp->b_rptr + len;

	/* Leading ip6i_t, marked by a next-header value of IPPROTO_RAW. */
	ip6i = (ip6i_t *)mp->b_rptr;
	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6i->ip6i_nxt = IPPROTO_RAW;
	ip6i->ip6i_flags = IP6I_HOPLIMIT;
	if (flag & NDP_PROBE)
		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;

	/* The real IPv6 header follows the ip6i_t. */
	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
	ip6h->ip6_nxt = IPPROTO_ICMPV6;
	ip6h->ip6_hops = IPV6_MAX_HOPS;
	ip6h->ip6_src = *sender;
	ip6h->ip6_dst = *target;
	icmp6 = (icmp6_t *)&ip6h[1];

	/*
	 * The option goes right after the fixed part of the ND message.  The
	 * same offset (sizeof (nd_neighbor_advert_t)) is used for both NS
	 * and NA.
	 */
	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
	    sizeof (nd_neighbor_advert_t));

	if (type == ND_NEIGHBOR_SOLICIT) {
		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;

		if (!(flag & NDP_PROBE))
			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
		ns->nd_ns_target = *target;
		if (!(flag & NDP_UNICAST)) {
			/* Form multicast address of the target */
			ip6h->ip6_dst = ipv6_solicited_node_mcast;
			ip6h->ip6_dst.s6_addr32[3] |=
			    ns->nd_ns_target.s6_addr32[3];
		}
	} else {
		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;

		ASSERT(!(flag & NDP_PROBE));
		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		/* For an advertisement the advertised address is `sender'. */
		na->nd_na_target = *sender;
		if (flag & NDP_ISROUTER)
			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
		if (flag & NDP_SOLICITED)
			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
		if (flag & NDP_ORIDE)
			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
	}

	hw_addr = NULL;
	if (!(flag & NDP_PROBE)) {
		/*
		 * Use our source address to find the hardware address to put
		 * in the packet, so that the hardware address and IP address
		 * will match up -- even if that hardware address doesn't
		 * match the ill we actually transmit the packet through.
		 */
		if (IS_IPMP(src_ipif->ipif_ill)) {
			hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif);
			if (hwaddr_ill == NULL) {
				ip1dbg(("nce_xmit: no bound ill!\n"));
				ipif_refrele(src_ipif);
				freemsg(mp);
				return (B_TRUE);
			}
		} else {
			hwaddr_ill = src_ipif->ipif_ill;
			ill_refhold(hwaddr_ill);	/* for symmetry */
		}

		plen = roundup(sizeof (nd_opt_hdr_t) +
		    hwaddr_ill->ill_nd_lla_len, 8);

		/*
		 * NOTE(review): ill_nd_lla_len bytes are copied from
		 * whichever address is chosen, and the buffer only has room
		 * for ND_MAX_HDW_LEN bytes of address; this presumably relies
		 * on both addresses being at least ill_nd_lla_len long and on
		 * ill_nd_lla_len <= ND_MAX_HDW_LEN -- confirm.
		 */
		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
		    hwaddr_ill->ill_phys_addr;
		if (hw_addr != NULL) {
			/* Fill in link layer address and option len */
			opt->nd_opt_len = (uint8_t)(plen / 8);
			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
		}

		ill_refrele(hwaddr_ill);
	}

	/* No link-layer address (or a probe): send no option at all. */
	if (hw_addr == NULL)
		plen = 0;

	/* Fix up the length of the packet now that plen is known */
	len -= (maxplen - plen);
	mp->b_wptr = mp->b_rptr + len;
	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));

	icmp6->icmp6_type = type;
	icmp6->icmp6_code = 0;
	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_wput_v6.
	 */
	icmp6->icmp6_cksum = ip6h->ip6_plen;

	/*
	 * Before we toss the src_ipif, look up the zoneid to pass to
	 * ip_output_v6().  This is to ensure unicast ND_NEIGHBOR_ADVERT
	 * packets to be routed correctly by IP (we cannot guarantee that the
	 * global zone has an interface route to the destination).
	 */
	if (src_ipif != NULL) {
		if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES)
			zoneid = GLOBAL_ZONEID;
		ipif_refrele(src_ipif);
	}

	/* The zoneid travels in the first (pointer-typed) argument. */
	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
	return (B_FALSE);
}
2549 
2550 /*
2551  * Make a link layer address (does not include the SAP) from an nce.
2552  * To form the link layer address, use the last four bytes of ipv6
2553  * address passed in and the fixed offset stored in nce.
2554  */
2555 static void
2556 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2557 {
2558 	uchar_t *mask, *to;
2559 	ill_t	*ill = nce->nce_ill;
2560 	int 	len;
2561 
2562 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2563 		return;
2564 	ASSERT(nce->nce_res_mp != NULL);
2565 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2566 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2567 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2568 	ASSERT(addr != NULL);
2569 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2570 	    addrpos, ill->ill_nd_lla_len);
2571 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2572 	    IPV6_ADDR_LEN);
2573 	mask = (uchar_t *)&nce->nce_extract_mask;
2574 	mask += (IPV6_ADDR_LEN - len);
2575 	addr += (IPV6_ADDR_LEN - len);
2576 	to = addrpos + nce->nce_ll_extract_start;
2577 	while (len-- > 0)
2578 		*to++ |= *mask++ & *addr++;
2579 }
2580 
2581 mblk_t *
2582 nce_udreq_alloc(ill_t *ill)
2583 {
2584 	mblk_t	*template_mp = NULL;
2585 	dl_unitdata_req_t *dlur;
2586 	int	sap_length;
2587 
2588 	ASSERT(ill->ill_isv6);
2589 
2590 	sap_length = ill->ill_sap_length;
2591 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2592 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2593 	if (template_mp == NULL)
2594 		return (NULL);
2595 
2596 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2597 	dlur->dl_priority.dl_min = 0;
2598 	dlur->dl_priority.dl_max = 0;
2599 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2600 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2601 
2602 	/* Copy in the SAP value. */
2603 	NCE_LL_SAP_COPY(ill, template_mp);
2604 
2605 	return (template_mp);
2606 }
2607 
/*
 * NDP retransmit timer.
 * This timer goes off when:
 * a. It is time to retransmit NS for resolver.
 * b. It is time to send reachability probes.
 *
 * `arg' is the nce the timer was armed for.  A reference is taken on entry
 * and released on every path out.  What happens depends on nce_state:
 *
 *   ND_DELAY		move to ND_PROBE, send a unicast solicitation and
 *			restart the timer.
 *   ND_PROBE		count down nce_pcnt; depending on the result either
 *			solicit again, delete the entry, wait one more
 *			interval, or (for our own addresses) declare the
 *			address usable and start advertising it.
 *   ND_INCOMPLETE	discard queued IPMP probe packets and call
 *			ip_ndp_resolve().
 *   ND_REACHABLE	send unsolicited/defensive advertisements if any are
 *			due and restart the timer.
 *   anything else	no action.
 */
void
ndp_timer(void *arg)
{
	nce_t		*nce = arg;
	ill_t		*ill = nce->nce_ill;
	char		addrbuf[INET6_ADDRSTRLEN];
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * The timer has to be cancelled by ndp_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the nce
	 *
	 * NOTE(review): nce has already been dereferenced by the initializers
	 * above, so the assertion below cannot catch a NULL argument.
	 */
	ASSERT(nce != NULL);

	mutex_enter(&nce->nce_lock);
	NCE_REFHOLD_LOCKED(nce);
	nce->nce_timeout_id = 0;

	/*
	 * Check the reachability state first.
	 */
	switch (nce->nce_state) {
	case ND_DELAY:
		nce->nce_state = ND_PROBE;
		mutex_exit(&nce->nce_lock);
		(void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros,
		    NDP_UNICAST);
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("ndp_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &nce->nce_addr);
		}
		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
		NCE_REFRELE(nce);
		return;
	case ND_PROBE:
		/* must be retransmit timer */
		nce->nce_pcnt--;
		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
		    nce->nce_pcnt >= -1);
		if (nce->nce_pcnt > 0) {
			/*
			 * As per RFC2461, the nce gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
			    addrbuf, sizeof (addrbuf))));
			mutex_exit(&nce->nce_lock);
			/*
			 * Our own (permanent) addresses are probed with a
			 * DAD probe; everything else gets a unicast NS.
			 */
			dropped = nce_xmit_solicit(nce, B_FALSE,
			    &ipv6_all_zeros,
			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
			    NDP_UNICAST);
			if (dropped) {
				/* Nothing was sent; don't count this try. */
				mutex_enter(&nce->nce_lock);
				nce->nce_pcnt++;
				mutex_exit(&nce->nce_lock);
			}
			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
		} else if (nce->nce_pcnt < 0) {
			/* No hope, delete the nce */
			nce->nce_state = ND_UNREACHABLE;
			mutex_exit(&nce->nce_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("ndp_timer: Delete IRE for"
				    " dst %s\n", AF_INET6, &nce->nce_addr);
			}
			ndp_delete(nce);
		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
			/* Wait RetransTimer, before deleting the entry */
			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
			    nce->nce_pcnt, inet_ntop(AF_INET6,
			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
			mutex_exit(&nce->nce_lock);
			/* Wait one interval before killing */
			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable.  Let IP know that it's ok to
			 * use.
			 */
			nce->nce_state = ND_REACHABLE;
			mutex_exit(&nce->nce_lock);
			ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr,
			    nce->nce_ill);
			if (ipif != NULL) {
				/* Report recovery from an earlier duplicate. */
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ + 10];
					char sbuf[INET6_ADDRSTRLEN];

					ipif->ipif_was_dup = B_FALSE;
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					ipif_get_name(ipif, ibuf,
					    sizeof (ibuf));
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				/* Notify only on the not-ready -> ready edge. */
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready)
					ipif_up_notify(ipif);
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			/* Begin defending our new address */
			nce->nce_unsolicit_count = 0;
			dropped = nce_xmit_advert(nce, B_FALSE,
			    &ipv6_all_hosts_mcast, 0);
			if (dropped) {
				/* Try the advertisement again later. */
				nce->nce_unsolicit_count = 1;
				NDP_RESTART_TIMER(nce,
				    ipst->ips_ip_ndp_unsolicit_interval);
			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
				NDP_RESTART_TIMER(nce,
				    ipst->ips_ip_ndp_defense_interval);
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down.  Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			nce->nce_state = ND_REACHABLE;
			mutex_exit(&nce->nce_lock);
		}
		NCE_REFRELE(nce);
		return;
	case ND_INCOMPLETE: {
		ip6_t	*ip6h;
		ip6i_t	*ip6i;
		mblk_t	*mp, *datamp, *nextmp, **prevmpp;

		/*
		 * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp
		 * for any IPMP probe packets, and toss 'em.  IPMP probe
		 * packets will always be at the head of nce_qd_mp and always
		 * have an ip6i_t header, so we can stop at the first queued
		 * ND packet without an ip6i_t.
		 */
		prevmpp = &nce->nce_qd_mp;
		for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) {
			nextmp = mp->b_next;
			/* Skip a leading M_CTL to reach the data block. */
			datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp;
			ip6h = (ip6_t *)datamp->b_rptr;
			/* IPPROTO_RAW as next header marks an ip6i_t. */
			if (ip6h->ip6_nxt != IPPROTO_RAW)
				break;

			ip6i = (ip6i_t *)ip6h;
			if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) {
				/* Free it and unlink it from the queue. */
				inet_freemsg(mp);
				*prevmpp = nextmp;
			} else {
				prevmpp = &mp->b_next;
			}
		}
		/* Note: called with nce_lock still held. */
		ip_ndp_resolve(nce);
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	}
	case ND_REACHABLE:
		/*
		 * Advertise if unsolicited advertisements are still owed, or
		 * if this is one of our own addresses and periodic defense
		 * is enabled.
		 */
		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
		    nce->nce_unsolicit_count != 0) ||
		    ((nce->nce_flags & NCE_F_PERMANENT) &&
		    ipst->ips_ip_ndp_defense_interval != 0)) {
			if (nce->nce_unsolicit_count > 0)
				nce->nce_unsolicit_count--;
			mutex_exit(&nce->nce_lock);
			dropped = nce_xmit_advert(nce, B_FALSE,
			    &ipv6_all_hosts_mcast, 0);
			if (dropped) {
				/* Nothing was sent; put the count back. */
				mutex_enter(&nce->nce_lock);
				nce->nce_unsolicit_count++;
				mutex_exit(&nce->nce_lock);
			}
			if (nce->nce_unsolicit_count != 0) {
				NDP_RESTART_TIMER(nce,
				    ipst->ips_ip_ndp_unsolicit_interval);
			} else {
				NDP_RESTART_TIMER(nce,
				    ipst->ips_ip_ndp_defense_interval);
			}
		} else {
			mutex_exit(&nce->nce_lock);
		}
		NCE_REFRELE(nce);
		break;
	default:
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	}
}
2818 
2819 /*
2820  * Set a link layer address from the ll_addr passed in.
2821  * Copy SAP from ill.
2822  */
2823 static void
2824 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2825 {
2826 	ill_t	*ill = nce->nce_ill;
2827 	uchar_t	*woffset;
2828 
2829 	ASSERT(ll_addr != NULL);
2830 	/* Always called before fast_path_probe */
2831 	ASSERT(nce->nce_fp_mp == NULL);
2832 	if (ill->ill_sap_length != 0) {
2833 		/*
2834 		 * Copy the SAP type specified in the
2835 		 * request into the xmit template.
2836 		 */
2837 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2838 	}
2839 	if (ill->ill_phys_addr_length > 0) {
2840 		/*
2841 		 * The bcopy() below used to be called for the physical address
2842 		 * length rather than the link layer address length. For
2843 		 * ethernet and many other media, the phys_addr and lla are
2844 		 * identical.
2845 		 * However, with xresolv interfaces being introduced, the
2846 		 * phys_addr and lla are no longer the same, and the physical
2847 		 * address may not have any useful meaning, so we use the lla
2848 		 * for IPv6 address resolution and destination addressing.
2849 		 *
2850 		 * For PPP or other interfaces with a zero length
2851 		 * physical address, don't do anything here.
2852 		 * The bcopy() with a zero phys_addr length was previously
2853 		 * a no-op for interfaces with a zero-length physical address.
2854 		 * Using the lla for them would change the way they operate.
2855 		 * Doing nothing in such cases preserves expected behavior.
2856 		 */
2857 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2858 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2859 	}
2860 }
2861 
2862 static boolean_t
2863 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2864 {
2865 	ill_t	*ill = nce->nce_ill;
2866 	uchar_t	*ll_offset;
2867 
2868 	ASSERT(nce->nce_res_mp != NULL);
2869 	if (ll_addr == NULL)
2870 		return (B_FALSE);
2871 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2872 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2873 		return (B_TRUE);
2874 	return (B_FALSE);
2875 }
2876 
2877 /*
2878  * Updates the link layer address or the reachability state of
2879  * a cache entry.  Reset probe counter if needed.
2880  */
2881 static void
2882 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2883 {
2884 	ill_t	*ill = nce->nce_ill;
2885 	boolean_t need_stop_timer = B_FALSE;
2886 	boolean_t need_fastpath_update = B_FALSE;
2887 
2888 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2889 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2890 	/*
2891 	 * If this interface does not do NUD, there is no point
2892 	 * in allowing an update to the cache entry.  Although
2893 	 * we will respond to NS.
2894 	 * The only time we accept an update for a resolver when
2895 	 * NUD is turned off is when it has just been created.
2896 	 * Non-Resolvers will always be created as REACHABLE.
2897 	 */
2898 	if (new_state != ND_UNCHANGED) {
2899 		if ((nce->nce_flags & NCE_F_NONUD) &&
2900 		    (nce->nce_state != ND_INCOMPLETE))
2901 			return;
2902 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2903 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2904 		need_stop_timer = B_TRUE;
2905 		if (new_state == ND_REACHABLE)
2906 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2907 		else {
2908 			/* We force NUD in this case */
2909 			nce->nce_last = 0;
2910 		}
2911 		nce->nce_state = new_state;
2912 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2913 	}
2914 	/*
2915 	 * In case of fast path we need to free the the fastpath
2916 	 * M_DATA and do another probe.  Otherwise we can just
2917 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2918 	 * whatever packets that happens to be transmitting at the time.
2919 	 */
2920 	if (new_ll_addr != NULL) {
2921 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2922 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2923 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2924 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2925 		if (nce->nce_fp_mp != NULL) {
2926 			freemsg(nce->nce_fp_mp);
2927 			nce->nce_fp_mp = NULL;
2928 		}
2929 		need_fastpath_update = B_TRUE;
2930 	}
2931 	mutex_exit(&nce->nce_lock);
2932 	if (need_stop_timer) {
2933 		(void) untimeout(nce->nce_timeout_id);
2934 		nce->nce_timeout_id = 0;
2935 	}
2936 	if (need_fastpath_update)
2937 		nce_fastpath(nce);
2938 	mutex_enter(&nce->nce_lock);
2939 }
2940 
2941 void
2942 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2943 {
2944 	uint_t	count = 0;
2945 	mblk_t  **mpp, *tmp;
2946 
2947 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2948 
2949 	for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2950 		if (++count > nce->nce_ill->ill_max_buf) {
2951 			tmp = nce->nce_qd_mp->b_next;
2952 			nce->nce_qd_mp->b_next = NULL;
2953 			nce->nce_qd_mp->b_prev = NULL;
2954 			freemsg(nce->nce_qd_mp);
2955 			nce->nce_qd_mp = tmp;
2956 		}
2957 	}
2958 
2959 	if (head_insert) {
2960 		mp->b_next = nce->nce_qd_mp;
2961 		nce->nce_qd_mp = mp;
2962 	} else {
2963 		*mpp = mp;
2964 	}
2965 }
2966 
2967 static void
2968 nce_queue_mp(nce_t *nce, mblk_t *mp)
2969 {
2970 	boolean_t head_insert = B_FALSE;
2971 	ip6_t	*ip6h;
2972 	ip6i_t  *ip6i;
2973 	mblk_t	*data_mp;
2974 
2975 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2976 
2977 	if (mp->b_datap->db_type == M_CTL)
2978 		data_mp = mp->b_cont;
2979 	else
2980 		data_mp = mp;
2981 	ip6h = (ip6_t *)data_mp->b_rptr;
2982 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2983 		/*
2984 		 * This message should have been pulled up already in
2985 		 * ip_wput_v6. We can't do pullups here because the message
2986 		 * could be from the nce_qd_mp which could have b_next/b_prev
2987 		 * non-NULL.
2988 		 */
2989 		ip6i = (ip6i_t *)ip6h;
2990 		ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
2991 
2992 		/*
2993 		 * If this packet is marked IP6I_IPMP_PROBE, then we need to:
2994 		 *
2995 		 *   1. Insert it at the head of the nce_qd_mp list.  Consider
2996 		 *	the normal (non-probe) load-speading case where the
2997 		 *	source address of the ND packet is not tied to nce_ill.
2998 		 *	If the ill bound to the source address cannot receive,
2999 		 *	the response to the ND packet will not be received.
3000 		 *	However, if ND packets for nce_ill's probes are queued
3001 		 *	behind that ND packet, those probes will also fail to
3002 		 *	be sent, and thus in.mpathd will erroneously conclude
3003 		 *	that nce_ill has also failed.
3004 		 *
3005 		 *   2. Drop the probe packet in ndp_timer() if the ND did
3006 		 *	not succeed on the first attempt.  This ensures that
3007 		 *	ND problems do not manifest as probe RTT spikes.
3008 		 */
3009 		if (ip6i->ip6i_flags & IP6I_IPMP_PROBE)
3010 			head_insert = B_TRUE;
3011 	}
3012 	nce_queue_mp_common(nce, mp, head_insert);
3013 }
3014 
3015 /*
3016  * Called when address resolution failed due to a timeout.
3017  * Send an ICMP unreachable in response to all queued packets.
3018  */
3019 void
3020 nce_resolv_failed(nce_t *nce)
3021 {
3022 	mblk_t	*mp, *nxt_mp, *first_mp;
3023 	char	buf[INET6_ADDRSTRLEN];
3024 	ip6_t *ip6h;
3025 	zoneid_t zoneid = GLOBAL_ZONEID;
3026 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
3027 
3028 	ip1dbg(("nce_resolv_failed: dst %s\n",
3029 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3030 	mutex_enter(&nce->nce_lock);
3031 	mp = nce->nce_qd_mp;
3032 	nce->nce_qd_mp = NULL;
3033 	mutex_exit(&nce->nce_lock);
3034 	while (mp != NULL) {
3035 		nxt_mp = mp->b_next;
3036 		mp->b_next = NULL;
3037 		mp->b_prev = NULL;
3038 
3039 		first_mp = mp;
3040 		if (mp->b_datap->db_type == M_CTL) {
3041 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3042 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3043 			zoneid = io->ipsec_out_zoneid;
3044 			ASSERT(zoneid != ALL_ZONES);
3045 			mp = mp->b_cont;
3046 			mp->b_next = NULL;
3047 			mp->b_prev = NULL;
3048 		}
3049 
3050 		ip6h = (ip6_t *)mp->b_rptr;
3051 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3052 			ip6i_t *ip6i;
3053 			/*
3054 			 * This message should have been pulled up already
3055 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3056 			 * the header is pulled up.
3057 			 */
3058 			ip6i = (ip6i_t *)ip6h;
3059 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3060 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3061 			mp->b_rptr += sizeof (ip6i_t);
3062 		}
3063 		/*
3064 		 * Ignore failure since icmp_unreachable_v6 will silently
3065 		 * drop packets with an unspecified source address.
3066 		 */
3067 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
3068 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3069 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
3070 		mp = nxt_mp;
3071 	}
3072 	nce_cb_dispatch(nce);
3073 }
3074 
3075 /*
3076  * Called by SIOCSNDP* ioctl to add/change an nce entry
3077  * and the corresponding attributes.
3078  * Disallow states other than ND_REACHABLE or ND_STALE.
3079  */
3080 int
3081 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3082 {
3083 	sin6_t		*sin6;
3084 	in6_addr_t	*addr;
3085 	nce_t		*nce;
3086 	int		err;
3087 	uint16_t	new_flags = 0;
3088 	uint16_t	old_flags = 0;
3089 	int		inflags = lnr->lnr_flags;
3090 	ip_stack_t	*ipst = ill->ill_ipst;
3091 
3092 	ASSERT(ill->ill_isv6);
3093 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3094 	    (lnr->lnr_state_create != ND_STALE))
3095 		return (EINVAL);
3096 
3097 	if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN)
3098 		return (EINVAL);
3099 
3100 	sin6 = (sin6_t *)&lnr->lnr_addr;
3101 	addr = &sin6->sin6_addr;
3102 
3103 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3104 	/* We know it can not be mapping so just look in the hash table */
3105 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
3106 	/* See comment in ndp_query() regarding IS_IPMP(ill) usage */
3107 	nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce);
3108 	if (nce != NULL)
3109 		new_flags = nce->nce_flags;
3110 
3111 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3112 	case NDF_ISROUTER_ON:
3113 		new_flags |= NCE_F_ISROUTER;
3114 		break;
3115 	case NDF_ISROUTER_OFF:
3116 		new_flags &= ~NCE_F_ISROUTER;
3117 		break;
3118 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3119 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3120 		if (nce != NULL)
3121 			NCE_REFRELE(nce);
3122 		return (EINVAL);
3123 	}
3124 
3125 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3126 	case NDF_ANYCAST_ON:
3127 		new_flags |= NCE_F_ANYCAST;
3128 		break;
3129 	case NDF_ANYCAST_OFF:
3130 		new_flags &= ~NCE_F_ANYCAST;
3131 		break;
3132 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3133 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3134 		if (nce != NULL)
3135 			NCE_REFRELE(nce);
3136 		return (EINVAL);
3137 	}
3138 
3139 	if (nce == NULL) {
3140 		err = ndp_add_v6(ill,
3141 		    (uchar_t *)lnr->lnr_hdw_addr,
3142 		    addr,
3143 		    &ipv6_all_ones,
3144 		    &ipv6_all_zeros,
3145 		    0,
3146 		    new_flags,
3147 		    lnr->lnr_state_create,
3148 		    &nce);
3149 		if (err != 0) {
3150 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3151 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3152 			return (err);
3153 		}
3154 	}
3155 	old_flags = nce->nce_flags;
3156 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3157 		/*
3158 		 * Router turned to host, delete all ires.
3159 		 * XXX Just delete the entry, but we need to add too.
3160 		 */
3161 		nce->nce_flags &= ~NCE_F_ISROUTER;
3162 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3163 		ndp_delete(nce);
3164 		NCE_REFRELE(nce);
3165 		return (0);
3166 	}
3167 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3168 
3169 	mutex_enter(&nce->nce_lock);
3170 	nce->nce_flags = new_flags;
3171 	mutex_exit(&nce->nce_lock);
3172 	/*
3173 	 * Note that we ignore the state at this point, which
3174 	 * should be either STALE or REACHABLE.  Instead we let
3175 	 * the link layer address passed in to determine the state
3176 	 * much like incoming packets.
3177 	 */
3178 	nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3179 	NCE_REFRELE(nce);
3180 	return (0);
3181 }
3182 
3183 /*
3184  * If the device driver supports it, we make nce_fp_mp to have
3185  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3186  * The caller ensures there is hold on nce for this function.
3187  * Note that since ill_fastpath_probe() copies the mblk there is
3188  * no need for the hold beyond this function.
3189  */
3190 void
3191 nce_fastpath(nce_t *nce)
3192 {
3193 	ill_t	*ill = nce->nce_ill;
3194 	int res;
3195 
3196 	ASSERT(ill != NULL);
3197 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3198 
3199 	if (nce->nce_fp_mp != NULL) {
3200 		/* Already contains fastpath info */
3201 		return;
3202 	}
3203 	if (nce->nce_res_mp != NULL) {
3204 		nce_fastpath_list_add(nce);
3205 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3206 		/*
3207 		 * EAGAIN is an indication of a transient error
3208 		 * i.e. allocation failure etc. leave the nce in the list it
3209 		 * will be updated when another probe happens for another ire
3210 		 * if not it will be taken out of the list when the ire is
3211 		 * deleted.
3212 		 */
3213 
3214 		if (res != 0 && res != EAGAIN)
3215 			nce_fastpath_list_delete(nce);
3216 	}
3217 }
3218 
3219 /*
3220  * Drain the list of nce's waiting for fastpath response.
3221  */
3222 void
3223 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3224     void *arg)
3225 {
3226 
3227 	nce_t *next_nce;
3228 	nce_t *current_nce;
3229 	nce_t *first_nce;
3230 	nce_t *prev_nce = NULL;
3231 
3232 	mutex_enter(&ill->ill_lock);
3233 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3234 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3235 		next_nce = current_nce->nce_fastpath;
3236 		/*
3237 		 * Take it off the list if we're flushing, or if the callback
3238 		 * routine tells us to do so.  Otherwise, leave the nce in the
3239 		 * fastpath list to handle any pending response from the lower
3240 		 * layer.  We can't drain the list when the callback routine
3241 		 * comparison failed, because the response is asynchronous in
3242 		 * nature, and may not arrive in the same order as the list
3243 		 * insertion.
3244 		 */
3245 		if (func == NULL || func(current_nce, arg)) {
3246 			current_nce->nce_fastpath = NULL;
3247 			if (current_nce == first_nce)
3248 				ill->ill_fastpath_list = first_nce = next_nce;
3249 			else
3250 				prev_nce->nce_fastpath = next_nce;
3251 		} else {
3252 			/* previous element that is still in the list */
3253 			prev_nce = current_nce;
3254 		}
3255 		current_nce = next_nce;
3256 	}
3257 	mutex_exit(&ill->ill_lock);
3258 }
3259 
3260 /*
3261  * Add nce to the nce fastpath list.
3262  */
3263 void
3264 nce_fastpath_list_add(nce_t *nce)
3265 {
3266 	ill_t *ill;
3267 
3268 	ill = nce->nce_ill;
3269 
3270 	mutex_enter(&ill->ill_lock);
3271 	mutex_enter(&nce->nce_lock);
3272 
3273 	/*
3274 	 * if nce has not been deleted and
3275 	 * is not already in the list add it.
3276 	 */
3277 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3278 	    (nce->nce_fastpath == NULL)) {
3279 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3280 		ill->ill_fastpath_list = nce;
3281 	}
3282 
3283 	mutex_exit(&nce->nce_lock);
3284 	mutex_exit(&ill->ill_lock);
3285 }
3286 
3287 /*
3288  * remove nce from the nce fastpath list.
3289  */
3290 void
3291 nce_fastpath_list_delete(nce_t *nce)
3292 {
3293 	nce_t *nce_ptr;
3294 
3295 	ill_t *ill;
3296 
3297 	ill = nce->nce_ill;
3298 	ASSERT(ill != NULL);
3299 
3300 	mutex_enter(&ill->ill_lock);
3301 	if (nce->nce_fastpath == NULL)
3302 		goto done;
3303 
3304 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3305 
3306 	if (ill->ill_fastpath_list == nce) {
3307 		ill->ill_fastpath_list = nce->nce_fastpath;
3308 	} else {
3309 		nce_ptr = ill->ill_fastpath_list;
3310 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3311 			if (nce_ptr->nce_fastpath == nce) {
3312 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3313 				break;
3314 			}
3315 			nce_ptr = nce_ptr->nce_fastpath;
3316 		}
3317 	}
3318 
3319 	nce->nce_fastpath = NULL;
3320 done:
3321 	mutex_exit(&ill->ill_lock);
3322 }
3323 
3324 /*
3325  * Update all NCE's that are not in fastpath mode and
3326  * have an nce_fp_mp that matches mp. mp->b_cont contains
3327  * the fastpath header.
3328  *
3329  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3330  */
3331 boolean_t
3332 ndp_fastpath_update(nce_t *nce, void *arg)
3333 {
3334 	mblk_t 	*mp, *fp_mp;
3335 	uchar_t	*mp_rptr, *ud_mp_rptr;
3336 	mblk_t	*ud_mp = nce->nce_res_mp;
3337 	ptrdiff_t	cmplen;
3338 
3339 	if (nce->nce_flags & NCE_F_MAPPING)
3340 		return (B_TRUE);
3341 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3342 		return (B_TRUE);
3343 
3344 	ip2dbg(("ndp_fastpath_update: trying\n"));
3345 	mp = (mblk_t *)arg;
3346 	mp_rptr = mp->b_rptr;
3347 	cmplen = mp->b_wptr - mp_rptr;
3348 	ASSERT(cmplen >= 0);
3349 	ud_mp_rptr = ud_mp->b_rptr;
3350 	/*
3351 	 * The nce is locked here to prevent any other threads
3352 	 * from accessing and changing nce_res_mp when the IPv6 address
3353 	 * becomes resolved to an lla while we're in the middle
3354 	 * of looking at and comparing the hardware address (lla).
3355 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3356 	 * from examining nce_res_mp atthe same time.
3357 	 */
3358 	mutex_enter(&nce->nce_lock);
3359 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3360 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3361 		mutex_exit(&nce->nce_lock);
3362 		/*
3363 		 * Don't take the ire off the fastpath list yet,
3364 		 * since the response may come later.
3365 		 */
3366 		return (B_FALSE);
3367 	}
3368 	/* Matched - install mp as the fastpath mp */
3369 	ip1dbg(("ndp_fastpath_update: match\n"));
3370 	fp_mp = dupb(mp->b_cont);
3371 	if (fp_mp != NULL) {
3372 		nce->nce_fp_mp = fp_mp;
3373 	}
3374 	mutex_exit(&nce->nce_lock);
3375 	return (B_TRUE);
3376 }
3377 
3378 /*
3379  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3380  * driver.  Note that it assumes IP is exclusive...
3381  */
3382 /* ARGSUSED */
3383 void
3384 ndp_fastpath_flush(nce_t *nce, char *arg)
3385 {
3386 	if (nce->nce_flags & NCE_F_MAPPING)
3387 		return;
3388 	/* No fastpath info? */
3389 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3390 		return;
3391 
3392 	if (nce->nce_ipversion == IPV4_VERSION &&
3393 	    nce->nce_flags & NCE_F_BCAST) {
3394 		/*
3395 		 * IPv4 BROADCAST entries:
3396 		 * We can't delete the nce since it is difficult to
3397 		 * recreate these without going through the
3398 		 * ipif down/up dance.
3399 		 *
3400 		 * All access to nce->nce_fp_mp in the case of these
3401 		 * is protected by nce_lock.
3402 		 */
3403 		mutex_enter(&nce->nce_lock);
3404 		if (nce->nce_fp_mp != NULL) {
3405 			freeb(nce->nce_fp_mp);
3406 			nce->nce_fp_mp = NULL;
3407 			mutex_exit(&nce->nce_lock);
3408 			nce_fastpath(nce);
3409 		} else {
3410 			mutex_exit(&nce->nce_lock);
3411 		}
3412 	} else {
3413 		/* Just delete the NCE... */
3414 		ndp_delete(nce);
3415 	}
3416 }
3417 
3418 /*
3419  * Return a pointer to a given option in the packet.
3420  * Assumes that option part of the packet have already been validated.
3421  */
3422 nd_opt_hdr_t *
3423 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3424 {
3425 	while (optlen > 0) {
3426 		if (opt->nd_opt_type == opt_type)
3427 			return (opt);
3428 		optlen -= 8 * opt->nd_opt_len;
3429 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3430 	}
3431 	return (NULL);
3432 }
3433 
3434 /*
3435  * Verify all option lengths present are > 0, also check to see
3436  * if the option lengths and packet length are consistent.
3437  */
3438 boolean_t
3439 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3440 {
3441 	ASSERT(opt != NULL);
3442 	while (optlen > 0) {
3443 		if (opt->nd_opt_len == 0)
3444 			return (B_FALSE);
3445 		optlen -= 8 * opt->nd_opt_len;
3446 		if (optlen < 0)
3447 			return (B_FALSE);
3448 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3449 	}
3450 	return (B_TRUE);
3451 }
3452 
3453 /*
3454  * ndp_walk function.
3455  * Free a fraction of the NCE cache entries.
3456  * A fraction of zero means to not free any in that category.
3457  */
3458 void
3459 ndp_cache_reclaim(nce_t *nce, char *arg)
3460 {
3461 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3462 	uint_t	rand;
3463 
3464 	if (nce->nce_flags & NCE_F_PERMANENT)
3465 		return;
3466 
3467 	rand = (uint_t)lbolt +
3468 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3469 	if (ncr->ncr_host != 0 &&
3470 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3471 		ndp_delete(nce);
3472 		return;
3473 	}
3474 }
3475 
3476 /*
3477  * ndp_walk function.
3478  * Count the number of NCEs that can be deleted.
3479  * These would be hosts but not routers.
3480  */
3481 void
3482 ndp_cache_count(nce_t *nce, char *arg)
3483 {
3484 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3485 
3486 	if (nce->nce_flags & NCE_F_PERMANENT)
3487 		return;
3488 
3489 	ncc->ncc_total++;
3490 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3491 		ncc->ncc_host++;
3492 }
3493 
3494 #ifdef DEBUG
3495 void
3496 nce_trace_ref(nce_t *nce)
3497 {
3498 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3499 
3500 	if (nce->nce_trace_disable)
3501 		return;
3502 
3503 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3504 		nce->nce_trace_disable = B_TRUE;
3505 		nce_trace_cleanup(nce);
3506 	}
3507 }
3508 
3509 void
3510 nce_untrace_ref(nce_t *nce)
3511 {
3512 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3513 
3514 	if (!nce->nce_trace_disable)
3515 		th_trace_unref(nce);
3516 }
3517 
3518 static void
3519 nce_trace_cleanup(const nce_t *nce)
3520 {
3521 	th_trace_cleanup(nce, nce->nce_trace_disable);
3522 }
3523 #endif
3524 
3525 /*
3526  * Called when address resolution fails due to a timeout.
3527  * Send an ICMP unreachable in response to all queued packets.
3528  */
3529 void
3530 arp_resolv_failed(nce_t *nce)
3531 {
3532 	mblk_t	*mp, *nxt_mp, *first_mp;
3533 	char	buf[INET6_ADDRSTRLEN];
3534 	zoneid_t zoneid = GLOBAL_ZONEID;
3535 	struct in_addr ipv4addr;
3536 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3537 
3538 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3539 	ip3dbg(("arp_resolv_failed: dst %s\n",
3540 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3541 	mutex_enter(&nce->nce_lock);
3542 	mp = nce->nce_qd_mp;
3543 	nce->nce_qd_mp = NULL;
3544 	mutex_exit(&nce->nce_lock);
3545 
3546 	while (mp != NULL) {
3547 		nxt_mp = mp->b_next;
3548 		mp->b_next = NULL;
3549 		mp->b_prev = NULL;
3550 
3551 		first_mp = mp;
3552 		/*
3553 		 * Send icmp unreachable messages
3554 		 * to the hosts.
3555 		 */
3556 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3557 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3558 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3559 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3560 		mp = nxt_mp;
3561 	}
3562 }
3563 
3564 int
3565 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3566     nce_t **newnce, nce_t *src_nce)
3567 {
3568 	int	err;
3569 	nce_t	*nce;
3570 	in6_addr_t addr6;
3571 	ip_stack_t *ipst = ill->ill_ipst;
3572 
3573 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3574 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3575 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3576 	/*
3577 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
3578 	 * looking up have fastpath headers that are inherently per-ill.
3579 	 */
3580 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
3581 	if (nce == NULL) {
3582 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3583 	} else {
3584 		*newnce = nce;
3585 		err = EEXIST;
3586 	}
3587 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3588 	return (err);
3589 }
3590 
3591 /*
3592  * NDP Cache Entry creation routine for IPv4.
3593  * Mapped entries are handled in arp.
3594  * This routine must always be called with ndp4->ndp_g_lock held.
3595  * Prior to return, nce_refcnt is incremented.
3596  */
3597 static int
3598 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3599     nce_t **newnce, nce_t *src_nce)
3600 {
3601 	static	nce_t		nce_nil;
3602 	nce_t		*nce;
3603 	mblk_t		*mp;
3604 	mblk_t		*template = NULL;
3605 	nce_t		**ncep;
3606 	ip_stack_t	*ipst = ill->ill_ipst;
3607 	uint16_t	state = ND_INITIAL;
3608 	int		err;
3609 
3610 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3611 	ASSERT(!ill->ill_isv6);
3612 	ASSERT((flags & NCE_F_MAPPING) == 0);
3613 
3614 	if (ill->ill_resolver_mp == NULL)
3615 		return (EINVAL);
3616 	/*
3617 	 * Allocate the mblk to hold the nce.
3618 	 */
3619 	mp = allocb(sizeof (nce_t), BPRI_MED);
3620 	if (mp == NULL)
3621 		return (ENOMEM);
3622 
3623 	nce = (nce_t *)mp->b_rptr;
3624 	mp->b_wptr = (uchar_t *)&nce[1];
3625 	*nce = nce_nil;
3626 	nce->nce_ill = ill;
3627 	nce->nce_ipversion = IPV4_VERSION;
3628 	nce->nce_flags = flags;
3629 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3630 	nce->nce_rcnt = ill->ill_xmit_count;
3631 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3632 	nce->nce_mask = ipv6_all_ones;
3633 	nce->nce_extract_mask = ipv6_all_zeros;
3634 	nce->nce_ll_extract_start = 0;
3635 	nce->nce_qd_mp = NULL;
3636 	nce->nce_mp = mp;
3637 	/* This one is for nce getting created */
3638 	nce->nce_refcnt = 1;
3639 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3640 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3641 
3642 	nce->nce_trace_disable = B_FALSE;
3643 
3644 	if (src_nce != NULL) {
3645 		/*
3646 		 * src_nce has been provided by the caller. The only
3647 		 * caller who provides a non-null, non-broadcast
3648 		 * src_nce is from ip_newroute() which must pass in
3649 		 * a ND_REACHABLE src_nce (this condition is verified
3650 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3651 		 */
3652 		mutex_enter(&src_nce->nce_lock);
3653 		state = src_nce->nce_state;
3654 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3655 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3656 			/*
3657 			 * src_nce has been deleted, or
3658 			 * ip_arp_news is in the middle of
3659 			 * flushing entries in the the nce.
3660 			 * Fail the add, since we don't know
3661 			 * if it is safe to copy the contents of
3662 			 * src_nce
3663 			 */
3664 			DTRACE_PROBE2(nce__bad__src__nce,
3665 			    nce_t *, src_nce, ill_t *, ill);
3666 			mutex_exit(&src_nce->nce_lock);
3667 			err = EINVAL;
3668 			goto err_ret;
3669 		}
3670 		template = copyb(src_nce->nce_res_mp);
3671 		mutex_exit(&src_nce->nce_lock);
3672 		if (template == NULL) {
3673 			err = ENOMEM;
3674 			goto err_ret;
3675 		}
3676 	} else if (flags & NCE_F_BCAST) {
3677 		/*
3678 		 * broadcast nce.
3679 		 */
3680 		template = copyb(ill->ill_bcast_mp);
3681 		if (template == NULL) {
3682 			err = ENOMEM;
3683 			goto err_ret;
3684 		}
3685 		state = ND_REACHABLE;
3686 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3687 		/*
3688 		 * NORESOLVER entries are always created in the REACHABLE
3689 		 * state.
3690 		 */
3691 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
3692 		    ill->ill_mactype != DL_IPV4 &&
3693 		    ill->ill_mactype != DL_6TO4) {
3694 			/*
3695 			 * We create a nce_res_mp with the IP nexthop address
3696 			 * as the destination address if the physical length
3697 			 * is exactly 4 bytes for point-to-multipoint links
3698 			 * that do their own resolution from IP to link-layer
3699 			 * address (e.g. IP over X.25).
3700 			 */
3701 			template = ill_dlur_gen((uchar_t *)addr,
3702 			    ill->ill_phys_addr_length,
3703 			    ill->ill_sap, ill->ill_sap_length);
3704 		} else {
3705 			template = copyb(ill->ill_resolver_mp);
3706 		}
3707 		if (template == NULL) {
3708 			err = ENOMEM;
3709 			goto err_ret;
3710 		}
3711 		state = ND_REACHABLE;
3712 	}
3713 	nce->nce_fp_mp = NULL;
3714 	nce->nce_res_mp = template;
3715 	nce->nce_state = state;
3716 	if (state == ND_REACHABLE) {
3717 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3718 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3719 	} else {
3720 		nce->nce_last = 0;
3721 		if (state == ND_INITIAL)
3722 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3723 	}
3724 
3725 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3726 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3727 	/*
3728 	 * Atomically ensure that the ill is not CONDEMNED, before
3729 	 * adding the NCE.
3730 	 */
3731 	mutex_enter(&ill->ill_lock);
3732 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3733 		mutex_exit(&ill->ill_lock);
3734 		err = EINVAL;
3735 		goto err_ret;
3736 	}
3737 	if ((nce->nce_next = *ncep) != NULL)
3738 		nce->nce_next->nce_ptpn = &nce->nce_next;
3739 	*ncep = nce;
3740 	nce->nce_ptpn = ncep;
3741 	*newnce = nce;
3742 	/* This one is for nce being used by an active thread */
3743 	NCE_REFHOLD(*newnce);
3744 
3745 	/* Bump up the number of nce's referencing this ill */
3746 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
3747 	    (char *), "nce", (void *), nce);
3748 	ill->ill_nce_cnt++;
3749 	mutex_exit(&ill->ill_lock);
3750 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3751 	return (0);
3752 err_ret:
3753 	freeb(mp);
3754 	freemsg(template);
3755 	return (err);
3756 }
3757 
3758 /*
3759  * ndp_walk routine to delete all entries that have a given destination or
3760  * gateway address and cached link layer (MAC) address.  This is used when ARP
3761  * informs us that a network-to-link-layer mapping may have changed.
3762  */
3763 void
3764 nce_delete_hw_changed(nce_t *nce, void *arg)
3765 {
3766 	nce_hw_map_t *hwm = arg;
3767 	mblk_t *mp;
3768 	dl_unitdata_req_t *dlu;
3769 	uchar_t *macaddr;
3770 	ill_t *ill;
3771 	int saplen;
3772 	ipaddr_t nce_addr;
3773 
3774 	if (nce->nce_state != ND_REACHABLE)
3775 		return;
3776 
3777 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3778 	if (nce_addr != hwm->hwm_addr)
3779 		return;
3780 
3781 	mutex_enter(&nce->nce_lock);
3782 	if ((mp = nce->nce_res_mp) == NULL) {
3783 		mutex_exit(&nce->nce_lock);
3784 		return;
3785 	}
3786 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3787 	macaddr = (uchar_t *)(dlu + 1);
3788 	ill = nce->nce_ill;
3789 	if ((saplen = ill->ill_sap_length) > 0)
3790 		macaddr += saplen;
3791 	else
3792 		saplen = -saplen;
3793 
3794 	/*
3795 	 * If the hardware address is unchanged, then leave this one alone.
3796 	 * Note that saplen == abs(saplen) now.
3797 	 */
3798 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3799 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3800 		mutex_exit(&nce->nce_lock);
3801 		return;
3802 	}
3803 	mutex_exit(&nce->nce_lock);
3804 
3805 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3806 	ndp_delete(nce);
3807 }
3808 
3809 /*
3810  * This function verifies whether a given IPv4 address is potentially known to
3811  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3812  * so that it can continue to look for hardware changes on that address.
3813  */
3814 boolean_t
3815 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3816 {
3817 	nce_t		*nce;
3818 	struct in_addr	nceaddr;
3819 	ip_stack_t	*ipst = ns->netstack_ip;
3820 
3821 	if (addr == INADDR_ANY)
3822 		return (B_FALSE);
3823 
3824 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3825 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3826 	for (; nce != NULL; nce = nce->nce_next) {
3827 		/* Note that only v4 mapped entries are in the table. */
3828 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3829 		if (addr == nceaddr.s_addr &&
3830 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3831 			/* Single flag check; no lock needed */
3832 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3833 				break;
3834 		}
3835 	}
3836 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3837 	return (nce != NULL);
3838 }
3839 
3840 /*
3841  * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly
3842  * with IPMP.  Specifically, since neighbor discovery is always done on
3843  * underlying interfaces (even for addresses owned by an IPMP interface), we
3844  * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface
3845  * associated with `ill' (if it exists).
3846  */
3847 static ipif_t *
3848 ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill)
3849 {
3850 	ipif_t *ipif;
3851 	ip_stack_t *ipst = ill->ill_ipst;
3852 
3853 	ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3854 	if (ipif == NULL && IS_UNDER_IPMP(ill)) {
3855 		if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
3856 			ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3857 			ill_refrele(ill);
3858 		}
3859 	}
3860 	return (ipif);
3861 }
3862