xref: /titanic_51/usr/src/uts/common/inet/ip/ip_ndp.c (revision fc51f9bbbff02dbd8c3adf640b1a184ceeb58fa5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ipsec_info.h>
66 #include <inet/sctp_ip.h>
67 #include <inet/ip2mac_impl.h>
68 
69 /*
70  * Function names with nce_ prefix are static while function
71  * names with ndp_ prefix are used by rest of the IP.
72  *
73  * Lock ordering:
74  *
75  *	ndp_g_lock -> ill_lock -> nce_lock
76  *
77  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
78  * nce_next.  Nce_lock protects the contents of the NCE (particularly
79  * nce_refcnt).
80  */
81 
/*
 * Forward declarations of the file-scope (static) helpers used below.
 * Per the naming note above, nce_-prefixed routines are private to this
 * file.
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *,
    nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *, const in6_addr_t *);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, in6_addr_t src);
static	boolean_t	nce_xmit(ill_t *ill, uint8_t type,
    boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static boolean_t	nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla,
    const in6_addr_t *target, uint_t flags);
static boolean_t	nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla,
    const in6_addr_t *src, uint_t flags);
static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
    nce_t **, nce_t *);
static ipif_t	*ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill);

/* Only built into DEBUG kernels; see its use in ndp_inactive(). */
#ifdef DEBUG
static void	nce_trace_cleanup(const nce_t *);
#endif
112 
/*
 * Address of the hash bucket (chain head) for `addr' in the IPv4 NCE
 * table of the given IP stack instance.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Address of the hash bucket (chain head) for `addr' in the IPv6 NCE
 * table of the given IP stack instance.
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))

/*
 * Non-tunable probe interval, based on link capabilities: 150 when
 * ill_note_link is set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER() by the callers in this file (presumably
 * milliseconds -- confirm against that macro).
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
122 
123 /*
124  * NDP Cache Entry creation routine.
125  * Mapped entries will never do NUD .
126  * This routine must always be called with ndp6->ndp_g_lock held.
127  * Prior to return, nce_refcnt is incremented.
128  */
129 int
130 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
131     const in6_addr_t *mask, const in6_addr_t *extract_mask,
132     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
133     nce_t **newnce)
134 {
135 	static	nce_t		nce_nil;
136 	nce_t		*nce;
137 	mblk_t		*mp;
138 	mblk_t		*template;
139 	nce_t		**ncep;
140 	int		err;
141 	boolean_t	dropped = B_FALSE;
142 	ip_stack_t	*ipst = ill->ill_ipst;
143 
144 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
145 	ASSERT(ill != NULL && ill->ill_isv6);
146 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
147 		ip0dbg(("ndp_add_v6: no addr\n"));
148 		return (EINVAL);
149 	}
150 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
151 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
152 		return (EINVAL);
153 	}
154 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
155 	    (flags & NCE_F_MAPPING)) {
156 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
157 		return (EINVAL);
158 	}
159 	/*
160 	 * Allocate the mblk to hold the nce.
161 	 *
162 	 * XXX This can come out of a separate cache - nce_cache.
163 	 * We don't need the mp anymore as there are no more
164 	 * "qwriter"s
165 	 */
166 	mp = allocb(sizeof (nce_t), BPRI_MED);
167 	if (mp == NULL)
168 		return (ENOMEM);
169 
170 	nce = (nce_t *)mp->b_rptr;
171 	mp->b_wptr = (uchar_t *)&nce[1];
172 	*nce = nce_nil;
173 
174 	/*
175 	 * This one holds link layer address
176 	 */
177 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
178 		template = nce_udreq_alloc(ill);
179 	} else {
180 		if (ill->ill_resolver_mp == NULL) {
181 			freeb(mp);
182 			return (EINVAL);
183 		}
184 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
185 		template = copyb(ill->ill_resolver_mp);
186 	}
187 	if (template == NULL) {
188 		freeb(mp);
189 		return (ENOMEM);
190 	}
191 	nce->nce_ill = ill;
192 	nce->nce_ipversion = IPV6_VERSION;
193 	nce->nce_flags = flags;
194 	nce->nce_state = state;
195 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
196 	nce->nce_rcnt = ill->ill_xmit_count;
197 	nce->nce_addr = *addr;
198 	nce->nce_mask = *mask;
199 	nce->nce_extract_mask = *extract_mask;
200 	nce->nce_ll_extract_start = hw_extract_start;
201 	nce->nce_fp_mp = NULL;
202 	nce->nce_res_mp = template;
203 	if (state == ND_REACHABLE)
204 		nce->nce_last = TICK_TO_MSEC(lbolt64);
205 	else
206 		nce->nce_last = 0;
207 	nce->nce_qd_mp = NULL;
208 	nce->nce_mp = mp;
209 	if (hw_addr != NULL)
210 		nce_set_ll(nce, hw_addr);
211 	/* This one is for nce getting created */
212 	nce->nce_refcnt = 1;
213 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
214 	if (nce->nce_flags & NCE_F_MAPPING) {
215 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
216 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
217 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
218 		ncep = &ipst->ips_ndp6->nce_mask_entries;
219 	} else {
220 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
221 	}
222 
223 	nce->nce_trace_disable = B_FALSE;
224 
225 	list_create(&nce->nce_cb, sizeof (nce_cb_t),
226 	    offsetof(nce_cb_t, nce_cb_node));
227 	/*
228 	 * Atomically ensure that the ill is not CONDEMNED, before
229 	 * adding the NCE.
230 	 */
231 	mutex_enter(&ill->ill_lock);
232 	if (ill->ill_state_flags & ILL_CONDEMNED) {
233 		mutex_exit(&ill->ill_lock);
234 		freeb(mp);
235 		freeb(template);
236 		return (EINVAL);
237 	}
238 	if ((nce->nce_next = *ncep) != NULL)
239 		nce->nce_next->nce_ptpn = &nce->nce_next;
240 	*ncep = nce;
241 	nce->nce_ptpn = ncep;
242 	*newnce = nce;
243 	/* This one is for nce being used by an active thread */
244 	NCE_REFHOLD(*newnce);
245 
246 	/* Bump up the number of nce's referencing this ill */
247 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
248 	    (char *), "nce", (void *), nce);
249 	ill->ill_nce_cnt++;
250 	mutex_exit(&ill->ill_lock);
251 
252 	err = 0;
253 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
254 		mutex_enter(&nce->nce_lock);
255 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
256 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
257 		mutex_exit(&nce->nce_lock);
258 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
259 		if (dropped) {
260 			mutex_enter(&nce->nce_lock);
261 			nce->nce_pcnt++;
262 			mutex_exit(&nce->nce_lock);
263 		}
264 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
265 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
266 		err = EINPROGRESS;
267 	} else if (flags & NCE_F_UNSOL_ADV) {
268 		/*
269 		 * We account for the transmit below by assigning one
270 		 * less than the ndd variable. Subsequent decrements
271 		 * are done in ndp_timer.
272 		 */
273 		mutex_enter(&nce->nce_lock);
274 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
275 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
276 		mutex_exit(&nce->nce_lock);
277 		dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast,
278 		    0);
279 		mutex_enter(&nce->nce_lock);
280 		if (dropped)
281 			nce->nce_unsolicit_count++;
282 		if (nce->nce_unsolicit_count != 0) {
283 			ASSERT(nce->nce_timeout_id == 0);
284 			nce->nce_timeout_id = timeout(ndp_timer, nce,
285 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
286 		}
287 		mutex_exit(&nce->nce_lock);
288 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
289 	}
290 
291 	/*
292 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
293 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
294 	 * We call nce_fastpath from nce_update if the link layer address of
295 	 * the peer changes from nce_update
296 	 */
297 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
298 		nce_fastpath(nce);
299 	return (err);
300 }
301 
302 int
303 ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
304     const in6_addr_t *addr, const in6_addr_t *mask,
305     const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags,
306     uint16_t state, nce_t **newnce)
307 {
308 	int	err = 0;
309 	nce_t	*nce;
310 	ip_stack_t	*ipst = ill->ill_ipst;
311 
312 	ASSERT(ill->ill_isv6);
313 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
314 
315 	/* Get head of v6 hash table */
316 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
317 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
318 	if (nce == NULL) {
319 		err = ndp_add_v6(ill,
320 		    hw_addr,
321 		    addr,
322 		    mask,
323 		    extract_mask,
324 		    hw_extract_start,
325 		    flags,
326 		    state,
327 		    newnce);
328 	} else {
329 		*newnce = nce;
330 		err = EEXIST;
331 	}
332 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
333 	return (err);
334 }
335 
336 /*
337  * Remove all the CONDEMNED nces from the appropriate hash table.
338  * We create a private list of NCEs, these may have ires pointing
339  * to them, so the list will be passed through to clean up dependent
340  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
341  */
342 static void
343 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
344 {
345 	nce_t *nce1;
346 	nce_t **ptpn;
347 
348 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
349 	ASSERT(ndp->ndp_g_walker == 0);
350 	for (; nce; nce = nce1) {
351 		nce1 = nce->nce_next;
352 		mutex_enter(&nce->nce_lock);
353 		if (nce->nce_flags & NCE_F_CONDEMNED) {
354 			ptpn = nce->nce_ptpn;
355 			nce1 = nce->nce_next;
356 			if (nce1 != NULL)
357 				nce1->nce_ptpn = ptpn;
358 			*ptpn = nce1;
359 			nce->nce_ptpn = NULL;
360 			nce->nce_next = NULL;
361 			nce->nce_next = *free_nce_list;
362 			*free_nce_list = nce;
363 		}
364 		mutex_exit(&nce->nce_lock);
365 	}
366 }
367 
/*
 * Delete an NCE.  The caller must hold a reference on `nce'.
 *
 * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
 *    will return this NCE. Also no new IREs will be created that
 *    point to this NCE (See ire_add_v6).  Also no new timeouts will
 *    be started (See NDP_RESTART_TIMER).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 * 5. Delete all IREs pointing to this NCE.
 *
 * If the nce is already CONDEMNED this is a no-op: another thread owns
 * the delete.
 */
void
ndp_delete(nce_t *nce)
{
	nce_t	**ptpn;
	nce_t	*nce1;
	int	ipversion = nce->nce_ipversion;
	ndp_g_t *ndp;
	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;

	/* Pick the per-stack table that matches the entry's IP version. */
	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&nce->nce_lock);
	if (nce->nce_flags & NCE_F_CONDEMNED) {
		/* Some other thread is doing the delete */
		mutex_exit(&nce->nce_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(nce->nce_refcnt >= 2);
	nce->nce_flags |= NCE_F_CONDEMNED;
	mutex_exit(&nce->nce_lock);

	nce_fastpath_list_delete(nce);

	/* Complete any waiting callbacks */
	nce_cb_dispatch(nce);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (nce->nce_timeout_id != 0) {
		(void) untimeout(nce->nce_timeout_id);
		nce->nce_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (nce->nce_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this nce from
		 * the list after we marked the nce CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = nce->nce_ptpn;
	nce1 = nce->nce_next;
	if (nce1 != NULL)
		nce1->nce_ptpn = ptpn;
	*ptpn = nce1;
	nce->nce_ptpn = NULL;
	nce->nce_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/*
	 * Delete the IRE_CACHE entries that point at this nce; this also
	 * releases one reference on the nce (see nce_ire_delete()).
	 */
	nce_ire_delete(nce);
}
457 
/*
 * Final teardown of an NCE whose reference count has reached zero.
 * Entered with nce_lock held; the lock is destroyed here, not released.
 * Frees every mblk chain hanging off the nce, drops the ill's nce count
 * (possibly restarting an operation waiting on the ill), and finally frees
 * the mblk that backs the nce itself.
 */
void
ndp_inactive(nce_t *nce)
{
	mblk_t		**mpp;
	ill_t		*ill;

	ASSERT(nce->nce_refcnt == 0);
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ASSERT(nce->nce_fastpath == NULL);

	/*
	 * Free all nce allocated messages.  The mblk pointer members from
	 * nce_first_mp_to_free through nce_last_mp_to_free (inclusive) are
	 * walked as an array of slots; each slot's b_next chain is freed.
	 */
	mpp = &nce->nce_first_mp_to_free;
	do {
		while (*mpp != NULL) {
			mblk_t  *mp;

			mp = *mpp;
			*mpp = mp->b_next;

			inet_freemsg(mp);
		}
	} while (mpp++ != &nce->nce_last_mp_to_free);

	if (nce->nce_ipversion == IPV6_VERSION) {
		/*
		 * Pending callbacks must have been completed already
		 * (ndp_delete() calls nce_cb_dispatch()).
		 */
		ASSERT(list_is_empty(&nce->nce_cb));
		list_destroy(&nce->nce_cb);
	}
#ifdef DEBUG
	nce_trace_cleanup(nce);
#endif

	ill = nce->nce_ill;
	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt--;
	/*
	 * If the number of nce's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}
	mutex_destroy(&nce->nce_lock);
	/* nce_mp is the mblk the nce itself lives in; free it last. */
	if (nce->nce_mp != NULL)
		inet_freemsg(nce->nce_mp);
}
512 
513 /*
514  * ndp_walk routine.  Delete the nce if it is associated with the ill
515  * that is going away.  Always called as a writer.
516  */
517 void
518 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
519 {
520 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
521 		ndp_delete(nce);
522 	}
523 }
524 
525 /*
526  * Walk a list of to be inactive NCEs and blow away all the ires.
527  */
528 static void
529 nce_ire_delete_list(nce_t *nce)
530 {
531 	nce_t *nce_next;
532 
533 	ASSERT(nce != NULL);
534 	while (nce != NULL) {
535 		nce_next = nce->nce_next;
536 		nce->nce_next = NULL;
537 
538 		/*
539 		 * It is possible for the last ndp walker (this thread)
540 		 * to come here after ndp_delete has marked the nce CONDEMNED
541 		 * and before it has removed the nce from the fastpath list
542 		 * or called untimeout. So we need to do it here. It is safe
543 		 * for both ndp_delete and this thread to do it twice or
544 		 * even simultaneously since each of the threads has a
545 		 * reference on the nce.
546 		 */
547 		nce_fastpath_list_delete(nce);
548 		/*
549 		 * Cancel any running timer. Timeout can't be restarted
550 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
551 		 * Passing invalid timeout id is fine.
552 		 */
553 		if (nce->nce_timeout_id != 0) {
554 			(void) untimeout(nce->nce_timeout_id);
555 			nce->nce_timeout_id = 0;
556 		}
557 		/*
558 		 * We might hit this func thus in the v4 case:
559 		 * ipif_down->ipif_ndp_down->ndp_walk
560 		 */
561 
562 		if (nce->nce_ipversion == IPV4_VERSION) {
563 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
564 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
565 		} else {
566 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
567 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
568 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
569 		}
570 		NCE_REFRELE_NOTR(nce);
571 		nce = nce_next;
572 	}
573 }
574 
575 /*
576  * Delete an ire when the nce goes away.
577  */
578 /* ARGSUSED */
579 static void
580 nce_ire_delete(nce_t *nce)
581 {
582 	if (nce->nce_ipversion == IPV6_VERSION) {
583 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
584 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
585 		NCE_REFRELE_NOTR(nce);
586 	} else {
587 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
588 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
589 		NCE_REFRELE_NOTR(nce);
590 	}
591 }
592 
593 /*
594  * ire_walk routine used to delete every IRE that shares this nce
595  */
596 static void
597 nce_ire_delete1(ire_t *ire, char *nce_arg)
598 {
599 	nce_t	*nce = (nce_t *)nce_arg;
600 
601 	ASSERT(ire->ire_type == IRE_CACHE);
602 
603 	if (ire->ire_nce == nce) {
604 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
605 		ire_delete(ire);
606 	}
607 }
608 
609 /*
610  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
611  */
612 boolean_t
613 ndp_restart_dad(nce_t *nce)
614 {
615 	boolean_t started;
616 	boolean_t dropped;
617 
618 	if (nce == NULL)
619 		return (B_FALSE);
620 	mutex_enter(&nce->nce_lock);
621 	if (nce->nce_state == ND_PROBE) {
622 		mutex_exit(&nce->nce_lock);
623 		started = B_TRUE;
624 	} else if (nce->nce_state == ND_REACHABLE) {
625 		nce->nce_state = ND_PROBE;
626 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
627 		mutex_exit(&nce->nce_lock);
628 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
629 		if (dropped) {
630 			mutex_enter(&nce->nce_lock);
631 			nce->nce_pcnt++;
632 			mutex_exit(&nce->nce_lock);
633 		}
634 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
635 		started = B_TRUE;
636 	} else {
637 		mutex_exit(&nce->nce_lock);
638 		started = B_FALSE;
639 	}
640 	return (started);
641 }
642 
643 /*
644  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
645  * If one is found, the refcnt on the nce will be incremented.
646  */
647 nce_t *
648 ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
649     boolean_t caller_holds_lock)
650 {
651 	nce_t	*nce;
652 	ip_stack_t *ipst = ill->ill_ipst;
653 
654 	ASSERT(ill->ill_isv6);
655 	if (!caller_holds_lock)
656 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
657 
658 	/* Get head of v6 hash table */
659 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
660 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
661 	if (nce == NULL)
662 		nce = nce_lookup_mapping(ill, addr);
663 	if (!caller_holds_lock)
664 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
665 	return (nce);
666 }
667 /*
668  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
669  * If one is found, the refcnt on the nce will be incremented.
670  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
671  * so we skip the nce_lookup_mapping call.
672  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
673  */
674 nce_t *
675 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
676 {
677 	nce_t	*nce;
678 	in6_addr_t addr6;
679 	ip_stack_t *ipst = ill->ill_ipst;
680 
681 	if (!caller_holds_lock)
682 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
683 
684 	/* Get head of v4 hash table */
685 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
686 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
687 	/*
688 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
689 	 * looking up have fastpath headers that are inherently per-ill.
690 	 */
691 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
692 	if (!caller_holds_lock)
693 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
694 	return (nce);
695 }
696 
697 /*
698  * Cache entry lookup.  Try to find an nce matching the parameters passed.
699  * Look only for exact entries (no mappings).  If an nce is found, increment
700  * the hold count on that nce. The caller passes in the start of the
701  * appropriate hash table, and must be holding the appropriate global
702  * lock (ndp_g_lock).
703  */
704 static nce_t *
705 nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
706     nce_t *nce)
707 {
708 	ndp_g_t		*ndp;
709 	ip_stack_t	*ipst = ill->ill_ipst;
710 
711 	if (ill->ill_isv6)
712 		ndp = ipst->ips_ndp6;
713 	else
714 		ndp = ipst->ips_ndp4;
715 
716 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
717 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
718 		return (NULL);
719 	for (; nce != NULL; nce = nce->nce_next) {
720 		if (nce->nce_ill == ill ||
721 		    match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) {
722 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
723 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
724 			    &ipv6_all_ones)) {
725 				mutex_enter(&nce->nce_lock);
726 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
727 					NCE_REFHOLD_LOCKED(nce);
728 					mutex_exit(&nce->nce_lock);
729 					break;
730 				}
731 				mutex_exit(&nce->nce_lock);
732 			}
733 		}
734 	}
735 	return (nce);
736 }
737 
738 /*
739  * Cache entry lookup.  Try to find an nce matching the parameters passed.
740  * Look only for mappings.
741  */
742 static nce_t *
743 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
744 {
745 	nce_t	*nce;
746 	ip_stack_t	*ipst = ill->ill_ipst;
747 
748 	ASSERT(ill != NULL && ill->ill_isv6);
749 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
750 	if (!IN6_IS_ADDR_MULTICAST(addr))
751 		return (NULL);
752 	nce = ipst->ips_ndp6->nce_mask_entries;
753 	for (; nce != NULL; nce = nce->nce_next)
754 		if (nce->nce_ill == ill &&
755 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
756 			mutex_enter(&nce->nce_lock);
757 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
758 				NCE_REFHOLD_LOCKED(nce);
759 				mutex_exit(&nce->nce_lock);
760 				break;
761 			}
762 			mutex_exit(&nce->nce_lock);
763 		}
764 	return (nce);
765 }
766 
767 /*
768  * Process passed in parameters either from an incoming packet or via
769  * user ioctl.
770  */
771 static void
772 nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
773 {
774 	ill_t	*ill = nce->nce_ill;
775 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
776 	mblk_t	*mp;
777 	boolean_t ll_updated = B_FALSE;
778 	boolean_t ll_changed;
779 	ip_stack_t	*ipst = ill->ill_ipst;
780 
781 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
782 	/*
783 	 * No updates of link layer address or the neighbor state is
784 	 * allowed, when the cache is in NONUD state.  This still
785 	 * allows for responding to reachability solicitation.
786 	 */
787 	mutex_enter(&nce->nce_lock);
788 	if (nce->nce_state == ND_INCOMPLETE) {
789 		if (hw_addr == NULL) {
790 			mutex_exit(&nce->nce_lock);
791 			return;
792 		}
793 		nce_set_ll(nce, hw_addr);
794 		/*
795 		 * Update nce state and send the queued packets
796 		 * back to ip this time ire will be added.
797 		 */
798 		if (flag & ND_NA_FLAG_SOLICITED) {
799 			nce_update(nce, ND_REACHABLE, NULL);
800 		} else {
801 			nce_update(nce, ND_STALE, NULL);
802 		}
803 		mutex_exit(&nce->nce_lock);
804 		nce_fastpath(nce);
805 		nce_cb_dispatch(nce); /* complete callbacks */
806 		mutex_enter(&nce->nce_lock);
807 		mp = nce->nce_qd_mp;
808 		nce->nce_qd_mp = NULL;
809 		mutex_exit(&nce->nce_lock);
810 		while (mp != NULL) {
811 			mblk_t *nxt_mp, *data_mp;
812 
813 			nxt_mp = mp->b_next;
814 			mp->b_next = NULL;
815 
816 			if (mp->b_datap->db_type == M_CTL)
817 				data_mp = mp->b_cont;
818 			else
819 				data_mp = mp;
820 			if (data_mp->b_prev != NULL) {
821 				ill_t   *inbound_ill;
822 				queue_t *fwdq = NULL;
823 				uint_t ifindex;
824 
825 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
826 				inbound_ill = ill_lookup_on_ifindex(ifindex,
827 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
828 				if (inbound_ill == NULL) {
829 					data_mp->b_prev = NULL;
830 					freemsg(mp);
831 					return;
832 				} else {
833 					fwdq = inbound_ill->ill_rq;
834 				}
835 				data_mp->b_prev = NULL;
836 				/*
837 				 * Send a forwarded packet back into ip_rput_v6
838 				 * just as in ire_send_v6().
839 				 * Extract the queue from b_prev (set in
840 				 * ip_rput_data_v6).
841 				 */
842 				if (fwdq != NULL) {
843 					/*
844 					 * Forwarded packets hop count will
845 					 * get decremented in ip_rput_data_v6
846 					 */
847 					if (data_mp != mp)
848 						freeb(mp);
849 					put(fwdq, data_mp);
850 				} else {
851 					/*
852 					 * Send locally originated packets back
853 					 * into ip_wput_v6.
854 					 */
855 					put(ill->ill_wq, mp);
856 				}
857 				ill_refrele(inbound_ill);
858 			} else {
859 				put(ill->ill_wq, mp);
860 			}
861 			mp = nxt_mp;
862 		}
863 		return;
864 	}
865 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
866 	if (!is_adv) {
867 		/* If this is a SOLICITATION request only */
868 		if (ll_changed)
869 			nce_update(nce, ND_STALE, hw_addr);
870 		mutex_exit(&nce->nce_lock);
871 		nce_cb_dispatch(nce);
872 		return;
873 	}
874 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
875 		/* If in any other state than REACHABLE, ignore */
876 		if (nce->nce_state == ND_REACHABLE) {
877 			nce_update(nce, ND_STALE, NULL);
878 		}
879 		mutex_exit(&nce->nce_lock);
880 		nce_cb_dispatch(nce);
881 		return;
882 	} else {
883 		if (ll_changed) {
884 			nce_update(nce, ND_UNCHANGED, hw_addr);
885 			ll_updated = B_TRUE;
886 		}
887 		if (flag & ND_NA_FLAG_SOLICITED) {
888 			nce_update(nce, ND_REACHABLE, NULL);
889 		} else {
890 			if (ll_updated) {
891 				nce_update(nce, ND_STALE, NULL);
892 			}
893 		}
894 		mutex_exit(&nce->nce_lock);
895 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
896 		    NCE_F_ISROUTER)) {
897 			ire_t *ire;
898 
899 			/*
900 			 * Router turned to host.  We need to remove the
901 			 * entry as well as any default route that may be
902 			 * using this as a next hop.  This is required by
903 			 * section 7.2.5 of RFC 2461.
904 			 */
905 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
906 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
907 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
908 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
909 			    MATCH_IRE_DEFAULT, ipst);
910 			if (ire != NULL) {
911 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
912 				ire_delete(ire);
913 				ire_refrele(ire);
914 			}
915 			ndp_delete(nce); /* will do nce_cb_dispatch */
916 		} else {
917 			nce_cb_dispatch(nce);
918 		}
919 	}
920 }
921 
/*
 * Walker state structure used by ndp_process() / ndp_process_entry().
 * ndp_process() fills one in on its stack and passes its address as the
 * walker argument; ndp_process_entry() reads it for every visited nce.
 */
typedef struct ndp_process_data {
	ill_t		*np_ill; 	/* ill/illgrp to match against */
	const in6_addr_t *np_addr; 	/* IPv6 address to match */
	uchar_t		*np_hw_addr; 	/* passed to nce_process() */
	uint32_t	np_flag;	/* passed to nce_process() */
	boolean_t	np_is_adv;	/* passed to nce_process() */
} ndp_process_data_t;
932 
933 /*
934  * Walker callback used by ndp_process() for IPMP groups: calls nce_process()
935  * for each NCE with a matching address that's in the same IPMP group.
936  */
937 static void
938 ndp_process_entry(nce_t *nce, void *arg)
939 {
940 	ndp_process_data_t *npp = arg;
941 
942 	if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) &&
943 	    IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) &&
944 	    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
945 		nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv);
946 	}
947 }
948 
949 /*
950  * Wrapper around nce_process() that handles IPMP.  In particular, for IPMP,
951  * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have
952  * more than one NCE for a given IPv6 address to tend to.  In that case, we
953  * need to walk all NCEs and callback nce_process() for each one.  Since this
954  * is expensive, in the non-IPMP case we just directly call nce_process().
955  * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP
956  * interfaces in an IPMP group share the same NCEs -- at which point this
957  * function can be removed entirely.
958  */
959 void
960 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
961 {
962 	ill_t *ill = nce->nce_ill;
963 	struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6;
964 	ndp_process_data_t np;
965 
966 	if (ill->ill_grp == NULL) {
967 		nce_process(nce, hw_addr, flag, is_adv);
968 		return;
969 	}
970 
971 	/* IPMP case: walk all NCEs */
972 	np.np_ill = ill;
973 	np.np_addr = &nce->nce_addr;
974 	np.np_flag = flag;
975 	np.np_is_adv = is_adv;
976 	np.np_hw_addr = hw_addr;
977 
978 	ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES);
979 }
980 
/*
 * Pass arg1 to the pfi supplied, along with each nce in existence.
 * ndp_walk() places a REFHOLD on the nce and drops the lock when
 * walking the hash list.
 *
 *	ndp	the table (v4 or v6) to walk.
 *	ill	if non-NULL, only nces on this ill are visited.
 *	pfi	callback, invoked as (*pfi)(nce, arg1).
 *	arg1	opaque argument for the callback.
 *	trace	selects the traced NCE_REFHOLD/NCE_REFRELE pair when
 *		non-zero, the _NOTR variants otherwise.
 *
 * While a walk is in progress ndp_g_walker is non-zero, which makes
 * ndp_delete() leave condemned entries linked and set
 * ndp_g_walker_cleanup instead.  The last walker to finish unlinks those
 * entries and cleans up their ires.
 */
void
ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
    boolean_t trace)
{
	nce_t	*nce;
	nce_t	*nce1;
	nce_t	**ncep;
	nce_t	*free_nce_list = NULL;

	mutex_enter(&ndp->ndp_g_lock);
	/* Prevent ndp_delete from unlink and free of NCE */
	ndp->ndp_g_walker++;
	mutex_exit(&ndp->ndp_g_lock);
	/* First pass: every chain of the hash table. */
	for (ncep = ndp->nce_hash_tbl;
	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
		for (nce = *ncep; nce != NULL; nce = nce1) {
			/* Fetch the successor before calling out. */
			nce1 = nce->nce_next;
			if (ill == NULL || nce->nce_ill == ill) {
				if (trace) {
					NCE_REFHOLD(nce);
					(*pfi)(nce, arg1);
					NCE_REFRELE(nce);
				} else {
					NCE_REFHOLD_NOTR(nce);
					(*pfi)(nce, arg1);
					NCE_REFRELE_NOTR(nce);
				}
			}
		}
	}
	/* Second pass: the mask (mapping) entries. */
	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
		nce1 = nce->nce_next;
		if (ill == NULL || nce->nce_ill == ill) {
			if (trace) {
				NCE_REFHOLD(nce);
				(*pfi)(nce, arg1);
				NCE_REFRELE(nce);
			} else {
				NCE_REFHOLD_NOTR(nce);
				(*pfi)(nce, arg1);
				NCE_REFRELE_NOTR(nce);
			}
		}
	}
	mutex_enter(&ndp->ndp_g_lock);
	ndp->ndp_g_walker--;
	/*
	 * While NCE's are removed from global list they are placed
	 * in a private list, to be passed to nce_ire_delete_list().
	 * The reason is, there may be ires pointing to this nce
	 * which needs to cleaned up.
	 */
	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
		/* Time to delete condemned entries */
		for (ncep = ndp->nce_hash_tbl;
		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
			nce = *ncep;
			if (nce != NULL) {
				nce_remove(ndp, nce, &free_nce_list);
			}
		}
		nce = ndp->nce_mask_entries;
		if (nce != NULL) {
			nce_remove(ndp, nce, &free_nce_list);
		}
		ndp->ndp_g_walker_cleanup = B_FALSE;
	}

	mutex_exit(&ndp->ndp_g_lock);

	/* ire cleanup is done after the global lock has been dropped. */
	if (free_nce_list != NULL) {
		nce_ire_delete_list(free_nce_list);
	}
}
1060 
1061 /*
1062  * Walk everything.
1063  * Note that ill can be NULL hence can't derive the ipst from it.
1064  */
1065 void
1066 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1067 {
1068 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1069 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1070 }
1071 
1072 /*
1073  * Process resolve requests.  Handles both mapped entries
1074  * as well as cases that needs to be send out on the wire.
1075  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1076  * or one is created, we defer making ire point to nce until the
1077  * ire is actually added at which point the nce_refcnt on the nce is
1078  * incremented.  This is done primarily to have symmetry between ire_add()
1079  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1080  */
1081 int
1082 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1083 {
1084 	nce_t		*nce, *hw_nce = NULL;
1085 	int		err;
1086 	ill_t		*ipmp_ill;
1087 	uint16_t	nce_flags;
1088 	mblk_t		*mp_nce = NULL;
1089 	ip_stack_t	*ipst = ill->ill_ipst;
1090 	uchar_t		*hwaddr = NULL;
1091 
1092 	ASSERT(ill->ill_isv6);
1093 
1094 	if (IN6_IS_ADDR_MULTICAST(dst))
1095 		return (nce_set_multicast(ill, dst));
1096 
1097 	nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
1098 
1099 	/*
1100 	 * If `ill' is under IPMP, then first check to see if there's an NCE
1101 	 * for `dst' on the IPMP meta-interface (e.g., because an application
1102 	 * explicitly did an SIOCLIFSETND to tie a hardware address to `dst').
1103 	 * If so, we use that hardware address when creating the NCE below.
1104 	 * Note that we don't yet have a mechanism to remove these NCEs if the
1105 	 * NCE for `dst' on the IPMP meta-interface is subsequently removed --
1106 	 * but rather than build such a beast, we should fix NCEs so that they
1107 	 * can be properly shared across an IPMP group.
1108 	 */
1109 	if (IS_UNDER_IPMP(ill)) {
1110 		if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
1111 			hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE);
1112 			if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) {
1113 				hwaddr = hw_nce->nce_res_mp->b_rptr +
1114 				    NCE_LL_ADDR_OFFSET(ipmp_ill);
1115 				nce_flags |= hw_nce->nce_flags;
1116 			}
1117 			ill_refrele(ipmp_ill);
1118 		}
1119 	}
1120 
1121 	err = ndp_lookup_then_add_v6(ill,
1122 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1123 	    hwaddr,
1124 	    dst,
1125 	    &ipv6_all_ones,
1126 	    &ipv6_all_zeros,
1127 	    0,
1128 	    nce_flags,
1129 	    hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE,
1130 	    &nce);
1131 
1132 	if (hw_nce != NULL)
1133 		NCE_REFRELE(hw_nce);
1134 
1135 	switch (err) {
1136 	case 0:
1137 		/*
1138 		 * New cache entry was created. Make sure that the state
1139 		 * is not ND_INCOMPLETE. It can be in some other state
1140 		 * even before we send out the solicitation as we could
1141 		 * get un-solicited advertisements.
1142 		 *
1143 		 * If this is an XRESOLV interface, simply return 0,
1144 		 * since we don't want to solicit just yet.
1145 		 */
1146 		if (ill->ill_flags & ILLF_XRESOLV) {
1147 			NCE_REFRELE(nce);
1148 			return (0);
1149 		}
1150 
1151 		mutex_enter(&nce->nce_lock);
1152 		if (nce->nce_state != ND_INCOMPLETE) {
1153 			mutex_exit(&nce->nce_lock);
1154 			NCE_REFRELE(nce);
1155 			return (0);
1156 		}
1157 		if (nce->nce_rcnt == 0) {
1158 			/* The caller will free mp */
1159 			mutex_exit(&nce->nce_lock);
1160 			ndp_delete(nce);
1161 			NCE_REFRELE(nce);
1162 			return (ESRCH);
1163 		}
1164 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1165 		if (mp_nce == NULL) {
1166 			/* The caller will free mp */
1167 			mutex_exit(&nce->nce_lock);
1168 			ndp_delete(nce);
1169 			NCE_REFRELE(nce);
1170 			return (ENOMEM);
1171 		}
1172 		nce_queue_mp(nce, mp_nce);
1173 		ip_ndp_resolve(nce);
1174 		mutex_exit(&nce->nce_lock);
1175 		NCE_REFRELE(nce);
1176 		return (EINPROGRESS);
1177 	case EEXIST:
1178 		/* Resolution in progress just queue the packet */
1179 		mutex_enter(&nce->nce_lock);
1180 		if (nce->nce_state == ND_INCOMPLETE) {
1181 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1182 			if (mp_nce == NULL) {
1183 				err = ENOMEM;
1184 			} else {
1185 				nce_queue_mp(nce, mp_nce);
1186 				err = EINPROGRESS;
1187 			}
1188 		} else {
1189 			/*
1190 			 * Any other state implies we have
1191 			 * a nce but IRE needs to be added ...
1192 			 * ire_add_v6() will take care of the
1193 			 * the case when the nce becomes CONDEMNED
1194 			 * before the ire is added to the table.
1195 			 */
1196 			err = 0;
1197 		}
1198 		mutex_exit(&nce->nce_lock);
1199 		NCE_REFRELE(nce);
1200 		break;
1201 	default:
1202 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1203 		break;
1204 	}
1205 	return (err);
1206 }
1207 
1208 /*
1209  * When there is no resolver, the link layer template is passed in
1210  * the IRE.
1211  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1212  * or one is created, we defer making ire point to nce until the
1213  * ire is actually added at which point the nce_refcnt on the nce is
1214  * incremented.  This is done primarily to have symmetry between ire_add()
1215  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1216  */
1217 int
1218 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1219 {
1220 	nce_t		*nce;
1221 	int		err = 0;
1222 
1223 	ASSERT(ill != NULL);
1224 	ASSERT(ill->ill_isv6);
1225 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1226 		err = nce_set_multicast(ill, dst);
1227 		return (err);
1228 	}
1229 
1230 	err = ndp_lookup_then_add_v6(ill,
1231 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1232 	    NULL,	/* hardware address */
1233 	    dst,
1234 	    &ipv6_all_ones,
1235 	    &ipv6_all_zeros,
1236 	    0,
1237 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1238 	    ND_REACHABLE,
1239 	    &nce);
1240 
1241 	switch (err) {
1242 	case 0:
1243 		/*
1244 		 * Cache entry with a proper resolver cookie was
1245 		 * created.
1246 		 */
1247 		NCE_REFRELE(nce);
1248 		break;
1249 	case EEXIST:
1250 		err = 0;
1251 		NCE_REFRELE(nce);
1252 		break;
1253 	default:
1254 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1255 		break;
1256 	}
1257 	return (err);
1258 }
1259 
1260 /*
1261  * For each interface an entry is added for the unspecified multicast group.
1262  * Here that mapping is used to form the multicast cache entry for a particular
1263  * multicast destination.
1264  */
1265 static int
1266 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1267 {
1268 	nce_t		*mnce;	/* Multicast mapping entry */
1269 	nce_t		*nce;
1270 	uchar_t		*hw_addr = NULL;
1271 	int		err = 0;
1272 	ip_stack_t	*ipst = ill->ill_ipst;
1273 
1274 	ASSERT(ill != NULL);
1275 	ASSERT(ill->ill_isv6);
1276 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1277 
1278 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1279 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1280 	nce = nce_lookup_addr(ill, B_FALSE, dst, nce);
1281 	if (nce != NULL) {
1282 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1283 		NCE_REFRELE(nce);
1284 		return (0);
1285 	}
1286 	/* No entry, now lookup for a mapping this should never fail */
1287 	mnce = nce_lookup_mapping(ill, dst);
1288 	if (mnce == NULL) {
1289 		/* Something broken for the interface. */
1290 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1291 		return (ESRCH);
1292 	}
1293 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1294 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1295 		/*
1296 		 * For IRE_IF_RESOLVER a hardware mapping can be
1297 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1298 		 * in the ill is copied in ndp_add_v6().
1299 		 */
1300 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1301 		if (hw_addr == NULL) {
1302 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1303 			NCE_REFRELE(mnce);
1304 			return (ENOMEM);
1305 		}
1306 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1307 	}
1308 	NCE_REFRELE(mnce);
1309 	/*
1310 	 * IRE_IF_NORESOLVER type simply copies the resolution
1311 	 * cookie passed in.  So no hw_addr is needed.
1312 	 */
1313 	err = ndp_add_v6(ill,
1314 	    hw_addr,
1315 	    dst,
1316 	    &ipv6_all_ones,
1317 	    &ipv6_all_zeros,
1318 	    0,
1319 	    NCE_F_NONUD,
1320 	    ND_REACHABLE,
1321 	    &nce);
1322 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1323 	if (hw_addr != NULL)
1324 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1325 	if (err != 0) {
1326 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1327 		return (err);
1328 	}
1329 	NCE_REFRELE(nce);
1330 	return (0);
1331 }
1332 
1333 /*
1334  * Return the link layer address, and any flags of a nce.
1335  */
1336 int
1337 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1338 {
1339 	nce_t		*nce;
1340 	in6_addr_t	*addr;
1341 	sin6_t		*sin6;
1342 	dl_unitdata_req_t	*dl;
1343 
1344 	ASSERT(ill != NULL && ill->ill_isv6);
1345 	sin6 = (sin6_t *)&lnr->lnr_addr;
1346 	addr =  &sin6->sin6_addr;
1347 
1348 	/*
1349 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1350 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1351 	 * addresses for the data addresses on an IPMP interface even though
1352 	 * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill.
1353 	 */
1354 	nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE);
1355 	if (nce == NULL)
1356 		return (ESRCH);
1357 	/* If in INCOMPLETE state, no link layer address is available yet */
1358 	if (nce->nce_state == ND_INCOMPLETE)
1359 		goto done;
1360 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1361 	if (ill->ill_flags & ILLF_XRESOLV)
1362 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1363 	else
1364 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1365 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1366 	    sizeof (lnr->lnr_hdw_addr));
1367 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1368 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1369 	if (nce->nce_flags & NCE_F_ISROUTER)
1370 		lnr->lnr_flags = NDF_ISROUTER_ON;
1371 	if (nce->nce_flags & NCE_F_ANYCAST)
1372 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1373 done:
1374 	NCE_REFRELE(nce);
1375 	return (0);
1376 }
1377 
1378 /*
1379  * Send Enable/Disable multicast reqs to driver.
1380  */
1381 int
1382 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1383     uint32_t hw_addr_offset, mblk_t *mp)
1384 {
1385 	nce_t		*nce;
1386 	uchar_t		*hw_addr;
1387 	ip_stack_t	*ipst = ill->ill_ipst;
1388 
1389 	ASSERT(ill != NULL && ill->ill_isv6);
1390 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1391 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1392 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1393 		freemsg(mp);
1394 		return (EINVAL);
1395 	}
1396 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1397 	nce = nce_lookup_mapping(ill, addr);
1398 	if (nce == NULL) {
1399 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1400 		freemsg(mp);
1401 		return (ESRCH);
1402 	}
1403 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1404 	/*
1405 	 * Update dl_addr_length and dl_addr_offset for primitives that
1406 	 * have physical addresses as opposed to full saps
1407 	 */
1408 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1409 	case DL_ENABMULTI_REQ:
1410 		/* Track the state if this is the first enabmulti */
1411 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1412 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1413 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1414 		break;
1415 	case DL_DISABMULTI_REQ:
1416 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1417 		break;
1418 	default:
1419 		NCE_REFRELE(nce);
1420 		ip1dbg(("ndp_mcastreq: default\n"));
1421 		return (EINVAL);
1422 	}
1423 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1424 	NCE_REFRELE(nce);
1425 	ill_dlpi_send(ill, mp);
1426 	return (0);
1427 }
1428 
1429 
1430 /*
1431  * Send out a NS for resolving the ip address in nce.
1432  */
1433 void
1434 ip_ndp_resolve(nce_t *nce)
1435 {
1436 	in6_addr_t	sender6 = ipv6_all_zeros;
1437 	uint32_t	ms;
1438 	mblk_t		*mp;
1439 	ip6_t		*ip6h;
1440 
1441 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1442 	/*
1443 	 * Pick the src from outgoing packet, if one is available.
1444 	 * Otherwise let nce_xmit figure out the src.
1445 	 */
1446 	if ((mp = nce->nce_qd_mp) != NULL) {
1447 		/* Handle ip_newroute_v6 giving us IPSEC packets */
1448 		if (mp->b_datap->db_type == M_CTL)
1449 			mp = mp->b_cont;
1450 		ip6h = (ip6_t *)mp->b_rptr;
1451 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
1452 			/*
1453 			 * This message should have been pulled up already in
1454 			 * ip_wput_v6. We can't do pullups here because
1455 			 * the message could be from the nce_qd_mp which could
1456 			 * have b_next/b_prev non-NULL.
1457 			 */
1458 			ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
1459 			ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1460 		}
1461 		sender6 = ip6h->ip6_src;
1462 	}
1463 	ms = nce_solicit(nce, sender6);
1464 	mutex_exit(&nce->nce_lock);
1465 	if (ms == 0) {
1466 		if (nce->nce_state != ND_REACHABLE) {
1467 			nce_resolv_failed(nce);
1468 			ndp_delete(nce);
1469 		}
1470 	} else {
1471 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1472 	}
1473 	mutex_enter(&nce->nce_lock);
1474 }
1475 
1476 /*
1477  * Send a neighbor solicitation.
1478  * Returns number of milliseconds after which we should either rexmit or abort.
1479  * Return of zero means we should abort.
1480  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1481  *
1482  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1483  * the packet.
1484  */
1485 uint32_t
1486 nce_solicit(nce_t *nce, in6_addr_t sender)
1487 {
1488 	boolean_t	dropped;
1489 
1490 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
1491 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1492 
1493 	if (nce->nce_rcnt == 0)
1494 		return (0);
1495 
1496 	nce->nce_rcnt--;
1497 	mutex_exit(&nce->nce_lock);
1498 	dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0);
1499 	mutex_enter(&nce->nce_lock);
1500 	if (dropped)
1501 		nce->nce_rcnt++;
1502 	return (nce->nce_ill->ill_reachable_retrans_time);
1503 }
1504 
1505 /*
1506  * Attempt to recover an address on an interface that's been marked as a
1507  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1508  * no easy way to just probe the address and have the right thing happen if
1509  * it's no longer in use.  Instead, we just bring it up normally and allow the
1510  * regular interface start-up logic to probe for a remaining duplicate and take
1511  * us back down if necessary.
1512  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1513  * ip_ndp_excl.
1514  */
1515 /* ARGSUSED */
1516 static void
1517 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1518 {
1519 	ill_t	*ill = rq->q_ptr;
1520 	ipif_t	*ipif;
1521 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1522 
1523 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1524 		/*
1525 		 * We do not support recovery of proxy ARP'd interfaces,
1526 		 * because the system lacks a complete proxy ARP mechanism.
1527 		 */
1528 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1529 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1530 			continue;
1531 		}
1532 
1533 		/*
1534 		 * If we have already recovered or if the interface is going
1535 		 * away, then ignore.
1536 		 */
1537 		mutex_enter(&ill->ill_lock);
1538 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1539 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1540 			mutex_exit(&ill->ill_lock);
1541 			continue;
1542 		}
1543 
1544 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1545 		ill->ill_ipif_dup_count--;
1546 		mutex_exit(&ill->ill_lock);
1547 		ipif->ipif_was_dup = B_TRUE;
1548 
1549 		VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1550 		(void) ipif_up_done_v6(ipif);
1551 	}
1552 	freeb(mp);
1553 }
1554 
1555 /*
1556  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1557  * As long as someone else holds the address, the interface will stay down.
1558  * When that conflict goes away, the interface is brought back up.  This is
1559  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1560  * server will recover from a failure.
1561  *
1562  * For DHCP and temporary addresses, recovery is not done in the kernel.
1563  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1564  *
1565  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1566  */
1567 static void
1568 ipif6_dup_recovery(void *arg)
1569 {
1570 	ipif_t *ipif = arg;
1571 
1572 	ipif->ipif_recovery_id = 0;
1573 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1574 		return;
1575 
1576 	/*
1577 	 * No lock, because this is just an optimization.
1578 	 */
1579 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1580 		return;
1581 
1582 	/* If the link is down, we'll retry this later */
1583 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1584 		return;
1585 
1586 	ndp_do_recovery(ipif);
1587 }
1588 
1589 /*
1590  * Perform interface recovery by forcing the duplicate interfaces up and
1591  * allowing the system to determine which ones should stay up.
1592  *
1593  * Called both by recovery timer expiry and link-up notification.
1594  */
1595 void
1596 ndp_do_recovery(ipif_t *ipif)
1597 {
1598 	ill_t *ill = ipif->ipif_ill;
1599 	mblk_t *mp;
1600 	ip_stack_t *ipst = ill->ill_ipst;
1601 
1602 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1603 	if (mp == NULL) {
1604 		mutex_enter(&ill->ill_lock);
1605 		if (ipif->ipif_recovery_id == 0 &&
1606 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1607 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1608 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1609 		}
1610 		mutex_exit(&ill->ill_lock);
1611 	} else {
1612 		/*
1613 		 * A recovery timer may still be running if we got here from
1614 		 * ill_restart_dad(); cancel that timer.
1615 		 */
1616 		if (ipif->ipif_recovery_id != 0)
1617 			(void) untimeout(ipif->ipif_recovery_id);
1618 		ipif->ipif_recovery_id = 0;
1619 
1620 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1621 		    sizeof (ipif->ipif_v6lcl_addr));
1622 		ill_refhold(ill);
1623 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1624 		    B_FALSE);
1625 	}
1626 }
1627 
1628 /*
1629  * Find the MAC and IP addresses in an NA/NS message.
1630  */
1631 static void
1632 ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp,
1633     uchar_t **haddr, uint_t *haddrlenp)
1634 {
1635 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1636 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1637 	nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
1638 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1639 	uchar_t *addr;
1640 	int alen = 0;
1641 
1642 	if (dl_mp == NULL) {
1643 		nd_opt_hdr_t *opt = NULL;
1644 		int len;
1645 
1646 		/*
1647 		 * If it's from the fast-path, then it can't be a probe
1648 		 * message, and thus must include a linkaddr option.
1649 		 * Extract that here.
1650 		 */
1651 		switch (icmp6->icmp6_type) {
1652 		case ND_NEIGHBOR_SOLICIT:
1653 			len = mp->b_wptr - (uchar_t *)ns;
1654 			if ((len -= sizeof (*ns)) > 0) {
1655 				opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1),
1656 				    len, ND_OPT_SOURCE_LINKADDR);
1657 			}
1658 			break;
1659 		case ND_NEIGHBOR_ADVERT:
1660 			len = mp->b_wptr - (uchar_t *)na;
1661 			if ((len -= sizeof (*na)) > 0) {
1662 				opt = ndp_get_option((nd_opt_hdr_t *)(na + 1),
1663 				    len, ND_OPT_TARGET_LINKADDR);
1664 			}
1665 			break;
1666 		}
1667 
1668 		if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >=
1669 		    ill->ill_nd_lla_len) {
1670 			addr = (uchar_t *)(opt + 1);
1671 			alen = ill->ill_nd_lla_len;
1672 		}
1673 
1674 		/*
1675 		 * We cheat a bit here for the sake of printing usable log
1676 		 * messages in the rare case where the reply we got was unicast
1677 		 * without a source linkaddr option, and the interface is in
1678 		 * fastpath mode.  (Sigh.)
1679 		 */
1680 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1681 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1682 			struct ether_header *pether;
1683 
1684 			pether = (struct ether_header *)((char *)ip6h -
1685 			    sizeof (*pether));
1686 			addr = pether->ether_shost.ether_addr_octet;
1687 			alen = ETHERADDRL;
1688 		}
1689 	} else {
1690 		dl_unitdata_ind_t *dlu;
1691 
1692 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1693 		alen = dlu->dl_src_addr_length;
1694 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1695 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1696 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1697 			if (ill->ill_sap_length < 0) {
1698 				alen += ill->ill_sap_length;
1699 			} else {
1700 				addr += ill->ill_sap_length;
1701 				alen -= ill->ill_sap_length;
1702 			}
1703 		}
1704 	}
1705 
1706 	if (alen > 0) {
1707 		*haddr = addr;
1708 		*haddrlenp = alen;
1709 	} else {
1710 		*haddr = NULL;
1711 		*haddrlenp = 0;
1712 	}
1713 
1714 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1715 	*targp = ns->nd_ns_target;
1716 }
1717 
1718 /*
1719  * This is for exclusive changes due to NDP duplicate address detection
1720  * failure.
1721  */
1722 /* ARGSUSED */
1723 static void
1724 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1725 {
1726 	ill_t	*ill = rq->q_ptr;
1727 	ipif_t	*ipif;
1728 	mblk_t	*dl_mp = NULL;
1729 	uchar_t	*haddr;
1730 	uint_t	haddrlen;
1731 	ip_stack_t *ipst = ill->ill_ipst;
1732 	in6_addr_t targ;
1733 
1734 	if (DB_TYPE(mp) != M_DATA) {
1735 		dl_mp = mp;
1736 		mp = mp->b_cont;
1737 	}
1738 
1739 	ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1740 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1741 		/*
1742 		 * Ignore conflicts generated by misbehaving switches that
1743 		 * just reflect our own messages back to us.  For IPMP, we may
1744 		 * see reflections across any ill in the illgrp.
1745 		 */
1746 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1747 		    IS_UNDER_IPMP(ill) &&
1748 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL)
1749 			goto ignore_conflict;
1750 	}
1751 
1752 	/*
1753 	 * Look up the appropriate ipif.
1754 	 */
1755 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL,
1756 	    NULL, ipst);
1757 	if (ipif == NULL)
1758 		goto ignore_conflict;
1759 
1760 	/* Reload the ill to match the ipif */
1761 	ill = ipif->ipif_ill;
1762 
1763 	/* If it's already duplicate or ineligible, then don't do anything. */
1764 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1765 		ipif_refrele(ipif);
1766 		goto ignore_conflict;
1767 	}
1768 
1769 	/*
1770 	 * If this is a failure during duplicate recovery, then don't
1771 	 * complain.  It may take a long time to recover.
1772 	 */
1773 	if (!ipif->ipif_was_dup) {
1774 		char ibuf[LIFNAMSIZ];
1775 		char hbuf[MAC_STR_LEN];
1776 		char sbuf[INET6_ADDRSTRLEN];
1777 
1778 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1779 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1780 		    " disabled", ibuf,
1781 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1782 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1783 	}
1784 	mutex_enter(&ill->ill_lock);
1785 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1786 	ipif->ipif_flags |= IPIF_DUPLICATE;
1787 	ill->ill_ipif_dup_count++;
1788 	mutex_exit(&ill->ill_lock);
1789 	(void) ipif_down(ipif, NULL, NULL);
1790 	ipif_down_tail(ipif);
1791 	mutex_enter(&ill->ill_lock);
1792 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1793 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1794 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1795 	    ipst->ips_ip_dup_recovery > 0) {
1796 		ASSERT(ipif->ipif_recovery_id == 0);
1797 		ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1798 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1799 	}
1800 	mutex_exit(&ill->ill_lock);
1801 	ipif_refrele(ipif);
1802 ignore_conflict:
1803 	if (dl_mp != NULL)
1804 		freeb(dl_mp);
1805 	freemsg(mp);
1806 }
1807 
1808 /*
1809  * Handle failure by tearing down the ipifs with the specified address.  Note
1810  * that tearing down the ipif also means deleting the nce through ipif_down, so
1811  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1812  * we start a timer on the ipif.
1813  */
1814 static void
1815 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1816 {
1817 	if ((mp = copymsg(mp)) != NULL) {
1818 		if (dl_mp == NULL)
1819 			dl_mp = mp;
1820 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1821 			dl_mp->b_cont = mp;
1822 		if (dl_mp == NULL) {
1823 			freemsg(mp);
1824 		} else {
1825 			ill_refhold(ill);
1826 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1827 			    B_FALSE);
1828 		}
1829 	}
1830 }
1831 
1832 /*
1833  * Handle a discovered conflict: some other system is advertising that it owns
1834  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1835  * interface.
1836  */
1837 static void
1838 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1839 {
1840 	ipif_t *ipif;
1841 	uint32_t now;
1842 	uint_t maxdefense;
1843 	uint_t defs;
1844 	ip_stack_t *ipst = ill->ill_ipst;
1845 
1846 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1847 	    NULL, NULL, ipst);
1848 	if (ipif == NULL)
1849 		return;
1850 
1851 	/*
1852 	 * First, figure out if this address is disposable.
1853 	 */
1854 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1855 		maxdefense = ipst->ips_ip_max_temp_defend;
1856 	else
1857 		maxdefense = ipst->ips_ip_max_defend;
1858 
1859 	/*
1860 	 * Now figure out how many times we've defended ourselves.  Ignore
1861 	 * defenses that happened long in the past.
1862 	 */
1863 	now = gethrestime_sec();
1864 	mutex_enter(&nce->nce_lock);
1865 	if ((defs = nce->nce_defense_count) > 0 &&
1866 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1867 		nce->nce_defense_count = defs = 0;
1868 	}
1869 	nce->nce_defense_count++;
1870 	nce->nce_defense_time = now;
1871 	mutex_exit(&nce->nce_lock);
1872 	ipif_refrele(ipif);
1873 
1874 	/*
1875 	 * If we've defended ourselves too many times already, then give up and
1876 	 * tear down the interface(s) using this address.  Otherwise, defend by
1877 	 * sending out an unsolicited Neighbor Advertisement.
1878 	 */
1879 	if (defs >= maxdefense) {
1880 		ip_ndp_failure(ill, mp, dl_mp);
1881 	} else {
1882 		char hbuf[MAC_STR_LEN];
1883 		char sbuf[INET6_ADDRSTRLEN];
1884 		uchar_t *haddr;
1885 		uint_t haddrlen;
1886 		in6_addr_t targ;
1887 
1888 		ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1889 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1890 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)),
1891 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1892 		    ill->ill_name);
1893 
1894 		(void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0);
1895 	}
1896 }
1897 
1898 static void
1899 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1900 {
1901 	nd_neighbor_solicit_t *ns;
1902 	uint32_t	hlen = ill->ill_nd_lla_len;
1903 	uchar_t		*haddr = NULL;
1904 	icmp6_t		*icmp_nd;
1905 	ip6_t		*ip6h;
1906 	nce_t		*our_nce = NULL;
1907 	in6_addr_t	target;
1908 	in6_addr_t	src;
1909 	int		len;
1910 	int		flag = 0;
1911 	nd_opt_hdr_t	*opt = NULL;
1912 	boolean_t	bad_solicit = B_FALSE;
1913 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1914 
1915 	ip6h = (ip6_t *)mp->b_rptr;
1916 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1917 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1918 	src = ip6h->ip6_src;
1919 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1920 	target = ns->nd_ns_target;
1921 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1922 		if (ip_debug > 2) {
1923 			/* ip1dbg */
1924 			pr_addr_dbg("ndp_input_solicit: Target is"
1925 			    " multicast! %s\n", AF_INET6, &target);
1926 		}
1927 		bad_solicit = B_TRUE;
1928 		goto done;
1929 	}
1930 	if (len > sizeof (nd_neighbor_solicit_t)) {
1931 		/* Options present */
1932 		opt = (nd_opt_hdr_t *)&ns[1];
1933 		len -= sizeof (nd_neighbor_solicit_t);
1934 		if (!ndp_verify_optlen(opt, len)) {
1935 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1936 			bad_solicit = B_TRUE;
1937 			goto done;
1938 		}
1939 
1940 	}
1941 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1942 		/* Check to see if this is a valid DAD solicitation */
1943 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1944 			if (ip_debug > 2) {
1945 				/* ip1dbg */
1946 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1947 				    "Destination is not solicited node "
1948 				    "multicast %s\n", AF_INET6,
1949 				    &ip6h->ip6_dst);
1950 			}
1951 			bad_solicit = B_TRUE;
1952 			goto done;
1953 		}
1954 	}
1955 
1956 	/*
1957 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1958 	 * received this packet if it's multicast) is not the ill tied to
1959 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1960 	 * to ensure we find the associated NCE.
1961 	 */
1962 	our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE);
1963 	/*
1964 	 * If this is a valid Solicitation, a permanent
1965 	 * entry should exist in the cache
1966 	 */
1967 	if (our_nce == NULL ||
1968 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1969 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1970 		    "ifname=%s ", ill->ill_name));
1971 		if (ip_debug > 2) {
1972 			/* ip1dbg */
1973 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1974 		}
1975 		bad_solicit = B_TRUE;
1976 		goto done;
1977 	}
1978 
1979 	/* At this point we should have a verified NS per spec */
1980 	if (opt != NULL) {
1981 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1982 		if (opt != NULL) {
1983 			haddr = (uchar_t *)&opt[1];
1984 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1985 			    hlen == 0) {
1986 				ip1dbg(("ndp_input_solicit: bad SLLA\n"));
1987 				bad_solicit = B_TRUE;
1988 				goto done;
1989 			}
1990 		}
1991 	}
1992 
1993 	/* If sending directly to peer, set the unicast flag */
1994 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1995 		flag |= NDP_UNICAST;
1996 
1997 	/*
1998 	 * Create/update the entry for the soliciting node.
1999 	 * or respond to outstanding queries, don't if
2000 	 * the source is unspecified address.
2001 	 */
2002 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
2003 		int	err;
2004 		nce_t	*nnce;
2005 
2006 		ASSERT(ill->ill_isv6);
2007 		/*
2008 		 * Regular solicitations *must* include the Source Link-Layer
2009 		 * Address option.  Ignore messages that do not.
2010 		 */
2011 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
2012 			ip1dbg(("ndp_input_solicit: source link-layer address "
2013 			    "option missing with a specified source.\n"));
2014 			bad_solicit = B_TRUE;
2015 			goto done;
2016 		}
2017 
2018 		/*
2019 		 * This is a regular solicitation.  If we're still in the
2020 		 * process of verifying the address, then don't respond at all
2021 		 * and don't keep track of the sender.
2022 		 */
2023 		if (our_nce->nce_state == ND_PROBE)
2024 			goto done;
2025 
2026 		/*
2027 		 * If the solicitation doesn't have sender hardware address
2028 		 * (legal for unicast solicitation), then process without
2029 		 * installing the return NCE.  Either we already know it, or
2030 		 * we'll be forced to look it up when (and if) we reply to the
2031 		 * packet.
2032 		 */
2033 		if (haddr == NULL)
2034 			goto no_source;
2035 
2036 		err = ndp_lookup_then_add_v6(ill,
2037 		    B_FALSE,
2038 		    haddr,
2039 		    &src,	/* Soliciting nodes address */
2040 		    &ipv6_all_ones,
2041 		    &ipv6_all_zeros,
2042 		    0,
2043 		    0,
2044 		    ND_STALE,
2045 		    &nnce);
2046 		switch (err) {
2047 		case 0:
2048 			/* done with this entry */
2049 			NCE_REFRELE(nnce);
2050 			break;
2051 		case EEXIST:
2052 			/*
2053 			 * B_FALSE indicates this is not an an advertisement.
2054 			 */
2055 			ndp_process(nnce, haddr, 0, B_FALSE);
2056 			NCE_REFRELE(nnce);
2057 			break;
2058 		default:
2059 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2060 			    err));
2061 			goto done;
2062 		}
2063 no_source:
2064 		flag |= NDP_SOLICITED;
2065 	} else {
2066 		/*
2067 		 * No source link layer address option should be present in a
2068 		 * valid DAD request.
2069 		 */
2070 		if (haddr != NULL) {
2071 			ip1dbg(("ndp_input_solicit: source link-layer address "
2072 			    "option present with an unspecified source.\n"));
2073 			bad_solicit = B_TRUE;
2074 			goto done;
2075 		}
2076 		if (our_nce->nce_state == ND_PROBE) {
2077 			/*
2078 			 * Internally looped-back probes won't have DLPI
2079 			 * attached to them.  External ones (which are sent by
2080 			 * multicast) always will.  Just ignore our own
2081 			 * transmissions.
2082 			 */
2083 			if (dl_mp != NULL) {
2084 				/*
2085 				 * If someone else is probing our address, then
2086 				 * we've crossed wires.  Declare failure.
2087 				 */
2088 				ip_ndp_failure(ill, mp, dl_mp);
2089 			}
2090 			goto done;
2091 		}
2092 		/*
2093 		 * This is a DAD probe.  Multicast the advertisement to the
2094 		 * all-nodes address.
2095 		 */
2096 		src = ipv6_all_hosts_mcast;
2097 	}
2098 	/* Response to a solicitation */
2099 	(void) nce_xmit_advert(our_nce, B_TRUE, &src, flag);
2100 done:
2101 	if (bad_solicit)
2102 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2103 	if (our_nce != NULL)
2104 		NCE_REFRELE(our_nce);
2105 }
2106 
2107 void
2108 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2109 {
2110 	nd_neighbor_advert_t *na;
2111 	uint32_t	hlen = ill->ill_nd_lla_len;
2112 	uchar_t		*haddr = NULL;
2113 	icmp6_t		*icmp_nd;
2114 	ip6_t		*ip6h;
2115 	nce_t		*dst_nce = NULL;
2116 	in6_addr_t	target;
2117 	nd_opt_hdr_t	*opt = NULL;
2118 	int		len;
2119 	ip_stack_t	*ipst = ill->ill_ipst;
2120 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2121 
2122 	ip6h = (ip6_t *)mp->b_rptr;
2123 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2124 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2125 	na = (nd_neighbor_advert_t *)icmp_nd;
2126 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2127 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2128 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2129 		    "solicited flag is not zero\n"));
2130 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2131 		return;
2132 	}
2133 	target = na->nd_na_target;
2134 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2135 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2136 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2137 		return;
2138 	}
2139 	if (len > sizeof (nd_neighbor_advert_t)) {
2140 		opt = (nd_opt_hdr_t *)&na[1];
2141 		if (!ndp_verify_optlen(opt,
2142 		    len - sizeof (nd_neighbor_advert_t))) {
2143 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2144 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2145 			return;
2146 		}
2147 		/* At this point we have a verified NA per spec */
2148 		len -= sizeof (nd_neighbor_advert_t);
2149 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2150 		if (opt != NULL) {
2151 			haddr = (uchar_t *)&opt[1];
2152 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2153 			    hlen == 0) {
2154 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2155 				BUMP_MIB(mib,
2156 				    ipv6IfIcmpInBadNeighborAdvertisements);
2157 				return;
2158 			}
2159 		}
2160 	}
2161 
2162 	/*
2163 	 * NOTE: we match across the illgrp since we need to do DAD for all of
2164 	 * our local addresses, and those are spread across all the active
2165 	 * ills in the group.
2166 	 */
2167 	if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL)
2168 		return;
2169 
2170 	if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2171 		/*
2172 		 * Someone just advertised one of our local addresses.	First,
2173 		 * check it it was us -- if so, we can safely ignore it.
2174 		 */
2175 		if (haddr != NULL) {
2176 			if (!nce_cmp_ll_addr(dst_nce, haddr, hlen))
2177 				goto out;	/* from us -- no conflict */
2178 
2179 			/*
2180 			 * If we're in an IPMP group, check if this is an echo
2181 			 * from another ill in the group.  Use the double-
2182 			 * checked locking pattern to avoid grabbing
2183 			 * ill_g_lock in the non-IPMP case.
2184 			 */
2185 			if (IS_UNDER_IPMP(ill)) {
2186 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2187 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2188 				    ill->ill_grp, haddr, hlen) != NULL) {
2189 					rw_exit(&ipst->ips_ill_g_lock);
2190 					goto out;
2191 				}
2192 				rw_exit(&ipst->ips_ill_g_lock);
2193 			}
2194 		}
2195 
2196 		/*
2197 		 * Our own (looped-back) unsolicited neighbor advertisements
2198 		 * will get here with dl_mp == NULL.  (These will usually be
2199 		 * filtered by the `haddr' checks above, but point-to-point
2200 		 * links have no hardware address and thus make it here.)
2201 		 */
2202 		if (dl_mp == NULL && dst_nce->nce_state != ND_PROBE)
2203 			goto out;
2204 
2205 		/*
2206 		 * This appears to be a real conflict.  If we're trying to
2207 		 * configure this NCE (ND_PROBE), then shut it down.
2208 		 * Otherwise, handle the discovered conflict.
2209 		 *
2210 		 * In the ND_PROBE case, dl_mp might be NULL if we're getting
2211 		 * a unicast reply.  This isn't typically done (multicast is
2212 		 * the norm in response to a probe), but we can handle it.
2213 		 */
2214 		if (dst_nce->nce_state == ND_PROBE)
2215 			ip_ndp_failure(ill, mp, dl_mp);
2216 		else
2217 			ip_ndp_conflict(ill, mp, dl_mp, dst_nce);
2218 	} else {
2219 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2220 			dst_nce->nce_flags |= NCE_F_ISROUTER;
2221 
2222 		/* B_TRUE indicates this an advertisement */
2223 		ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE);
2224 	}
2225 out:
2226 	NCE_REFRELE(dst_nce);
2227 }
2228 
2229 /*
2230  * Process NDP neighbor solicitation/advertisement messages.
2231  * The checksum has already checked o.k before reaching here.
2232  */
2233 void
2234 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2235 {
2236 	icmp6_t		*icmp_nd;
2237 	ip6_t		*ip6h;
2238 	int		len;
2239 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2240 
2241 
2242 	if (!pullupmsg(mp, -1)) {
2243 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2244 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2245 		goto done;
2246 	}
2247 	ip6h = (ip6_t *)mp->b_rptr;
2248 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2249 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2250 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2251 		goto done;
2252 	}
2253 	/*
2254 	 * NDP does not accept any extension headers between the
2255 	 * IP header and the ICMP header since e.g. a routing
2256 	 * header could be dangerous.
2257 	 * This assumes that any AH or ESP headers are removed
2258 	 * by ip prior to passing the packet to ndp_input.
2259 	 */
2260 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2261 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2262 		    ip6h->ip6_nxt));
2263 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2264 		goto done;
2265 	}
2266 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2267 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2268 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2269 	if (icmp_nd->icmp6_code != 0) {
2270 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2271 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2272 		goto done;
2273 	}
2274 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2275 	/*
2276 	 * Make sure packet length is large enough for either
2277 	 * a NS or a NA icmp packet.
2278 	 */
2279 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2280 		ip1dbg(("ndp_input: packet too short\n"));
2281 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2282 		goto done;
2283 	}
2284 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2285 		ndp_input_solicit(ill, mp, dl_mp);
2286 	} else {
2287 		ndp_input_advert(ill, mp, dl_mp);
2288 	}
2289 done:
2290 	freemsg(mp);
2291 }
2292 
2293 /*
2294  * Utility routine to send an advertisement.  Assumes that the NCE cannot
2295  * go away (e.g., because it's refheld).
2296  */
2297 static boolean_t
2298 nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target,
2299     uint_t flags)
2300 {
2301 	ASSERT((flags & NDP_PROBE) == 0);
2302 
2303 	if (nce->nce_flags & NCE_F_ISROUTER)
2304 		flags |= NDP_ISROUTER;
2305 	if (!(nce->nce_flags & NCE_F_ANYCAST))
2306 		flags |= NDP_ORIDE;
2307 
2308 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla,
2309 	    &nce->nce_addr, target, flags));
2310 }
2311 
2312 /*
2313  * Utility routine to send a solicitation.  Assumes that the NCE cannot
2314  * go away (e.g., because it's refheld).
2315  */
2316 static boolean_t
2317 nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
2318     uint_t flags)
2319 {
2320 	if (flags & NDP_PROBE)
2321 		sender = &ipv6_all_zeros;
2322 
2323 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla,
2324 	    sender, &nce->nce_addr, flags));
2325 }
2326 
2327 /*
2328  * nce_xmit is called to form and transmit a ND solicitation or
2329  * advertisement ICMP packet.
2330  *
2331  * If the source address is unspecified and this isn't a probe (used for
2332  * duplicate address detection), an appropriate source address and link layer
2333  * address will be chosen here.  The link layer address option is included if
2334  * the source is specified (i.e., all non-probe packets), and omitted (per the
2335  * specification) otherwise.
2336  *
2337  * It returns B_FALSE only if it does a successful put() to the
2338  * corresponding ill's ill_wq otherwise returns B_TRUE.
2339  */
2340 static boolean_t
2341 nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
2342     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2343 {
2344 	ill_t		*hwaddr_ill;
2345 	uint32_t	len;
2346 	icmp6_t 	*icmp6;
2347 	mblk_t		*mp;
2348 	ip6_t		*ip6h;
2349 	nd_opt_hdr_t	*opt;
2350 	uint_t		plen, maxplen;
2351 	ip6i_t		*ip6i;
2352 	ipif_t		*src_ipif = NULL;
2353 	uint8_t		*hw_addr;
2354 	zoneid_t	zoneid = GLOBAL_ZONEID;
2355 	char		buf[INET6_ADDRSTRLEN];
2356 
2357 	ASSERT(!IS_IPMP(ill));
2358 
2359 	/*
2360 	 * Check that the sender is actually a usable address on `ill', and if
2361 	 * so, track that as the src_ipif.  If not, for solicitations, set the
2362 	 * sender to :: so that a new one will be picked below; for adverts,
2363 	 * drop the packet since we expect nce_xmit_advert() to always provide
2364 	 * a valid sender.
2365 	 */
2366 	if (!IN6_IS_ADDR_UNSPECIFIED(sender)) {
2367 		if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL ||
2368 		    !src_ipif->ipif_addr_ready) {
2369 			if (src_ipif != NULL) {
2370 				ipif_refrele(src_ipif);
2371 				src_ipif = NULL;
2372 			}
2373 			if (type == ND_NEIGHBOR_ADVERT) {
2374 				ip1dbg(("nce_xmit: No source ipif for src %s\n",
2375 				    inet_ntop(AF_INET6, sender, buf,
2376 				    sizeof (buf))));
2377 				return (B_TRUE);
2378 			}
2379 			sender = &ipv6_all_zeros;
2380 		}
2381 	}
2382 
2383 	/*
2384 	 * If we still have an unspecified source (sender) address and this
2385 	 * isn't a probe, select a source address from `ill'.
2386 	 */
2387 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2388 		ASSERT(type != ND_NEIGHBOR_ADVERT);
2389 		/*
2390 		 * Pick a source address for this solicitation, but restrict
2391 		 * the selection to addresses assigned to the output
2392 		 * interface.  We do this because the destination will create
2393 		 * a neighbor cache entry for the source address of this
2394 		 * packet, so the source address needs to be a valid neighbor.
2395 		 */
2396 		src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
2397 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2398 		if (src_ipif == NULL) {
2399 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2400 			    inet_ntop(AF_INET6, target, buf, sizeof (buf))));
2401 			return (B_TRUE);
2402 		}
2403 		sender = &src_ipif->ipif_v6src_addr;
2404 	}
2405 
2406 	/*
2407 	 * We're either sending a probe or we have a source address.
2408 	 */
2409 	ASSERT((flag & NDP_PROBE) || src_ipif != NULL);
2410 
2411 	maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8);
2412 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2413 	    maxplen;
2414 	mp = allocb(len,  BPRI_LO);
2415 	if (mp == NULL) {
2416 		if (src_ipif != NULL)
2417 			ipif_refrele(src_ipif);
2418 		return (B_TRUE);
2419 	}
2420 	bzero((char *)mp->b_rptr, len);
2421 	mp->b_wptr = mp->b_rptr + len;
2422 
2423 	ip6i = (ip6i_t *)mp->b_rptr;
2424 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2425 	ip6i->ip6i_nxt = IPPROTO_RAW;
2426 	ip6i->ip6i_flags = IP6I_HOPLIMIT;
2427 	if (flag & NDP_PROBE)
2428 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2429 
2430 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2431 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2432 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2433 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2434 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2435 	ip6h->ip6_src = *sender;
2436 	ip6h->ip6_dst = *target;
2437 	icmp6 = (icmp6_t *)&ip6h[1];
2438 
2439 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2440 	    sizeof (nd_neighbor_advert_t));
2441 
2442 	if (type == ND_NEIGHBOR_SOLICIT) {
2443 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2444 
2445 		if (!(flag & NDP_PROBE))
2446 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2447 		ns->nd_ns_target = *target;
2448 		if (!(flag & NDP_UNICAST)) {
2449 			/* Form multicast address of the target */
2450 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2451 			ip6h->ip6_dst.s6_addr32[3] |=
2452 			    ns->nd_ns_target.s6_addr32[3];
2453 		}
2454 	} else {
2455 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2456 
2457 		ASSERT(!(flag & NDP_PROBE));
2458 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2459 		na->nd_na_target = *sender;
2460 		if (flag & NDP_ISROUTER)
2461 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2462 		if (flag & NDP_SOLICITED)
2463 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2464 		if (flag & NDP_ORIDE)
2465 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2466 	}
2467 
2468 	hw_addr = NULL;
2469 	if (!(flag & NDP_PROBE)) {
2470 		/*
2471 		 * Use our source address to find the hardware address to put
2472 		 * in the packet, so that the hardware address and IP address
2473 		 * will match up -- even if that hardware address doesn't
2474 		 * match the ill we actually transmit the packet through.
2475 		 */
2476 		if (IS_IPMP(src_ipif->ipif_ill)) {
2477 			hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif);
2478 			if (hwaddr_ill == NULL) {
2479 				ip1dbg(("nce_xmit: no bound ill!\n"));
2480 				ipif_refrele(src_ipif);
2481 				freemsg(mp);
2482 				return (B_TRUE);
2483 			}
2484 		} else {
2485 			hwaddr_ill = src_ipif->ipif_ill;
2486 			ill_refhold(hwaddr_ill);	/* for symmetry */
2487 		}
2488 
2489 		plen = roundup(sizeof (nd_opt_hdr_t) +
2490 		    hwaddr_ill->ill_nd_lla_len, 8);
2491 
2492 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2493 		    hwaddr_ill->ill_phys_addr;
2494 		if (hw_addr != NULL) {
2495 			/* Fill in link layer address and option len */
2496 			opt->nd_opt_len = (uint8_t)(plen / 8);
2497 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2498 		}
2499 
2500 		ill_refrele(hwaddr_ill);
2501 	}
2502 
2503 	if (hw_addr == NULL)
2504 		plen = 0;
2505 
2506 	/* Fix up the length of the packet now that plen is known */
2507 	len -= (maxplen - plen);
2508 	mp->b_wptr = mp->b_rptr + len;
2509 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2510 
2511 	icmp6->icmp6_type = type;
2512 	icmp6->icmp6_code = 0;
2513 	/*
2514 	 * Prepare for checksum by putting icmp length in the icmp
2515 	 * checksum field. The checksum is calculated in ip_wput_v6.
2516 	 */
2517 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2518 
2519 	/*
2520 	 * Before we toss the src_ipif, look up the zoneid to pass to
2521 	 * ip_output_v6().  This is to ensure unicast ND_NEIGHBOR_ADVERT
2522 	 * packets to be routed correctly by IP (we cannot guarantee that the
2523 	 * global zone has an interface route to the destination).
2524 	 */
2525 	if (src_ipif != NULL) {
2526 		if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES)
2527 			zoneid = GLOBAL_ZONEID;
2528 		ipif_refrele(src_ipif);
2529 	}
2530 
2531 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2532 	return (B_FALSE);
2533 }
2534 
2535 /*
2536  * Make a link layer address (does not include the SAP) from an nce.
2537  * To form the link layer address, use the last four bytes of ipv6
2538  * address passed in and the fixed offset stored in nce.
2539  */
2540 static void
2541 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2542 {
2543 	uchar_t *mask, *to;
2544 	ill_t	*ill = nce->nce_ill;
2545 	int 	len;
2546 
2547 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2548 		return;
2549 	ASSERT(nce->nce_res_mp != NULL);
2550 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2551 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2552 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2553 	ASSERT(addr != NULL);
2554 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2555 	    addrpos, ill->ill_nd_lla_len);
2556 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2557 	    IPV6_ADDR_LEN);
2558 	mask = (uchar_t *)&nce->nce_extract_mask;
2559 	mask += (IPV6_ADDR_LEN - len);
2560 	addr += (IPV6_ADDR_LEN - len);
2561 	to = addrpos + nce->nce_ll_extract_start;
2562 	while (len-- > 0)
2563 		*to++ |= *mask++ & *addr++;
2564 }
2565 
2566 mblk_t *
2567 nce_udreq_alloc(ill_t *ill)
2568 {
2569 	mblk_t	*template_mp = NULL;
2570 	dl_unitdata_req_t *dlur;
2571 	int	sap_length;
2572 
2573 	ASSERT(ill->ill_isv6);
2574 
2575 	sap_length = ill->ill_sap_length;
2576 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2577 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2578 	if (template_mp == NULL)
2579 		return (NULL);
2580 
2581 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2582 	dlur->dl_priority.dl_min = 0;
2583 	dlur->dl_priority.dl_max = 0;
2584 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2585 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2586 
2587 	/* Copy in the SAP value. */
2588 	NCE_LL_SAP_COPY(ill, template_mp);
2589 
2590 	return (template_mp);
2591 }
2592 
2593 /*
2594  * NDP retransmit timer.
2595  * This timer goes off when:
2596  * a. It is time to retransmit NS for resolver.
2597  * b. It is time to send reachability probes.
2598  */
2599 void
2600 ndp_timer(void *arg)
2601 {
2602 	nce_t		*nce = arg;
2603 	ill_t		*ill = nce->nce_ill;
2604 	char		addrbuf[INET6_ADDRSTRLEN];
2605 	boolean_t	dropped = B_FALSE;
2606 	ip_stack_t	*ipst = ill->ill_ipst;
2607 
2608 	/*
2609 	 * The timer has to be cancelled by ndp_delete before doing the final
2610 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2611 	 * until it clears the timeout_id. Before clearing the timeout_id
2612 	 * bump up the refcnt so that we can continue to use the nce
2613 	 */
2614 	ASSERT(nce != NULL);
2615 
2616 	mutex_enter(&nce->nce_lock);
2617 	NCE_REFHOLD_LOCKED(nce);
2618 	nce->nce_timeout_id = 0;
2619 
2620 	/*
2621 	 * Check the reachability state first.
2622 	 */
2623 	switch (nce->nce_state) {
2624 	case ND_DELAY:
2625 		nce->nce_state = ND_PROBE;
2626 		mutex_exit(&nce->nce_lock);
2627 		(void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros,
2628 		    NDP_UNICAST);
2629 		if (ip_debug > 3) {
2630 			/* ip2dbg */
2631 			pr_addr_dbg("ndp_timer: state for %s changed "
2632 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2633 		}
2634 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2635 		NCE_REFRELE(nce);
2636 		return;
2637 	case ND_PROBE:
2638 		/* must be retransmit timer */
2639 		nce->nce_pcnt--;
2640 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2641 		    nce->nce_pcnt >= -1);
2642 		if (nce->nce_pcnt > 0) {
2643 			/*
2644 			 * As per RFC2461, the nce gets deleted after
2645 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2646 			 * Note that the first unicast solicitation is sent
2647 			 * during the DELAY state.
2648 			 */
2649 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2650 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2651 			    addrbuf, sizeof (addrbuf))));
2652 			mutex_exit(&nce->nce_lock);
2653 			dropped = nce_xmit_solicit(nce, B_FALSE,
2654 			    &ipv6_all_zeros,
2655 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2656 			    NDP_UNICAST);
2657 			if (dropped) {
2658 				mutex_enter(&nce->nce_lock);
2659 				nce->nce_pcnt++;
2660 				mutex_exit(&nce->nce_lock);
2661 			}
2662 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2663 		} else if (nce->nce_pcnt < 0) {
2664 			/* No hope, delete the nce */
2665 			nce->nce_state = ND_UNREACHABLE;
2666 			mutex_exit(&nce->nce_lock);
2667 			if (ip_debug > 2) {
2668 				/* ip1dbg */
2669 				pr_addr_dbg("ndp_timer: Delete IRE for"
2670 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2671 			}
2672 			ndp_delete(nce);
2673 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2674 			/* Wait RetransTimer, before deleting the entry */
2675 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2676 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2677 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2678 			mutex_exit(&nce->nce_lock);
2679 			/* Wait one interval before killing */
2680 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2681 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2682 			ipif_t *ipif;
2683 
2684 			/*
2685 			 * We're done probing, and we can now declare this
2686 			 * address to be usable.  Let IP know that it's ok to
2687 			 * use.
2688 			 */
2689 			nce->nce_state = ND_REACHABLE;
2690 			mutex_exit(&nce->nce_lock);
2691 			ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr,
2692 			    nce->nce_ill);
2693 			if (ipif != NULL) {
2694 				if (ipif->ipif_was_dup) {
2695 					char ibuf[LIFNAMSIZ + 10];
2696 					char sbuf[INET6_ADDRSTRLEN];
2697 
2698 					ipif->ipif_was_dup = B_FALSE;
2699 					(void) inet_ntop(AF_INET6,
2700 					    &ipif->ipif_v6lcl_addr,
2701 					    sbuf, sizeof (sbuf));
2702 					ipif_get_name(ipif, ibuf,
2703 					    sizeof (ibuf));
2704 					cmn_err(CE_NOTE, "recovered address "
2705 					    "%s on %s", sbuf, ibuf);
2706 				}
2707 				if ((ipif->ipif_flags & IPIF_UP) &&
2708 				    !ipif->ipif_addr_ready)
2709 					ipif_up_notify(ipif);
2710 				ipif->ipif_addr_ready = 1;
2711 				ipif_refrele(ipif);
2712 			}
2713 			/* Begin defending our new address */
2714 			nce->nce_unsolicit_count = 0;
2715 			dropped = nce_xmit_advert(nce, B_FALSE,
2716 			    &ipv6_all_hosts_mcast, 0);
2717 			if (dropped) {
2718 				nce->nce_unsolicit_count = 1;
2719 				NDP_RESTART_TIMER(nce,
2720 				    ipst->ips_ip_ndp_unsolicit_interval);
2721 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2722 				NDP_RESTART_TIMER(nce,
2723 				    ipst->ips_ip_ndp_defense_interval);
2724 			}
2725 		} else {
2726 			/*
2727 			 * This is an address we're probing to be our own, but
2728 			 * the ill is down.  Wait until it comes back before
2729 			 * doing anything, but switch to reachable state so
2730 			 * that the restart will work.
2731 			 */
2732 			nce->nce_state = ND_REACHABLE;
2733 			mutex_exit(&nce->nce_lock);
2734 		}
2735 		NCE_REFRELE(nce);
2736 		return;
2737 	case ND_INCOMPLETE: {
2738 		ip6_t	*ip6h;
2739 		ip6i_t	*ip6i;
2740 		mblk_t	*mp, *datamp, *nextmp, **prevmpp;
2741 
2742 		/*
2743 		 * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp
2744 		 * for any IPMP probe packets, and toss 'em.  IPMP probe
2745 		 * packets will always be at the head of nce_qd_mp and always
2746 		 * have an ip6i_t header, so we can stop at the first queued
2747 		 * ND packet without an ip6i_t.
2748 		 */
2749 		prevmpp = &nce->nce_qd_mp;
2750 		for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) {
2751 			nextmp = mp->b_next;
2752 			datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp;
2753 			ip6h = (ip6_t *)datamp->b_rptr;
2754 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2755 				break;
2756 
2757 			ip6i = (ip6i_t *)ip6h;
2758 			if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) {
2759 				inet_freemsg(mp);
2760 				*prevmpp = nextmp;
2761 			} else {
2762 				prevmpp = &mp->b_next;
2763 			}
2764 		}
2765 		ip_ndp_resolve(nce);
2766 		mutex_exit(&nce->nce_lock);
2767 		NCE_REFRELE(nce);
2768 		break;
2769 	}
2770 	case ND_REACHABLE:
2771 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2772 		    nce->nce_unsolicit_count != 0) ||
2773 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2774 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2775 			if (nce->nce_unsolicit_count > 0)
2776 				nce->nce_unsolicit_count--;
2777 			mutex_exit(&nce->nce_lock);
2778 			dropped = nce_xmit_advert(nce, B_FALSE,
2779 			    &ipv6_all_hosts_mcast, 0);
2780 			if (dropped) {
2781 				mutex_enter(&nce->nce_lock);
2782 				nce->nce_unsolicit_count++;
2783 				mutex_exit(&nce->nce_lock);
2784 			}
2785 			if (nce->nce_unsolicit_count != 0) {
2786 				NDP_RESTART_TIMER(nce,
2787 				    ipst->ips_ip_ndp_unsolicit_interval);
2788 			} else {
2789 				NDP_RESTART_TIMER(nce,
2790 				    ipst->ips_ip_ndp_defense_interval);
2791 			}
2792 		} else {
2793 			mutex_exit(&nce->nce_lock);
2794 		}
2795 		NCE_REFRELE(nce);
2796 		break;
2797 	default:
2798 		mutex_exit(&nce->nce_lock);
2799 		NCE_REFRELE(nce);
2800 		break;
2801 	}
2802 }
2803 
2804 /*
2805  * Set a link layer address from the ll_addr passed in.
2806  * Copy SAP from ill.
2807  */
2808 static void
2809 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2810 {
2811 	ill_t	*ill = nce->nce_ill;
2812 	uchar_t	*woffset;
2813 
2814 	ASSERT(ll_addr != NULL);
2815 	/* Always called before fast_path_probe */
2816 	ASSERT(nce->nce_fp_mp == NULL);
2817 	if (ill->ill_sap_length != 0) {
2818 		/*
2819 		 * Copy the SAP type specified in the
2820 		 * request into the xmit template.
2821 		 */
2822 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2823 	}
2824 	if (ill->ill_phys_addr_length > 0) {
2825 		/*
2826 		 * The bcopy() below used to be called for the physical address
2827 		 * length rather than the link layer address length. For
2828 		 * ethernet and many other media, the phys_addr and lla are
2829 		 * identical.
2830 		 * However, with xresolv interfaces being introduced, the
2831 		 * phys_addr and lla are no longer the same, and the physical
2832 		 * address may not have any useful meaning, so we use the lla
2833 		 * for IPv6 address resolution and destination addressing.
2834 		 *
2835 		 * For PPP or other interfaces with a zero length
2836 		 * physical address, don't do anything here.
2837 		 * The bcopy() with a zero phys_addr length was previously
2838 		 * a no-op for interfaces with a zero-length physical address.
2839 		 * Using the lla for them would change the way they operate.
2840 		 * Doing nothing in such cases preserves expected behavior.
2841 		 */
2842 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2843 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2844 	}
2845 }
2846 
2847 static boolean_t
2848 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2849 {
2850 	ill_t	*ill = nce->nce_ill;
2851 	uchar_t	*ll_offset;
2852 
2853 	ASSERT(nce->nce_res_mp != NULL);
2854 	if (ll_addr == NULL)
2855 		return (B_FALSE);
2856 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2857 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2858 		return (B_TRUE);
2859 	return (B_FALSE);
2860 }
2861 
2862 /*
2863  * Updates the link layer address or the reachability state of
2864  * a cache entry.  Reset probe counter if needed.
2865  */
2866 static void
2867 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2868 {
2869 	ill_t	*ill = nce->nce_ill;
2870 	boolean_t need_stop_timer = B_FALSE;
2871 	boolean_t need_fastpath_update = B_FALSE;
2872 
2873 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2874 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2875 	/*
2876 	 * If this interface does not do NUD, there is no point
2877 	 * in allowing an update to the cache entry.  Although
2878 	 * we will respond to NS.
2879 	 * The only time we accept an update for a resolver when
2880 	 * NUD is turned off is when it has just been created.
2881 	 * Non-Resolvers will always be created as REACHABLE.
2882 	 */
2883 	if (new_state != ND_UNCHANGED) {
2884 		if ((nce->nce_flags & NCE_F_NONUD) &&
2885 		    (nce->nce_state != ND_INCOMPLETE))
2886 			return;
2887 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2888 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2889 		need_stop_timer = B_TRUE;
2890 		if (new_state == ND_REACHABLE)
2891 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2892 		else {
2893 			/* We force NUD in this case */
2894 			nce->nce_last = 0;
2895 		}
2896 		nce->nce_state = new_state;
2897 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2898 	}
2899 	/*
2900 	 * In case of fast path we need to free the the fastpath
2901 	 * M_DATA and do another probe.  Otherwise we can just
2902 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2903 	 * whatever packets that happens to be transmitting at the time.
2904 	 */
2905 	if (new_ll_addr != NULL) {
2906 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2907 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2908 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2909 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2910 		if (nce->nce_fp_mp != NULL) {
2911 			freemsg(nce->nce_fp_mp);
2912 			nce->nce_fp_mp = NULL;
2913 		}
2914 		need_fastpath_update = B_TRUE;
2915 	}
2916 	mutex_exit(&nce->nce_lock);
2917 	if (need_stop_timer) {
2918 		(void) untimeout(nce->nce_timeout_id);
2919 		nce->nce_timeout_id = 0;
2920 	}
2921 	if (need_fastpath_update)
2922 		nce_fastpath(nce);
2923 	mutex_enter(&nce->nce_lock);
2924 }
2925 
2926 void
2927 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2928 {
2929 	uint_t	count = 0;
2930 	mblk_t  **mpp, *tmp;
2931 
2932 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2933 
2934 	for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2935 		if (++count > nce->nce_ill->ill_max_buf) {
2936 			tmp = nce->nce_qd_mp->b_next;
2937 			nce->nce_qd_mp->b_next = NULL;
2938 			nce->nce_qd_mp->b_prev = NULL;
2939 			freemsg(nce->nce_qd_mp);
2940 			nce->nce_qd_mp = tmp;
2941 		}
2942 	}
2943 
2944 	if (head_insert) {
2945 		mp->b_next = nce->nce_qd_mp;
2946 		nce->nce_qd_mp = mp;
2947 	} else {
2948 		*mpp = mp;
2949 	}
2950 }
2951 
2952 static void
2953 nce_queue_mp(nce_t *nce, mblk_t *mp)
2954 {
2955 	boolean_t head_insert = B_FALSE;
2956 	ip6_t	*ip6h;
2957 	ip6i_t  *ip6i;
2958 	mblk_t	*data_mp;
2959 
2960 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2961 
2962 	if (mp->b_datap->db_type == M_CTL)
2963 		data_mp = mp->b_cont;
2964 	else
2965 		data_mp = mp;
2966 	ip6h = (ip6_t *)data_mp->b_rptr;
2967 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2968 		/*
2969 		 * This message should have been pulled up already in
2970 		 * ip_wput_v6. We can't do pullups here because the message
2971 		 * could be from the nce_qd_mp which could have b_next/b_prev
2972 		 * non-NULL.
2973 		 */
2974 		ip6i = (ip6i_t *)ip6h;
2975 		ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
2976 
2977 		/*
2978 		 * If this packet is marked IP6I_IPMP_PROBE, then we need to:
2979 		 *
2980 		 *   1. Insert it at the head of the nce_qd_mp list.  Consider
2981 		 *	the normal (non-probe) load-speading case where the
2982 		 *	source address of the ND packet is not tied to nce_ill.
2983 		 *	If the ill bound to the source address cannot receive,
2984 		 *	the response to the ND packet will not be received.
2985 		 *	However, if ND packets for nce_ill's probes are queued
2986 		 *	behind that ND packet, those probes will also fail to
2987 		 *	be sent, and thus in.mpathd will erroneously conclude
2988 		 *	that nce_ill has also failed.
2989 		 *
2990 		 *   2. Drop the probe packet in ndp_timer() if the ND did
2991 		 *	not succeed on the first attempt.  This ensures that
2992 		 *	ND problems do not manifest as probe RTT spikes.
2993 		 */
2994 		if (ip6i->ip6i_flags & IP6I_IPMP_PROBE)
2995 			head_insert = B_TRUE;
2996 	}
2997 	nce_queue_mp_common(nce, mp, head_insert);
2998 }
2999 
3000 /*
3001  * Called when address resolution failed due to a timeout.
3002  * Send an ICMP unreachable in response to all queued packets.
3003  */
3004 void
3005 nce_resolv_failed(nce_t *nce)
3006 {
3007 	mblk_t	*mp, *nxt_mp, *first_mp;
3008 	char	buf[INET6_ADDRSTRLEN];
3009 	ip6_t *ip6h;
3010 	zoneid_t zoneid = GLOBAL_ZONEID;
3011 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
3012 
3013 	ip1dbg(("nce_resolv_failed: dst %s\n",
3014 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3015 	mutex_enter(&nce->nce_lock);
3016 	mp = nce->nce_qd_mp;
3017 	nce->nce_qd_mp = NULL;
3018 	mutex_exit(&nce->nce_lock);
3019 	while (mp != NULL) {
3020 		nxt_mp = mp->b_next;
3021 		mp->b_next = NULL;
3022 		mp->b_prev = NULL;
3023 
3024 		first_mp = mp;
3025 		if (mp->b_datap->db_type == M_CTL) {
3026 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3027 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3028 			zoneid = io->ipsec_out_zoneid;
3029 			ASSERT(zoneid != ALL_ZONES);
3030 			mp = mp->b_cont;
3031 			mp->b_next = NULL;
3032 			mp->b_prev = NULL;
3033 		}
3034 
3035 		ip6h = (ip6_t *)mp->b_rptr;
3036 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3037 			ip6i_t *ip6i;
3038 			/*
3039 			 * This message should have been pulled up already
3040 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3041 			 * the header is pulled up.
3042 			 */
3043 			ip6i = (ip6i_t *)ip6h;
3044 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3045 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3046 			mp->b_rptr += sizeof (ip6i_t);
3047 		}
3048 		/*
3049 		 * Ignore failure since icmp_unreachable_v6 will silently
3050 		 * drop packets with an unspecified source address.
3051 		 */
3052 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
3053 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3054 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
3055 		mp = nxt_mp;
3056 	}
3057 	nce_cb_dispatch(nce);
3058 }
3059 
3060 /*
3061  * Called by SIOCSNDP* ioctl to add/change an nce entry
3062  * and the corresponding attributes.
3063  * Disallow states other than ND_REACHABLE or ND_STALE.
3064  */
3065 int
3066 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3067 {
3068 	sin6_t		*sin6;
3069 	in6_addr_t	*addr;
3070 	nce_t		*nce;
3071 	int		err;
3072 	uint16_t	new_flags = 0;
3073 	uint16_t	old_flags = 0;
3074 	int		inflags = lnr->lnr_flags;
3075 	ip_stack_t	*ipst = ill->ill_ipst;
3076 
3077 	ASSERT(ill->ill_isv6);
3078 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3079 	    (lnr->lnr_state_create != ND_STALE))
3080 		return (EINVAL);
3081 
3082 	if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN)
3083 		return (EINVAL);
3084 
3085 	sin6 = (sin6_t *)&lnr->lnr_addr;
3086 	addr = &sin6->sin6_addr;
3087 
3088 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3089 	/* We know it can not be mapping so just look in the hash table */
3090 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
3091 	/* See comment in ndp_query() regarding IS_IPMP(ill) usage */
3092 	nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce);
3093 	if (nce != NULL)
3094 		new_flags = nce->nce_flags;
3095 
3096 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3097 	case NDF_ISROUTER_ON:
3098 		new_flags |= NCE_F_ISROUTER;
3099 		break;
3100 	case NDF_ISROUTER_OFF:
3101 		new_flags &= ~NCE_F_ISROUTER;
3102 		break;
3103 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3104 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3105 		if (nce != NULL)
3106 			NCE_REFRELE(nce);
3107 		return (EINVAL);
3108 	}
3109 
3110 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3111 	case NDF_ANYCAST_ON:
3112 		new_flags |= NCE_F_ANYCAST;
3113 		break;
3114 	case NDF_ANYCAST_OFF:
3115 		new_flags &= ~NCE_F_ANYCAST;
3116 		break;
3117 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3118 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3119 		if (nce != NULL)
3120 			NCE_REFRELE(nce);
3121 		return (EINVAL);
3122 	}
3123 
3124 	if (nce == NULL) {
3125 		err = ndp_add_v6(ill,
3126 		    (uchar_t *)lnr->lnr_hdw_addr,
3127 		    addr,
3128 		    &ipv6_all_ones,
3129 		    &ipv6_all_zeros,
3130 		    0,
3131 		    new_flags,
3132 		    lnr->lnr_state_create,
3133 		    &nce);
3134 		if (err != 0) {
3135 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3136 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3137 			return (err);
3138 		}
3139 	}
3140 	old_flags = nce->nce_flags;
3141 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3142 		/*
3143 		 * Router turned to host, delete all ires.
3144 		 * XXX Just delete the entry, but we need to add too.
3145 		 */
3146 		nce->nce_flags &= ~NCE_F_ISROUTER;
3147 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3148 		ndp_delete(nce);
3149 		NCE_REFRELE(nce);
3150 		return (0);
3151 	}
3152 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3153 
3154 	mutex_enter(&nce->nce_lock);
3155 	nce->nce_flags = new_flags;
3156 	mutex_exit(&nce->nce_lock);
3157 	/*
3158 	 * Note that we ignore the state at this point, which
3159 	 * should be either STALE or REACHABLE.  Instead we let
3160 	 * the link layer address passed in to determine the state
3161 	 * much like incoming packets.
3162 	 */
3163 	nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3164 	NCE_REFRELE(nce);
3165 	return (0);
3166 }
3167 
3168 /*
3169  * If the device driver supports it, we make nce_fp_mp to have
3170  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3171  * The caller ensures there is hold on nce for this function.
3172  * Note that since ill_fastpath_probe() copies the mblk there is
3173  * no need for the hold beyond this function.
3174  */
3175 void
3176 nce_fastpath(nce_t *nce)
3177 {
3178 	ill_t	*ill = nce->nce_ill;
3179 	int res;
3180 
3181 	ASSERT(ill != NULL);
3182 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3183 
3184 	if (nce->nce_fp_mp != NULL) {
3185 		/* Already contains fastpath info */
3186 		return;
3187 	}
3188 	if (nce->nce_res_mp != NULL) {
3189 		nce_fastpath_list_add(nce);
3190 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3191 		/*
3192 		 * EAGAIN is an indication of a transient error
3193 		 * i.e. allocation failure etc. leave the nce in the list it
3194 		 * will be updated when another probe happens for another ire
3195 		 * if not it will be taken out of the list when the ire is
3196 		 * deleted.
3197 		 */
3198 
3199 		if (res != 0 && res != EAGAIN)
3200 			nce_fastpath_list_delete(nce);
3201 	}
3202 }
3203 
3204 /*
3205  * Drain the list of nce's waiting for fastpath response.
3206  */
3207 void
3208 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3209     void *arg)
3210 {
3211 
3212 	nce_t *next_nce;
3213 	nce_t *current_nce;
3214 	nce_t *first_nce;
3215 	nce_t *prev_nce = NULL;
3216 
3217 	mutex_enter(&ill->ill_lock);
3218 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3219 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3220 		next_nce = current_nce->nce_fastpath;
3221 		/*
3222 		 * Take it off the list if we're flushing, or if the callback
3223 		 * routine tells us to do so.  Otherwise, leave the nce in the
3224 		 * fastpath list to handle any pending response from the lower
3225 		 * layer.  We can't drain the list when the callback routine
3226 		 * comparison failed, because the response is asynchronous in
3227 		 * nature, and may not arrive in the same order as the list
3228 		 * insertion.
3229 		 */
3230 		if (func == NULL || func(current_nce, arg)) {
3231 			current_nce->nce_fastpath = NULL;
3232 			if (current_nce == first_nce)
3233 				ill->ill_fastpath_list = first_nce = next_nce;
3234 			else
3235 				prev_nce->nce_fastpath = next_nce;
3236 		} else {
3237 			/* previous element that is still in the list */
3238 			prev_nce = current_nce;
3239 		}
3240 		current_nce = next_nce;
3241 	}
3242 	mutex_exit(&ill->ill_lock);
3243 }
3244 
3245 /*
3246  * Add nce to the nce fastpath list.
3247  */
3248 void
3249 nce_fastpath_list_add(nce_t *nce)
3250 {
3251 	ill_t *ill;
3252 
3253 	ill = nce->nce_ill;
3254 
3255 	mutex_enter(&ill->ill_lock);
3256 	mutex_enter(&nce->nce_lock);
3257 
3258 	/*
3259 	 * if nce has not been deleted and
3260 	 * is not already in the list add it.
3261 	 */
3262 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3263 	    (nce->nce_fastpath == NULL)) {
3264 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3265 		ill->ill_fastpath_list = nce;
3266 	}
3267 
3268 	mutex_exit(&nce->nce_lock);
3269 	mutex_exit(&ill->ill_lock);
3270 }
3271 
3272 /*
3273  * remove nce from the nce fastpath list.
3274  */
3275 void
3276 nce_fastpath_list_delete(nce_t *nce)
3277 {
3278 	nce_t *nce_ptr;
3279 
3280 	ill_t *ill;
3281 
3282 	ill = nce->nce_ill;
3283 	ASSERT(ill != NULL);
3284 
3285 	mutex_enter(&ill->ill_lock);
3286 	if (nce->nce_fastpath == NULL)
3287 		goto done;
3288 
3289 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3290 
3291 	if (ill->ill_fastpath_list == nce) {
3292 		ill->ill_fastpath_list = nce->nce_fastpath;
3293 	} else {
3294 		nce_ptr = ill->ill_fastpath_list;
3295 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3296 			if (nce_ptr->nce_fastpath == nce) {
3297 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3298 				break;
3299 			}
3300 			nce_ptr = nce_ptr->nce_fastpath;
3301 		}
3302 	}
3303 
3304 	nce->nce_fastpath = NULL;
3305 done:
3306 	mutex_exit(&ill->ill_lock);
3307 }
3308 
3309 /*
3310  * Update all NCE's that are not in fastpath mode and
3311  * have an nce_fp_mp that matches mp. mp->b_cont contains
3312  * the fastpath header.
3313  *
3314  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3315  */
3316 boolean_t
3317 ndp_fastpath_update(nce_t *nce, void *arg)
3318 {
3319 	mblk_t 	*mp, *fp_mp;
3320 	uchar_t	*mp_rptr, *ud_mp_rptr;
3321 	mblk_t	*ud_mp = nce->nce_res_mp;
3322 	ptrdiff_t	cmplen;
3323 
3324 	if (nce->nce_flags & NCE_F_MAPPING)
3325 		return (B_TRUE);
3326 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3327 		return (B_TRUE);
3328 
3329 	ip2dbg(("ndp_fastpath_update: trying\n"));
3330 	mp = (mblk_t *)arg;
3331 	mp_rptr = mp->b_rptr;
3332 	cmplen = mp->b_wptr - mp_rptr;
3333 	ASSERT(cmplen >= 0);
3334 	ud_mp_rptr = ud_mp->b_rptr;
3335 	/*
3336 	 * The nce is locked here to prevent any other threads
3337 	 * from accessing and changing nce_res_mp when the IPv6 address
3338 	 * becomes resolved to an lla while we're in the middle
3339 	 * of looking at and comparing the hardware address (lla).
3340 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3341 	 * from examining nce_res_mp atthe same time.
3342 	 */
3343 	mutex_enter(&nce->nce_lock);
3344 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3345 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3346 		mutex_exit(&nce->nce_lock);
3347 		/*
3348 		 * Don't take the ire off the fastpath list yet,
3349 		 * since the response may come later.
3350 		 */
3351 		return (B_FALSE);
3352 	}
3353 	/* Matched - install mp as the fastpath mp */
3354 	ip1dbg(("ndp_fastpath_update: match\n"));
3355 	fp_mp = dupb(mp->b_cont);
3356 	if (fp_mp != NULL) {
3357 		nce->nce_fp_mp = fp_mp;
3358 	}
3359 	mutex_exit(&nce->nce_lock);
3360 	return (B_TRUE);
3361 }
3362 
3363 /*
3364  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3365  * driver.  Note that it assumes IP is exclusive...
3366  */
3367 /* ARGSUSED */
3368 void
3369 ndp_fastpath_flush(nce_t *nce, char *arg)
3370 {
3371 	if (nce->nce_flags & NCE_F_MAPPING)
3372 		return;
3373 	/* No fastpath info? */
3374 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3375 		return;
3376 
3377 	if (nce->nce_ipversion == IPV4_VERSION &&
3378 	    nce->nce_flags & NCE_F_BCAST) {
3379 		/*
3380 		 * IPv4 BROADCAST entries:
3381 		 * We can't delete the nce since it is difficult to
3382 		 * recreate these without going through the
3383 		 * ipif down/up dance.
3384 		 *
3385 		 * All access to nce->nce_fp_mp in the case of these
3386 		 * is protected by nce_lock.
3387 		 */
3388 		mutex_enter(&nce->nce_lock);
3389 		if (nce->nce_fp_mp != NULL) {
3390 			freeb(nce->nce_fp_mp);
3391 			nce->nce_fp_mp = NULL;
3392 			mutex_exit(&nce->nce_lock);
3393 			nce_fastpath(nce);
3394 		} else {
3395 			mutex_exit(&nce->nce_lock);
3396 		}
3397 	} else {
3398 		/* Just delete the NCE... */
3399 		ndp_delete(nce);
3400 	}
3401 }
3402 
3403 /*
3404  * Return a pointer to a given option in the packet.
3405  * Assumes that option part of the packet have already been validated.
3406  */
3407 nd_opt_hdr_t *
3408 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3409 {
3410 	while (optlen > 0) {
3411 		if (opt->nd_opt_type == opt_type)
3412 			return (opt);
3413 		optlen -= 8 * opt->nd_opt_len;
3414 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3415 	}
3416 	return (NULL);
3417 }
3418 
3419 /*
3420  * Verify all option lengths present are > 0, also check to see
3421  * if the option lengths and packet length are consistent.
3422  */
3423 boolean_t
3424 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3425 {
3426 	ASSERT(opt != NULL);
3427 	while (optlen > 0) {
3428 		if (opt->nd_opt_len == 0)
3429 			return (B_FALSE);
3430 		optlen -= 8 * opt->nd_opt_len;
3431 		if (optlen < 0)
3432 			return (B_FALSE);
3433 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3434 	}
3435 	return (B_TRUE);
3436 }
3437 
3438 /*
3439  * ndp_walk function.
3440  * Free a fraction of the NCE cache entries.
3441  * A fraction of zero means to not free any in that category.
3442  */
3443 void
3444 ndp_cache_reclaim(nce_t *nce, char *arg)
3445 {
3446 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3447 	uint_t	rand;
3448 
3449 	if (nce->nce_flags & NCE_F_PERMANENT)
3450 		return;
3451 
3452 	rand = (uint_t)lbolt +
3453 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3454 	if (ncr->ncr_host != 0 &&
3455 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3456 		ndp_delete(nce);
3457 		return;
3458 	}
3459 }
3460 
3461 /*
3462  * ndp_walk function.
3463  * Count the number of NCEs that can be deleted.
3464  * These would be hosts but not routers.
3465  */
3466 void
3467 ndp_cache_count(nce_t *nce, char *arg)
3468 {
3469 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3470 
3471 	if (nce->nce_flags & NCE_F_PERMANENT)
3472 		return;
3473 
3474 	ncc->ncc_total++;
3475 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3476 		ncc->ncc_host++;
3477 }
3478 
3479 #ifdef DEBUG
3480 void
3481 nce_trace_ref(nce_t *nce)
3482 {
3483 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3484 
3485 	if (nce->nce_trace_disable)
3486 		return;
3487 
3488 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3489 		nce->nce_trace_disable = B_TRUE;
3490 		nce_trace_cleanup(nce);
3491 	}
3492 }
3493 
3494 void
3495 nce_untrace_ref(nce_t *nce)
3496 {
3497 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3498 
3499 	if (!nce->nce_trace_disable)
3500 		th_trace_unref(nce);
3501 }
3502 
3503 static void
3504 nce_trace_cleanup(const nce_t *nce)
3505 {
3506 	th_trace_cleanup(nce, nce->nce_trace_disable);
3507 }
3508 #endif
3509 
3510 /*
3511  * Called when address resolution fails due to a timeout.
3512  * Send an ICMP unreachable in response to all queued packets.
3513  */
3514 void
3515 arp_resolv_failed(nce_t *nce)
3516 {
3517 	mblk_t	*mp, *nxt_mp, *first_mp;
3518 	char	buf[INET6_ADDRSTRLEN];
3519 	zoneid_t zoneid = GLOBAL_ZONEID;
3520 	struct in_addr ipv4addr;
3521 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3522 
3523 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3524 	ip3dbg(("arp_resolv_failed: dst %s\n",
3525 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3526 	mutex_enter(&nce->nce_lock);
3527 	mp = nce->nce_qd_mp;
3528 	nce->nce_qd_mp = NULL;
3529 	mutex_exit(&nce->nce_lock);
3530 
3531 	while (mp != NULL) {
3532 		nxt_mp = mp->b_next;
3533 		mp->b_next = NULL;
3534 		mp->b_prev = NULL;
3535 
3536 		first_mp = mp;
3537 		/*
3538 		 * Send icmp unreachable messages
3539 		 * to the hosts.
3540 		 */
3541 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3542 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3543 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3544 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3545 		mp = nxt_mp;
3546 	}
3547 }
3548 
3549 int
3550 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3551     nce_t **newnce, nce_t *src_nce)
3552 {
3553 	int	err;
3554 	nce_t	*nce;
3555 	in6_addr_t addr6;
3556 	ip_stack_t *ipst = ill->ill_ipst;
3557 
3558 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3559 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3560 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3561 	/*
3562 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
3563 	 * looking up have fastpath headers that are inherently per-ill.
3564 	 */
3565 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
3566 	if (nce == NULL) {
3567 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3568 	} else {
3569 		*newnce = nce;
3570 		err = EEXIST;
3571 	}
3572 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3573 	return (err);
3574 }
3575 
3576 /*
3577  * NDP Cache Entry creation routine for IPv4.
3578  * Mapped entries are handled in arp.
3579  * This routine must always be called with ndp4->ndp_g_lock held.
3580  * Prior to return, nce_refcnt is incremented.
3581  */
3582 static int
3583 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3584     nce_t **newnce, nce_t *src_nce)
3585 {
3586 	static	nce_t		nce_nil;
3587 	nce_t		*nce;
3588 	mblk_t		*mp;
3589 	mblk_t		*template = NULL;
3590 	nce_t		**ncep;
3591 	ip_stack_t	*ipst = ill->ill_ipst;
3592 	uint16_t	state = ND_INITIAL;
3593 	int		err;
3594 
3595 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3596 	ASSERT(!ill->ill_isv6);
3597 	ASSERT((flags & NCE_F_MAPPING) == 0);
3598 
3599 	if (ill->ill_resolver_mp == NULL)
3600 		return (EINVAL);
3601 	/*
3602 	 * Allocate the mblk to hold the nce.
3603 	 */
3604 	mp = allocb(sizeof (nce_t), BPRI_MED);
3605 	if (mp == NULL)
3606 		return (ENOMEM);
3607 
3608 	nce = (nce_t *)mp->b_rptr;
3609 	mp->b_wptr = (uchar_t *)&nce[1];
3610 	*nce = nce_nil;
3611 	nce->nce_ill = ill;
3612 	nce->nce_ipversion = IPV4_VERSION;
3613 	nce->nce_flags = flags;
3614 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3615 	nce->nce_rcnt = ill->ill_xmit_count;
3616 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3617 	nce->nce_mask = ipv6_all_ones;
3618 	nce->nce_extract_mask = ipv6_all_zeros;
3619 	nce->nce_ll_extract_start = 0;
3620 	nce->nce_qd_mp = NULL;
3621 	nce->nce_mp = mp;
3622 	/* This one is for nce getting created */
3623 	nce->nce_refcnt = 1;
3624 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3625 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3626 
3627 	nce->nce_trace_disable = B_FALSE;
3628 
3629 	if (src_nce != NULL) {
3630 		/*
3631 		 * src_nce has been provided by the caller. The only
3632 		 * caller who provides a non-null, non-broadcast
3633 		 * src_nce is from ip_newroute() which must pass in
3634 		 * a ND_REACHABLE src_nce (this condition is verified
3635 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3636 		 */
3637 		mutex_enter(&src_nce->nce_lock);
3638 		state = src_nce->nce_state;
3639 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3640 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3641 			/*
3642 			 * src_nce has been deleted, or
3643 			 * ip_arp_news is in the middle of
3644 			 * flushing entries in the the nce.
3645 			 * Fail the add, since we don't know
3646 			 * if it is safe to copy the contents of
3647 			 * src_nce
3648 			 */
3649 			DTRACE_PROBE2(nce__bad__src__nce,
3650 			    nce_t *, src_nce, ill_t *, ill);
3651 			mutex_exit(&src_nce->nce_lock);
3652 			err = EINVAL;
3653 			goto err_ret;
3654 		}
3655 		template = copyb(src_nce->nce_res_mp);
3656 		mutex_exit(&src_nce->nce_lock);
3657 		if (template == NULL) {
3658 			err = ENOMEM;
3659 			goto err_ret;
3660 		}
3661 	} else if (flags & NCE_F_BCAST) {
3662 		/*
3663 		 * broadcast nce.
3664 		 */
3665 		template = copyb(ill->ill_bcast_mp);
3666 		if (template == NULL) {
3667 			err = ENOMEM;
3668 			goto err_ret;
3669 		}
3670 		state = ND_REACHABLE;
3671 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3672 		/*
3673 		 * NORESOLVER entries are always created in the REACHABLE
3674 		 * state. We create a nce_res_mp with the IP nexthop address
3675 		 * in the destination address in the DLPI hdr if the
3676 		 * physical length is exactly 4 bytes.
3677 		 *
3678 		 * XXX not clear which drivers set ill_phys_addr_length to
3679 		 * IP_ADDR_LEN.
3680 		 */
3681 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3682 			template = ill_dlur_gen((uchar_t *)addr,
3683 			    ill->ill_phys_addr_length,
3684 			    ill->ill_sap, ill->ill_sap_length);
3685 		} else {
3686 			template = copyb(ill->ill_resolver_mp);
3687 		}
3688 		if (template == NULL) {
3689 			err = ENOMEM;
3690 			goto err_ret;
3691 		}
3692 		state = ND_REACHABLE;
3693 	}
3694 	nce->nce_fp_mp = NULL;
3695 	nce->nce_res_mp = template;
3696 	nce->nce_state = state;
3697 	if (state == ND_REACHABLE) {
3698 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3699 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3700 	} else {
3701 		nce->nce_last = 0;
3702 		if (state == ND_INITIAL)
3703 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3704 	}
3705 
3706 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3707 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3708 	/*
3709 	 * Atomically ensure that the ill is not CONDEMNED, before
3710 	 * adding the NCE.
3711 	 */
3712 	mutex_enter(&ill->ill_lock);
3713 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3714 		mutex_exit(&ill->ill_lock);
3715 		err = EINVAL;
3716 		goto err_ret;
3717 	}
3718 	if ((nce->nce_next = *ncep) != NULL)
3719 		nce->nce_next->nce_ptpn = &nce->nce_next;
3720 	*ncep = nce;
3721 	nce->nce_ptpn = ncep;
3722 	*newnce = nce;
3723 	/* This one is for nce being used by an active thread */
3724 	NCE_REFHOLD(*newnce);
3725 
3726 	/* Bump up the number of nce's referencing this ill */
3727 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
3728 	    (char *), "nce", (void *), nce);
3729 	ill->ill_nce_cnt++;
3730 	mutex_exit(&ill->ill_lock);
3731 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3732 	return (0);
3733 err_ret:
3734 	freeb(mp);
3735 	freemsg(template);
3736 	return (err);
3737 }
3738 
3739 /*
3740  * ndp_walk routine to delete all entries that have a given destination or
3741  * gateway address and cached link layer (MAC) address.  This is used when ARP
3742  * informs us that a network-to-link-layer mapping may have changed.
3743  */
3744 void
3745 nce_delete_hw_changed(nce_t *nce, void *arg)
3746 {
3747 	nce_hw_map_t *hwm = arg;
3748 	mblk_t *mp;
3749 	dl_unitdata_req_t *dlu;
3750 	uchar_t *macaddr;
3751 	ill_t *ill;
3752 	int saplen;
3753 	ipaddr_t nce_addr;
3754 
3755 	if (nce->nce_state != ND_REACHABLE)
3756 		return;
3757 
3758 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3759 	if (nce_addr != hwm->hwm_addr)
3760 		return;
3761 
3762 	mutex_enter(&nce->nce_lock);
3763 	if ((mp = nce->nce_res_mp) == NULL) {
3764 		mutex_exit(&nce->nce_lock);
3765 		return;
3766 	}
3767 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3768 	macaddr = (uchar_t *)(dlu + 1);
3769 	ill = nce->nce_ill;
3770 	if ((saplen = ill->ill_sap_length) > 0)
3771 		macaddr += saplen;
3772 	else
3773 		saplen = -saplen;
3774 
3775 	/*
3776 	 * If the hardware address is unchanged, then leave this one alone.
3777 	 * Note that saplen == abs(saplen) now.
3778 	 */
3779 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3780 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3781 		mutex_exit(&nce->nce_lock);
3782 		return;
3783 	}
3784 	mutex_exit(&nce->nce_lock);
3785 
3786 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3787 	ndp_delete(nce);
3788 }
3789 
3790 /*
3791  * This function verifies whether a given IPv4 address is potentially known to
3792  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3793  * so that it can continue to look for hardware changes on that address.
3794  */
3795 boolean_t
3796 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3797 {
3798 	nce_t		*nce;
3799 	struct in_addr	nceaddr;
3800 	ip_stack_t	*ipst = ns->netstack_ip;
3801 
3802 	if (addr == INADDR_ANY)
3803 		return (B_FALSE);
3804 
3805 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3806 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3807 	for (; nce != NULL; nce = nce->nce_next) {
3808 		/* Note that only v4 mapped entries are in the table. */
3809 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3810 		if (addr == nceaddr.s_addr &&
3811 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3812 			/* Single flag check; no lock needed */
3813 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3814 				break;
3815 		}
3816 	}
3817 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3818 	return (nce != NULL);
3819 }
3820 
3821 /*
3822  * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly
3823  * with IPMP.  Specifically, since neighbor discovery is always done on
3824  * underlying interfaces (even for addresses owned by an IPMP interface), we
3825  * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface
3826  * associated with `ill' (if it exists).
3827  */
3828 static ipif_t *
3829 ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill)
3830 {
3831 	ipif_t *ipif;
3832 	ip_stack_t *ipst = ill->ill_ipst;
3833 
3834 	ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3835 	if (ipif == NULL && IS_UNDER_IPMP(ill)) {
3836 		if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
3837 			ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3838 			ill_refrele(ill);
3839 		}
3840 	}
3841 	return (ipif);
3842 }
3843