xref: /titanic_41/usr/src/uts/common/inet/ip/ip_ndp.c (revision c6d6228cbba828ab5b2b6db6c280a44b2d841653)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ipsec_info.h>
66 #include <inet/sctp_ip.h>
67 #include <inet/ip2mac_impl.h>
68 
/*
 * Naming convention: functions with the nce_ prefix are static, while
 * functions with the ndp_ prefix are used by the rest of IP.
 *
 * Lock ordering:
 *
 *	ndp_g_lock -> ill_lock -> nce_lock
 *
 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 * nce_next.  The nce_lock protects the contents of the NCE (particularly
 * nce_refcnt).
 */
81 
/*
 * Forward declarations of the file-local (static) routines, so that they
 * can be referenced before their definitions appear.
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *,
    nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *, const in6_addr_t *);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, in6_addr_t src);
static	boolean_t	nce_xmit(ill_t *ill, uint8_t type,
    boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static boolean_t	nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla,
    const in6_addr_t *target, uint_t flags);
static boolean_t	nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla,
    const in6_addr_t *src, uint_t flags);
static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
    nce_t **, nce_t *);
static ipif_t	*ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill);

/* The trace cleanup hook is only declared (and only called) under DEBUG. */
#ifdef DEBUG
static void	nce_trace_cleanup(const nce_t *);
#endif
112 
/*
 * Yield a pointer to the IPv4 NCE hash bucket, i.e. the slot of
 * (ipst)->ips_ndp4->nce_hash_tbl that `addr' hashes to.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * IPv6 counterpart of NCE_HASH_PTR_V4: yields a pointer to the slot of
 * (ipst)->ips_ndp6->nce_hash_tbl that `addr' hashes to.
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))

/*
 * Non-tunable probe interval, based on link capabilities: the short
 * interval is used when ill_note_link is set, the long one otherwise.
 * The value is handed to NDP_RESTART_TIMER(); presumably milliseconds --
 * TODO confirm against the NDP_RESTART_TIMER definition.
 */
#define	ILL_PROBE_INTERVAL_NOTIFY	150
#define	ILL_PROBE_INTERVAL_NO_NOTIFY	1500
#define	ILL_PROBE_INTERVAL(ill)						\
	((ill)->ill_note_link ? ILL_PROBE_INTERVAL_NOTIFY :		\
	    ILL_PROBE_INTERVAL_NO_NOTIFY)
122 
123 /*
124  * NDP Cache Entry creation routine.
125  * Mapped entries will never do NUD .
126  * This routine must always be called with ndp6->ndp_g_lock held.
127  * Prior to return, nce_refcnt is incremented.
128  */
129 int
130 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
131     const in6_addr_t *mask, const in6_addr_t *extract_mask,
132     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
133     nce_t **newnce)
134 {
135 	static	nce_t		nce_nil;
136 	nce_t		*nce;
137 	mblk_t		*mp;
138 	mblk_t		*template;
139 	nce_t		**ncep;
140 	int		err;
141 	boolean_t	dropped = B_FALSE;
142 	ip_stack_t	*ipst = ill->ill_ipst;
143 
144 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
145 	ASSERT(ill != NULL && ill->ill_isv6);
146 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
147 		ip0dbg(("ndp_add_v6: no addr\n"));
148 		return (EINVAL);
149 	}
150 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
151 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
152 		return (EINVAL);
153 	}
154 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
155 	    (flags & NCE_F_MAPPING)) {
156 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
157 		return (EINVAL);
158 	}
159 	/*
160 	 * Allocate the mblk to hold the nce.
161 	 *
162 	 * XXX This can come out of a separate cache - nce_cache.
163 	 * We don't need the mp anymore as there are no more
164 	 * "qwriter"s
165 	 */
166 	mp = allocb(sizeof (nce_t), BPRI_MED);
167 	if (mp == NULL)
168 		return (ENOMEM);
169 
170 	nce = (nce_t *)mp->b_rptr;
171 	mp->b_wptr = (uchar_t *)&nce[1];
172 	*nce = nce_nil;
173 
174 	/*
175 	 * This one holds link layer address
176 	 */
177 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
178 		template = nce_udreq_alloc(ill);
179 	} else {
180 		if (ill->ill_resolver_mp == NULL) {
181 			freeb(mp);
182 			return (EINVAL);
183 		}
184 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
185 		template = copyb(ill->ill_resolver_mp);
186 	}
187 	if (template == NULL) {
188 		freeb(mp);
189 		return (ENOMEM);
190 	}
191 	nce->nce_ill = ill;
192 	nce->nce_ipversion = IPV6_VERSION;
193 	nce->nce_flags = flags;
194 	nce->nce_state = state;
195 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
196 	nce->nce_rcnt = ill->ill_xmit_count;
197 	nce->nce_addr = *addr;
198 	nce->nce_mask = *mask;
199 	nce->nce_extract_mask = *extract_mask;
200 	nce->nce_ll_extract_start = hw_extract_start;
201 	nce->nce_fp_mp = NULL;
202 	nce->nce_res_mp = template;
203 	if (state == ND_REACHABLE)
204 		nce->nce_last = TICK_TO_MSEC(lbolt64);
205 	else
206 		nce->nce_last = 0;
207 	nce->nce_qd_mp = NULL;
208 	nce->nce_mp = mp;
209 	if (hw_addr != NULL)
210 		nce_set_ll(nce, hw_addr);
211 	/* This one is for nce getting created */
212 	nce->nce_refcnt = 1;
213 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
214 	if (nce->nce_flags & NCE_F_MAPPING) {
215 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
216 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
217 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
218 		ncep = &ipst->ips_ndp6->nce_mask_entries;
219 	} else {
220 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
221 	}
222 
223 	nce->nce_trace_disable = B_FALSE;
224 
225 	list_create(&nce->nce_cb, sizeof (nce_cb_t),
226 	    offsetof(nce_cb_t, nce_cb_node));
227 	/*
228 	 * Atomically ensure that the ill is not CONDEMNED, before
229 	 * adding the NCE.
230 	 */
231 	mutex_enter(&ill->ill_lock);
232 	if (ill->ill_state_flags & ILL_CONDEMNED) {
233 		mutex_exit(&ill->ill_lock);
234 		freeb(mp);
235 		freeb(template);
236 		return (EINVAL);
237 	}
238 	if ((nce->nce_next = *ncep) != NULL)
239 		nce->nce_next->nce_ptpn = &nce->nce_next;
240 	*ncep = nce;
241 	nce->nce_ptpn = ncep;
242 	*newnce = nce;
243 	/* This one is for nce being used by an active thread */
244 	NCE_REFHOLD(*newnce);
245 
246 	/* Bump up the number of nce's referencing this ill */
247 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
248 	    (char *), "nce", (void *), nce);
249 	ill->ill_nce_cnt++;
250 	mutex_exit(&ill->ill_lock);
251 
252 	err = 0;
253 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
254 		mutex_enter(&nce->nce_lock);
255 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
256 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
257 		mutex_exit(&nce->nce_lock);
258 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
259 		if (dropped) {
260 			mutex_enter(&nce->nce_lock);
261 			nce->nce_pcnt++;
262 			mutex_exit(&nce->nce_lock);
263 		}
264 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
265 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
266 		err = EINPROGRESS;
267 	} else if (flags & NCE_F_UNSOL_ADV) {
268 		/*
269 		 * We account for the transmit below by assigning one
270 		 * less than the ndd variable. Subsequent decrements
271 		 * are done in ndp_timer.
272 		 */
273 		mutex_enter(&nce->nce_lock);
274 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
275 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
276 		mutex_exit(&nce->nce_lock);
277 		dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast,
278 		    0);
279 		mutex_enter(&nce->nce_lock);
280 		if (dropped)
281 			nce->nce_unsolicit_count++;
282 		if (nce->nce_unsolicit_count != 0) {
283 			ASSERT(nce->nce_timeout_id == 0);
284 			nce->nce_timeout_id = timeout(ndp_timer, nce,
285 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
286 		}
287 		mutex_exit(&nce->nce_lock);
288 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
289 	}
290 
291 	/*
292 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
293 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
294 	 * We call nce_fastpath from nce_update if the link layer address of
295 	 * the peer changes from nce_update
296 	 */
297 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
298 		nce_fastpath(nce);
299 	return (err);
300 }
301 
302 int
303 ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
304     const in6_addr_t *addr, const in6_addr_t *mask,
305     const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags,
306     uint16_t state, nce_t **newnce)
307 {
308 	int	err = 0;
309 	nce_t	*nce;
310 	ip_stack_t	*ipst = ill->ill_ipst;
311 
312 	ASSERT(ill->ill_isv6);
313 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
314 
315 	/* Get head of v6 hash table */
316 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
317 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
318 	if (nce == NULL) {
319 		err = ndp_add_v6(ill,
320 		    hw_addr,
321 		    addr,
322 		    mask,
323 		    extract_mask,
324 		    hw_extract_start,
325 		    flags,
326 		    state,
327 		    newnce);
328 	} else {
329 		*newnce = nce;
330 		err = EEXIST;
331 	}
332 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
333 	return (err);
334 }
335 
336 /*
337  * Remove all the CONDEMNED nces from the appropriate hash table.
338  * We create a private list of NCEs, these may have ires pointing
339  * to them, so the list will be passed through to clean up dependent
340  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
341  */
342 static void
343 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
344 {
345 	nce_t *nce1;
346 	nce_t **ptpn;
347 
348 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
349 	ASSERT(ndp->ndp_g_walker == 0);
350 	for (; nce; nce = nce1) {
351 		nce1 = nce->nce_next;
352 		mutex_enter(&nce->nce_lock);
353 		if (nce->nce_flags & NCE_F_CONDEMNED) {
354 			ptpn = nce->nce_ptpn;
355 			nce1 = nce->nce_next;
356 			if (nce1 != NULL)
357 				nce1->nce_ptpn = ptpn;
358 			*ptpn = nce1;
359 			nce->nce_ptpn = NULL;
360 			nce->nce_next = NULL;
361 			nce->nce_next = *free_nce_list;
362 			*free_nce_list = nce;
363 		}
364 		mutex_exit(&nce->nce_lock);
365 	}
366 }
367 
368 /*
369  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
370  *    will return this NCE. Also no new IREs will be created that
371  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
372  *    be started (See NDP_RESTART_TIMER).
373  * 2. Cancel any currently running timeouts.
374  * 3. If there is an ndp walker, return. The walker will do the cleanup.
375  *    This ensures that walkers see a consistent list of NCEs while walking.
376  * 4. Otherwise remove the NCE from the list of NCEs
377  * 5. Delete all IREs pointing to this NCE.
378  */
379 void
380 ndp_delete(nce_t *nce)
381 {
382 	nce_t	**ptpn;
383 	nce_t	*nce1;
384 	int	ipversion = nce->nce_ipversion;
385 	ndp_g_t *ndp;
386 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
387 
388 	if (ipversion == IPV4_VERSION)
389 		ndp = ipst->ips_ndp4;
390 	else
391 		ndp = ipst->ips_ndp6;
392 
393 	/* Serialize deletes */
394 	mutex_enter(&nce->nce_lock);
395 	if (nce->nce_flags & NCE_F_CONDEMNED) {
396 		/* Some other thread is doing the delete */
397 		mutex_exit(&nce->nce_lock);
398 		return;
399 	}
400 	/*
401 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
402 	 * refcnt has to be >= 2
403 	 */
404 	ASSERT(nce->nce_refcnt >= 2);
405 	nce->nce_flags |= NCE_F_CONDEMNED;
406 	mutex_exit(&nce->nce_lock);
407 
408 	nce_fastpath_list_delete(nce);
409 
410 	/* Complete any waiting callbacks */
411 	nce_cb_dispatch(nce);
412 
413 	/*
414 	 * Cancel any running timer. Timeout can't be restarted
415 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
416 	 * Passing invalid timeout id is fine.
417 	 */
418 	if (nce->nce_timeout_id != 0) {
419 		(void) untimeout(nce->nce_timeout_id);
420 		nce->nce_timeout_id = 0;
421 	}
422 
423 	mutex_enter(&ndp->ndp_g_lock);
424 	if (nce->nce_ptpn == NULL) {
425 		/*
426 		 * The last ndp walker has already removed this nce from
427 		 * the list after we marked the nce CONDEMNED and before
428 		 * we grabbed the global lock.
429 		 */
430 		mutex_exit(&ndp->ndp_g_lock);
431 		return;
432 	}
433 	if (ndp->ndp_g_walker > 0) {
434 		/*
435 		 * Can't unlink. The walker will clean up
436 		 */
437 		ndp->ndp_g_walker_cleanup = B_TRUE;
438 		mutex_exit(&ndp->ndp_g_lock);
439 		return;
440 	}
441 
442 	/*
443 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
444 	 * the timer since it is marked CONDEMNED.
445 	 */
446 	ptpn = nce->nce_ptpn;
447 	nce1 = nce->nce_next;
448 	if (nce1 != NULL)
449 		nce1->nce_ptpn = ptpn;
450 	*ptpn = nce1;
451 	nce->nce_ptpn = NULL;
452 	nce->nce_next = NULL;
453 	mutex_exit(&ndp->ndp_g_lock);
454 
455 	nce_ire_delete(nce);
456 }
457 
458 void
459 ndp_inactive(nce_t *nce)
460 {
461 	mblk_t		**mpp;
462 	ill_t		*ill;
463 
464 	ASSERT(nce->nce_refcnt == 0);
465 	ASSERT(MUTEX_HELD(&nce->nce_lock));
466 	ASSERT(nce->nce_fastpath == NULL);
467 
468 	/* Free all nce allocated messages */
469 	mpp = &nce->nce_first_mp_to_free;
470 	do {
471 		while (*mpp != NULL) {
472 			mblk_t  *mp;
473 
474 			mp = *mpp;
475 			*mpp = mp->b_next;
476 
477 			inet_freemsg(mp);
478 		}
479 	} while (mpp++ != &nce->nce_last_mp_to_free);
480 
481 	if (nce->nce_ipversion == IPV6_VERSION) {
482 		/*
483 		 * must have been cleaned up in nce_delete
484 		 */
485 		ASSERT(list_is_empty(&nce->nce_cb));
486 		list_destroy(&nce->nce_cb);
487 	}
488 #ifdef DEBUG
489 	nce_trace_cleanup(nce);
490 #endif
491 
492 	ill = nce->nce_ill;
493 	mutex_enter(&ill->ill_lock);
494 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
495 	    (char *), "nce", (void *), nce);
496 	ill->ill_nce_cnt--;
497 	/*
498 	 * If the number of nce's associated with this ill have dropped
499 	 * to zero, check whether we need to restart any operation that
500 	 * is waiting for this to happen.
501 	 */
502 	if (ILL_DOWN_OK(ill)) {
503 		/* ipif_ill_refrele_tail drops the ill_lock */
504 		ipif_ill_refrele_tail(ill);
505 	} else {
506 		mutex_exit(&ill->ill_lock);
507 	}
508 	mutex_destroy(&nce->nce_lock);
509 	if (nce->nce_mp != NULL)
510 		inet_freemsg(nce->nce_mp);
511 }
512 
513 /*
514  * ndp_walk routine.  Delete the nce if it is associated with the ill
515  * that is going away.  Always called as a writer.
516  */
517 void
518 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
519 {
520 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
521 		ndp_delete(nce);
522 	}
523 }
524 
525 /*
526  * Walk a list of to be inactive NCEs and blow away all the ires.
527  */
528 static void
529 nce_ire_delete_list(nce_t *nce)
530 {
531 	nce_t *nce_next;
532 
533 	ASSERT(nce != NULL);
534 	while (nce != NULL) {
535 		nce_next = nce->nce_next;
536 		nce->nce_next = NULL;
537 
538 		/*
539 		 * It is possible for the last ndp walker (this thread)
540 		 * to come here after ndp_delete has marked the nce CONDEMNED
541 		 * and before it has removed the nce from the fastpath list
542 		 * or called untimeout. So we need to do it here. It is safe
543 		 * for both ndp_delete and this thread to do it twice or
544 		 * even simultaneously since each of the threads has a
545 		 * reference on the nce.
546 		 */
547 		nce_fastpath_list_delete(nce);
548 		/*
549 		 * Cancel any running timer. Timeout can't be restarted
550 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
551 		 * Passing invalid timeout id is fine.
552 		 */
553 		if (nce->nce_timeout_id != 0) {
554 			(void) untimeout(nce->nce_timeout_id);
555 			nce->nce_timeout_id = 0;
556 		}
557 		/*
558 		 * We might hit this func thus in the v4 case:
559 		 * ipif_down->ipif_ndp_down->ndp_walk
560 		 */
561 
562 		if (nce->nce_ipversion == IPV4_VERSION) {
563 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
564 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
565 		} else {
566 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
567 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
568 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
569 		}
570 		NCE_REFRELE_NOTR(nce);
571 		nce = nce_next;
572 	}
573 }
574 
575 /*
576  * Delete an ire when the nce goes away.
577  */
578 /* ARGSUSED */
579 static void
580 nce_ire_delete(nce_t *nce)
581 {
582 	if (nce->nce_ipversion == IPV6_VERSION) {
583 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
584 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
585 		NCE_REFRELE_NOTR(nce);
586 	} else {
587 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
588 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
589 		NCE_REFRELE_NOTR(nce);
590 	}
591 }
592 
593 /*
594  * ire_walk routine used to delete every IRE that shares this nce
595  */
596 static void
597 nce_ire_delete1(ire_t *ire, char *nce_arg)
598 {
599 	nce_t	*nce = (nce_t *)nce_arg;
600 
601 	ASSERT(ire->ire_type == IRE_CACHE);
602 
603 	if (ire->ire_nce == nce) {
604 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
605 		ire_delete(ire);
606 	}
607 }
608 
609 /*
610  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
611  */
612 boolean_t
613 ndp_restart_dad(nce_t *nce)
614 {
615 	boolean_t started;
616 	boolean_t dropped;
617 
618 	if (nce == NULL)
619 		return (B_FALSE);
620 	mutex_enter(&nce->nce_lock);
621 	if (nce->nce_state == ND_PROBE) {
622 		mutex_exit(&nce->nce_lock);
623 		started = B_TRUE;
624 	} else if (nce->nce_state == ND_REACHABLE) {
625 		nce->nce_state = ND_PROBE;
626 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
627 		mutex_exit(&nce->nce_lock);
628 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
629 		if (dropped) {
630 			mutex_enter(&nce->nce_lock);
631 			nce->nce_pcnt++;
632 			mutex_exit(&nce->nce_lock);
633 		}
634 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
635 		started = B_TRUE;
636 	} else {
637 		mutex_exit(&nce->nce_lock);
638 		started = B_FALSE;
639 	}
640 	return (started);
641 }
642 
643 /*
644  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
645  * If one is found, the refcnt on the nce will be incremented.
646  */
647 nce_t *
648 ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
649     boolean_t caller_holds_lock)
650 {
651 	nce_t	*nce;
652 	ip_stack_t *ipst = ill->ill_ipst;
653 
654 	ASSERT(ill->ill_isv6);
655 	if (!caller_holds_lock)
656 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
657 
658 	/* Get head of v6 hash table */
659 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
660 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
661 	if (nce == NULL)
662 		nce = nce_lookup_mapping(ill, addr);
663 	if (!caller_holds_lock)
664 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
665 	return (nce);
666 }
667 /*
668  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
669  * If one is found, the refcnt on the nce will be incremented.
670  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
671  * so we skip the nce_lookup_mapping call.
672  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
673  */
674 nce_t *
675 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
676 {
677 	nce_t	*nce;
678 	in6_addr_t addr6;
679 	ip_stack_t *ipst = ill->ill_ipst;
680 
681 	if (!caller_holds_lock)
682 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
683 
684 	/* Get head of v4 hash table */
685 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
686 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
687 	/*
688 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
689 	 * looking up have fastpath headers that are inherently per-ill.
690 	 */
691 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
692 	if (!caller_holds_lock)
693 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
694 	return (nce);
695 }
696 
697 /*
698  * Cache entry lookup.  Try to find an nce matching the parameters passed.
699  * Look only for exact entries (no mappings).  If an nce is found, increment
700  * the hold count on that nce. The caller passes in the start of the
701  * appropriate hash table, and must be holding the appropriate global
702  * lock (ndp_g_lock).
703  */
704 static nce_t *
705 nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
706     nce_t *nce)
707 {
708 	ndp_g_t		*ndp;
709 	ip_stack_t	*ipst = ill->ill_ipst;
710 
711 	if (ill->ill_isv6)
712 		ndp = ipst->ips_ndp6;
713 	else
714 		ndp = ipst->ips_ndp4;
715 
716 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
717 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
718 		return (NULL);
719 	for (; nce != NULL; nce = nce->nce_next) {
720 		if (nce->nce_ill == ill ||
721 		    match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) {
722 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
723 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
724 			    &ipv6_all_ones)) {
725 				mutex_enter(&nce->nce_lock);
726 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
727 					NCE_REFHOLD_LOCKED(nce);
728 					mutex_exit(&nce->nce_lock);
729 					break;
730 				}
731 				mutex_exit(&nce->nce_lock);
732 			}
733 		}
734 	}
735 	return (nce);
736 }
737 
738 /*
739  * Cache entry lookup.  Try to find an nce matching the parameters passed.
740  * Look only for mappings.
741  */
742 static nce_t *
743 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
744 {
745 	nce_t	*nce;
746 	ip_stack_t	*ipst = ill->ill_ipst;
747 
748 	ASSERT(ill != NULL && ill->ill_isv6);
749 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
750 	if (!IN6_IS_ADDR_MULTICAST(addr))
751 		return (NULL);
752 	nce = ipst->ips_ndp6->nce_mask_entries;
753 	for (; nce != NULL; nce = nce->nce_next)
754 		if (nce->nce_ill == ill &&
755 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
756 			mutex_enter(&nce->nce_lock);
757 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
758 				NCE_REFHOLD_LOCKED(nce);
759 				mutex_exit(&nce->nce_lock);
760 				break;
761 			}
762 			mutex_exit(&nce->nce_lock);
763 		}
764 	return (nce);
765 }
766 
767 /*
768  * Process passed in parameters either from an incoming packet or via
769  * user ioctl.
770  */
771 static void
772 nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
773 {
774 	ill_t	*ill = nce->nce_ill;
775 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
776 	mblk_t	*mp;
777 	boolean_t ll_updated = B_FALSE;
778 	boolean_t ll_changed;
779 	ip_stack_t	*ipst = ill->ill_ipst;
780 
781 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
782 	/*
783 	 * No updates of link layer address or the neighbor state is
784 	 * allowed, when the cache is in NONUD state.  This still
785 	 * allows for responding to reachability solicitation.
786 	 */
787 	mutex_enter(&nce->nce_lock);
788 	if (nce->nce_state == ND_INCOMPLETE) {
789 		if (hw_addr == NULL) {
790 			mutex_exit(&nce->nce_lock);
791 			return;
792 		}
793 		nce_set_ll(nce, hw_addr);
794 		/*
795 		 * Update nce state and send the queued packets
796 		 * back to ip this time ire will be added.
797 		 */
798 		if (flag & ND_NA_FLAG_SOLICITED) {
799 			nce_update(nce, ND_REACHABLE, NULL);
800 		} else {
801 			nce_update(nce, ND_STALE, NULL);
802 		}
803 		mutex_exit(&nce->nce_lock);
804 		nce_fastpath(nce);
805 		nce_cb_dispatch(nce); /* complete callbacks */
806 		mutex_enter(&nce->nce_lock);
807 		mp = nce->nce_qd_mp;
808 		nce->nce_qd_mp = NULL;
809 		mutex_exit(&nce->nce_lock);
810 		while (mp != NULL) {
811 			mblk_t *nxt_mp, *data_mp;
812 
813 			nxt_mp = mp->b_next;
814 			mp->b_next = NULL;
815 
816 			if (mp->b_datap->db_type == M_CTL)
817 				data_mp = mp->b_cont;
818 			else
819 				data_mp = mp;
820 			if (data_mp->b_prev != NULL) {
821 				ill_t   *inbound_ill;
822 				queue_t *fwdq = NULL;
823 				uint_t ifindex;
824 
825 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
826 				inbound_ill = ill_lookup_on_ifindex(ifindex,
827 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
828 				if (inbound_ill == NULL) {
829 					data_mp->b_prev = NULL;
830 					freemsg(mp);
831 					return;
832 				} else {
833 					fwdq = inbound_ill->ill_rq;
834 				}
835 				data_mp->b_prev = NULL;
836 				/*
837 				 * Send a forwarded packet back into ip_rput_v6
838 				 * just as in ire_send_v6().
839 				 * Extract the queue from b_prev (set in
840 				 * ip_rput_data_v6).
841 				 */
842 				if (fwdq != NULL) {
843 					/*
844 					 * Forwarded packets hop count will
845 					 * get decremented in ip_rput_data_v6
846 					 */
847 					if (data_mp != mp)
848 						freeb(mp);
849 					put(fwdq, data_mp);
850 				} else {
851 					/*
852 					 * Send locally originated packets back
853 					 * into ip_wput_v6.
854 					 */
855 					put(ill->ill_wq, mp);
856 				}
857 				ill_refrele(inbound_ill);
858 			} else {
859 				put(ill->ill_wq, mp);
860 			}
861 			mp = nxt_mp;
862 		}
863 		return;
864 	}
865 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
866 	if (!is_adv) {
867 		/* If this is a SOLICITATION request only */
868 		if (ll_changed)
869 			nce_update(nce, ND_STALE, hw_addr);
870 		mutex_exit(&nce->nce_lock);
871 		nce_cb_dispatch(nce);
872 		return;
873 	}
874 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
875 		/* If in any other state than REACHABLE, ignore */
876 		if (nce->nce_state == ND_REACHABLE) {
877 			nce_update(nce, ND_STALE, NULL);
878 		}
879 		mutex_exit(&nce->nce_lock);
880 		nce_cb_dispatch(nce);
881 		return;
882 	} else {
883 		if (ll_changed) {
884 			nce_update(nce, ND_UNCHANGED, hw_addr);
885 			ll_updated = B_TRUE;
886 		}
887 		if (flag & ND_NA_FLAG_SOLICITED) {
888 			nce_update(nce, ND_REACHABLE, NULL);
889 		} else {
890 			if (ll_updated) {
891 				nce_update(nce, ND_STALE, NULL);
892 			}
893 		}
894 		mutex_exit(&nce->nce_lock);
895 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
896 		    NCE_F_ISROUTER)) {
897 			ire_t *ire;
898 
899 			/*
900 			 * Router turned to host.  We need to remove the
901 			 * entry as well as any default route that may be
902 			 * using this as a next hop.  This is required by
903 			 * section 7.2.5 of RFC 2461.
904 			 */
905 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
906 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
907 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
908 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
909 			    MATCH_IRE_DEFAULT, ipst);
910 			if (ire != NULL) {
911 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
912 				ire_delete(ire);
913 				ire_refrele(ire);
914 			}
915 			ndp_delete(nce); /* will do nce_cb_dispatch */
916 		} else {
917 			nce_cb_dispatch(nce);
918 		}
919 	}
920 }
921 
922 /*
923  * Walker state structure used by ndp_process() / ndp_process_entry().
924  */
925 typedef struct ndp_process_data {
926 	ill_t		*np_ill; 	/* ill/illgrp to match against */
927 	const in6_addr_t *np_addr; 	/* IPv6 address to match */
928 	uchar_t		*np_hw_addr; 	/* passed to nce_process() */
929 	uint32_t	np_flag;	/* passed to nce_process() */
930 	boolean_t	np_is_adv;	/* passed to nce_process() */
931 } ndp_process_data_t;
932 
933 /*
934  * Walker callback used by ndp_process() for IPMP groups: calls nce_process()
935  * for each NCE with a matching address that's in the same IPMP group.
936  */
937 static void
938 ndp_process_entry(nce_t *nce, void *arg)
939 {
940 	ndp_process_data_t *npp = arg;
941 
942 	if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) &&
943 	    IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) &&
944 	    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
945 		nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv);
946 	}
947 }
948 
949 /*
950  * Wrapper around nce_process() that handles IPMP.  In particular, for IPMP,
951  * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have
952  * more than one NCE for a given IPv6 address to tend to.  In that case, we
953  * need to walk all NCEs and callback nce_process() for each one.  Since this
954  * is expensive, in the non-IPMP case we just directly call nce_process().
955  * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP
956  * interfaces in an IPMP group share the same NCEs -- at which point this
957  * function can be removed entirely.
958  */
959 void
960 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
961 {
962 	ill_t *ill = nce->nce_ill;
963 	struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6;
964 	ndp_process_data_t np;
965 
966 	if (ill->ill_grp == NULL) {
967 		nce_process(nce, hw_addr, flag, is_adv);
968 		return;
969 	}
970 
971 	/* IPMP case: walk all NCEs */
972 	np.np_ill = ill;
973 	np.np_addr = &nce->nce_addr;
974 	np.np_flag = flag;
975 	np.np_is_adv = is_adv;
976 	np.np_hw_addr = hw_addr;
977 
978 	ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES);
979 }
980 
981 /*
982  * Pass arg1 to the pfi supplied, along with each nce in existence.
983  * ndp_walk() places a REFHOLD on the nce and drops the lock when
984  * walking the hash list.
985  */
986 void
987 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
988     boolean_t trace)
989 {
990 	nce_t	*nce;
991 	nce_t	*nce1;
992 	nce_t	**ncep;
993 	nce_t	*free_nce_list = NULL;
994 
995 	mutex_enter(&ndp->ndp_g_lock);
996 	/* Prevent ndp_delete from unlink and free of NCE */
997 	ndp->ndp_g_walker++;
998 	mutex_exit(&ndp->ndp_g_lock);
999 	for (ncep = ndp->nce_hash_tbl;
1000 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1001 		for (nce = *ncep; nce != NULL; nce = nce1) {
1002 			nce1 = nce->nce_next;
1003 			if (ill == NULL || nce->nce_ill == ill) {
1004 				if (trace) {
1005 					NCE_REFHOLD(nce);
1006 					(*pfi)(nce, arg1);
1007 					NCE_REFRELE(nce);
1008 				} else {
1009 					NCE_REFHOLD_NOTR(nce);
1010 					(*pfi)(nce, arg1);
1011 					NCE_REFRELE_NOTR(nce);
1012 				}
1013 			}
1014 		}
1015 	}
1016 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
1017 		nce1 = nce->nce_next;
1018 		if (ill == NULL || nce->nce_ill == ill) {
1019 			if (trace) {
1020 				NCE_REFHOLD(nce);
1021 				(*pfi)(nce, arg1);
1022 				NCE_REFRELE(nce);
1023 			} else {
1024 				NCE_REFHOLD_NOTR(nce);
1025 				(*pfi)(nce, arg1);
1026 				NCE_REFRELE_NOTR(nce);
1027 			}
1028 		}
1029 	}
1030 	mutex_enter(&ndp->ndp_g_lock);
1031 	ndp->ndp_g_walker--;
1032 	/*
1033 	 * While NCE's are removed from global list they are placed
1034 	 * in a private list, to be passed to nce_ire_delete_list().
1035 	 * The reason is, there may be ires pointing to this nce
1036 	 * which needs to cleaned up.
1037 	 */
1038 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
1039 		/* Time to delete condemned entries */
1040 		for (ncep = ndp->nce_hash_tbl;
1041 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1042 			nce = *ncep;
1043 			if (nce != NULL) {
1044 				nce_remove(ndp, nce, &free_nce_list);
1045 			}
1046 		}
1047 		nce = ndp->nce_mask_entries;
1048 		if (nce != NULL) {
1049 			nce_remove(ndp, nce, &free_nce_list);
1050 		}
1051 		ndp->ndp_g_walker_cleanup = B_FALSE;
1052 	}
1053 
1054 	mutex_exit(&ndp->ndp_g_lock);
1055 
1056 	if (free_nce_list != NULL) {
1057 		nce_ire_delete_list(free_nce_list);
1058 	}
1059 }
1060 
1061 /*
1062  * Walk everything.
1063  * Note that ill can be NULL hence can't derive the ipst from it.
1064  */
1065 void
1066 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1067 {
1068 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1069 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1070 }
1071 
1072 /*
1073  * Process resolve requests.  Handles both mapped entries
1074  * as well as cases that needs to be send out on the wire.
1075  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1076  * or one is created, we defer making ire point to nce until the
1077  * ire is actually added at which point the nce_refcnt on the nce is
1078  * incremented.  This is done primarily to have symmetry between ire_add()
1079  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1080  */
1081 int
1082 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1083 {
1084 	nce_t		*nce, *hw_nce = NULL;
1085 	int		err;
1086 	ill_t		*ipmp_ill;
1087 	uint16_t	nce_flags;
1088 	mblk_t		*mp_nce = NULL;
1089 	ip_stack_t	*ipst = ill->ill_ipst;
1090 	uchar_t		*hwaddr = NULL;
1091 
1092 	ASSERT(ill->ill_isv6);
1093 
1094 	if (IN6_IS_ADDR_MULTICAST(dst))
1095 		return (nce_set_multicast(ill, dst));
1096 
1097 	nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
1098 
1099 	/*
1100 	 * If `ill' is under IPMP, then first check to see if there's an NCE
1101 	 * for `dst' on the IPMP meta-interface (e.g., because an application
1102 	 * explicitly did an SIOCLIFSETND to tie a hardware address to `dst').
1103 	 * If so, we use that hardware address when creating the NCE below.
1104 	 * Note that we don't yet have a mechanism to remove these NCEs if the
1105 	 * NCE for `dst' on the IPMP meta-interface is subsequently removed --
1106 	 * but rather than build such a beast, we should fix NCEs so that they
1107 	 * can be properly shared across an IPMP group.
1108 	 */
1109 	if (IS_UNDER_IPMP(ill)) {
1110 		if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
1111 			hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE);
1112 			if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) {
1113 				hwaddr = hw_nce->nce_res_mp->b_rptr +
1114 				    NCE_LL_ADDR_OFFSET(ipmp_ill);
1115 				nce_flags |= hw_nce->nce_flags;
1116 			}
1117 			ill_refrele(ipmp_ill);
1118 		}
1119 	}
1120 
1121 	err = ndp_lookup_then_add_v6(ill,
1122 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1123 	    hwaddr,
1124 	    dst,
1125 	    &ipv6_all_ones,
1126 	    &ipv6_all_zeros,
1127 	    0,
1128 	    nce_flags,
1129 	    hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE,
1130 	    &nce);
1131 
1132 	if (hw_nce != NULL)
1133 		NCE_REFRELE(hw_nce);
1134 
1135 	switch (err) {
1136 	case 0:
1137 		/*
1138 		 * New cache entry was created. Make sure that the state
1139 		 * is not ND_INCOMPLETE. It can be in some other state
1140 		 * even before we send out the solicitation as we could
1141 		 * get un-solicited advertisements.
1142 		 *
1143 		 * If this is an XRESOLV interface, simply return 0,
1144 		 * since we don't want to solicit just yet.
1145 		 */
1146 		if (ill->ill_flags & ILLF_XRESOLV) {
1147 			NCE_REFRELE(nce);
1148 			return (0);
1149 		}
1150 
1151 		mutex_enter(&nce->nce_lock);
1152 		if (nce->nce_state != ND_INCOMPLETE) {
1153 			mutex_exit(&nce->nce_lock);
1154 			NCE_REFRELE(nce);
1155 			return (0);
1156 		}
1157 		if (nce->nce_rcnt == 0) {
1158 			/* The caller will free mp */
1159 			mutex_exit(&nce->nce_lock);
1160 			ndp_delete(nce);
1161 			NCE_REFRELE(nce);
1162 			return (ESRCH);
1163 		}
1164 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1165 		if (mp_nce == NULL) {
1166 			/* The caller will free mp */
1167 			mutex_exit(&nce->nce_lock);
1168 			ndp_delete(nce);
1169 			NCE_REFRELE(nce);
1170 			return (ENOMEM);
1171 		}
1172 		nce_queue_mp(nce, mp_nce);
1173 		ip_ndp_resolve(nce);
1174 		mutex_exit(&nce->nce_lock);
1175 		NCE_REFRELE(nce);
1176 		return (EINPROGRESS);
1177 	case EEXIST:
1178 		/* Resolution in progress just queue the packet */
1179 		mutex_enter(&nce->nce_lock);
1180 		if (nce->nce_state == ND_INCOMPLETE) {
1181 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1182 			if (mp_nce == NULL) {
1183 				err = ENOMEM;
1184 			} else {
1185 				nce_queue_mp(nce, mp_nce);
1186 				err = EINPROGRESS;
1187 			}
1188 		} else {
1189 			/*
1190 			 * Any other state implies we have
1191 			 * a nce but IRE needs to be added ...
1192 			 * ire_add_v6() will take care of the
1193 			 * the case when the nce becomes CONDEMNED
1194 			 * before the ire is added to the table.
1195 			 */
1196 			err = 0;
1197 		}
1198 		mutex_exit(&nce->nce_lock);
1199 		NCE_REFRELE(nce);
1200 		break;
1201 	default:
1202 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1203 		break;
1204 	}
1205 	return (err);
1206 }
1207 
1208 /*
1209  * When there is no resolver, the link layer template is passed in
1210  * the IRE.
1211  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1212  * or one is created, we defer making ire point to nce until the
1213  * ire is actually added at which point the nce_refcnt on the nce is
1214  * incremented.  This is done primarily to have symmetry between ire_add()
1215  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1216  */
1217 int
1218 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1219 {
1220 	nce_t		*nce;
1221 	int		err = 0;
1222 
1223 	ASSERT(ill != NULL);
1224 	ASSERT(ill->ill_isv6);
1225 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1226 		err = nce_set_multicast(ill, dst);
1227 		return (err);
1228 	}
1229 
1230 	err = ndp_lookup_then_add_v6(ill,
1231 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1232 	    NULL,	/* hardware address */
1233 	    dst,
1234 	    &ipv6_all_ones,
1235 	    &ipv6_all_zeros,
1236 	    0,
1237 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1238 	    ND_REACHABLE,
1239 	    &nce);
1240 
1241 	switch (err) {
1242 	case 0:
1243 		/*
1244 		 * Cache entry with a proper resolver cookie was
1245 		 * created.
1246 		 */
1247 		NCE_REFRELE(nce);
1248 		break;
1249 	case EEXIST:
1250 		err = 0;
1251 		NCE_REFRELE(nce);
1252 		break;
1253 	default:
1254 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1255 		break;
1256 	}
1257 	return (err);
1258 }
1259 
1260 /*
1261  * For each interface an entry is added for the unspecified multicast group.
1262  * Here that mapping is used to form the multicast cache entry for a particular
1263  * multicast destination.
1264  */
1265 static int
1266 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1267 {
1268 	nce_t		*mnce;	/* Multicast mapping entry */
1269 	nce_t		*nce;
1270 	uchar_t		*hw_addr = NULL;
1271 	int		err = 0;
1272 	ip_stack_t	*ipst = ill->ill_ipst;
1273 
1274 	ASSERT(ill != NULL);
1275 	ASSERT(ill->ill_isv6);
1276 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1277 
1278 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1279 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1280 	nce = nce_lookup_addr(ill, B_FALSE, dst, nce);
1281 	if (nce != NULL) {
1282 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1283 		NCE_REFRELE(nce);
1284 		return (0);
1285 	}
1286 	/* No entry, now lookup for a mapping this should never fail */
1287 	mnce = nce_lookup_mapping(ill, dst);
1288 	if (mnce == NULL) {
1289 		/* Something broken for the interface. */
1290 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1291 		return (ESRCH);
1292 	}
1293 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1294 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1295 		/*
1296 		 * For IRE_IF_RESOLVER a hardware mapping can be
1297 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1298 		 * in the ill is copied in ndp_add_v6().
1299 		 */
1300 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1301 		if (hw_addr == NULL) {
1302 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1303 			NCE_REFRELE(mnce);
1304 			return (ENOMEM);
1305 		}
1306 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1307 	}
1308 	NCE_REFRELE(mnce);
1309 	/*
1310 	 * IRE_IF_NORESOLVER type simply copies the resolution
1311 	 * cookie passed in.  So no hw_addr is needed.
1312 	 */
1313 	err = ndp_add_v6(ill,
1314 	    hw_addr,
1315 	    dst,
1316 	    &ipv6_all_ones,
1317 	    &ipv6_all_zeros,
1318 	    0,
1319 	    NCE_F_NONUD,
1320 	    ND_REACHABLE,
1321 	    &nce);
1322 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1323 	if (hw_addr != NULL)
1324 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1325 	if (err != 0) {
1326 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1327 		return (err);
1328 	}
1329 	NCE_REFRELE(nce);
1330 	return (0);
1331 }
1332 
1333 /*
1334  * Return the link layer address, and any flags of a nce.
1335  */
1336 int
1337 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1338 {
1339 	nce_t		*nce;
1340 	in6_addr_t	*addr;
1341 	sin6_t		*sin6;
1342 	dl_unitdata_req_t	*dl;
1343 
1344 	ASSERT(ill != NULL && ill->ill_isv6);
1345 	sin6 = (sin6_t *)&lnr->lnr_addr;
1346 	addr =  &sin6->sin6_addr;
1347 
1348 	/*
1349 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1350 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1351 	 * addresses for the data addresses on an IPMP interface even though
1352 	 * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill.
1353 	 */
1354 	nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE);
1355 	if (nce == NULL)
1356 		return (ESRCH);
1357 	/* If in INCOMPLETE state, no link layer address is available yet */
1358 	if (!NCE_ISREACHABLE(nce)) {
1359 		NCE_REFRELE(nce);
1360 		return (ESRCH);
1361 	}
1362 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1363 	if (ill->ill_flags & ILLF_XRESOLV)
1364 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1365 	else
1366 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1367 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1368 	    sizeof (lnr->lnr_hdw_addr));
1369 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1370 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1371 	if (nce->nce_flags & NCE_F_ISROUTER)
1372 		lnr->lnr_flags = NDF_ISROUTER_ON;
1373 	if (nce->nce_flags & NCE_F_ANYCAST)
1374 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1375 	NCE_REFRELE(nce);
1376 	return (0);
1377 }
1378 
1379 /*
1380  * Send Enable/Disable multicast reqs to driver.
1381  */
1382 int
1383 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1384     uint32_t hw_addr_offset, mblk_t *mp)
1385 {
1386 	nce_t		*nce;
1387 	uchar_t		*hw_addr;
1388 	ip_stack_t	*ipst = ill->ill_ipst;
1389 
1390 	ASSERT(ill != NULL && ill->ill_isv6);
1391 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1392 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1393 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1394 		freemsg(mp);
1395 		return (EINVAL);
1396 	}
1397 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1398 	nce = nce_lookup_mapping(ill, addr);
1399 	if (nce == NULL) {
1400 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1401 		freemsg(mp);
1402 		return (ESRCH);
1403 	}
1404 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1405 	/*
1406 	 * Update dl_addr_length and dl_addr_offset for primitives that
1407 	 * have physical addresses as opposed to full saps
1408 	 */
1409 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1410 	case DL_ENABMULTI_REQ:
1411 		/* Track the state if this is the first enabmulti */
1412 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1413 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1414 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1415 		break;
1416 	case DL_DISABMULTI_REQ:
1417 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1418 		break;
1419 	default:
1420 		NCE_REFRELE(nce);
1421 		ip1dbg(("ndp_mcastreq: default\n"));
1422 		return (EINVAL);
1423 	}
1424 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1425 	NCE_REFRELE(nce);
1426 	ill_dlpi_send(ill, mp);
1427 	return (0);
1428 }
1429 
1430 
1431 /*
1432  * Send out a NS for resolving the ip address in nce.
1433  */
1434 void
1435 ip_ndp_resolve(nce_t *nce)
1436 {
1437 	in6_addr_t	sender6 = ipv6_all_zeros;
1438 	uint32_t	ms;
1439 	mblk_t		*mp;
1440 	ip6_t		*ip6h;
1441 
1442 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1443 	/*
1444 	 * Pick the src from outgoing packet, if one is available.
1445 	 * Otherwise let nce_xmit figure out the src.
1446 	 */
1447 	if ((mp = nce->nce_qd_mp) != NULL) {
1448 		/* Handle ip_newroute_v6 giving us IPSEC packets */
1449 		if (mp->b_datap->db_type == M_CTL)
1450 			mp = mp->b_cont;
1451 		ip6h = (ip6_t *)mp->b_rptr;
1452 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
1453 			/*
1454 			 * This message should have been pulled up already in
1455 			 * ip_wput_v6. We can't do pullups here because
1456 			 * the message could be from the nce_qd_mp which could
1457 			 * have b_next/b_prev non-NULL.
1458 			 */
1459 			ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
1460 			ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1461 		}
1462 		sender6 = ip6h->ip6_src;
1463 	}
1464 	ms = nce_solicit(nce, sender6);
1465 	mutex_exit(&nce->nce_lock);
1466 	if (ms == 0) {
1467 		if (nce->nce_state != ND_REACHABLE) {
1468 			nce_resolv_failed(nce);
1469 			ndp_delete(nce);
1470 		}
1471 	} else {
1472 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1473 	}
1474 	mutex_enter(&nce->nce_lock);
1475 }
1476 
1477 /*
1478  * Send a neighbor solicitation.
1479  * Returns number of milliseconds after which we should either rexmit or abort.
1480  * Return of zero means we should abort.
1481  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1482  *
1483  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1484  * the packet.
1485  */
1486 uint32_t
1487 nce_solicit(nce_t *nce, in6_addr_t sender)
1488 {
1489 	boolean_t	dropped;
1490 
1491 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
1492 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1493 
1494 	if (nce->nce_rcnt == 0)
1495 		return (0);
1496 
1497 	nce->nce_rcnt--;
1498 	mutex_exit(&nce->nce_lock);
1499 	dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0);
1500 	mutex_enter(&nce->nce_lock);
1501 	if (dropped)
1502 		nce->nce_rcnt++;
1503 	return (nce->nce_ill->ill_reachable_retrans_time);
1504 }
1505 
1506 /*
1507  * Attempt to recover an address on an interface that's been marked as a
1508  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1509  * no easy way to just probe the address and have the right thing happen if
1510  * it's no longer in use.  Instead, we just bring it up normally and allow the
1511  * regular interface start-up logic to probe for a remaining duplicate and take
1512  * us back down if necessary.
1513  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1514  * ip_ndp_excl.
1515  */
1516 /* ARGSUSED */
1517 static void
1518 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1519 {
1520 	ill_t	*ill = rq->q_ptr;
1521 	ipif_t	*ipif;
1522 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1523 
1524 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1525 		/*
1526 		 * We do not support recovery of proxy ARP'd interfaces,
1527 		 * because the system lacks a complete proxy ARP mechanism.
1528 		 */
1529 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1530 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1531 			continue;
1532 		}
1533 
1534 		/*
1535 		 * If we have already recovered or if the interface is going
1536 		 * away, then ignore.
1537 		 */
1538 		mutex_enter(&ill->ill_lock);
1539 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1540 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1541 			mutex_exit(&ill->ill_lock);
1542 			continue;
1543 		}
1544 
1545 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1546 		ill->ill_ipif_dup_count--;
1547 		mutex_exit(&ill->ill_lock);
1548 		ipif->ipif_was_dup = B_TRUE;
1549 
1550 		VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1551 		(void) ipif_up_done_v6(ipif);
1552 	}
1553 	freeb(mp);
1554 }
1555 
1556 /*
1557  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1558  * As long as someone else holds the address, the interface will stay down.
1559  * When that conflict goes away, the interface is brought back up.  This is
1560  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1561  * server will recover from a failure.
1562  *
1563  * For DHCP and temporary addresses, recovery is not done in the kernel.
1564  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1565  *
1566  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1567  */
1568 static void
1569 ipif6_dup_recovery(void *arg)
1570 {
1571 	ipif_t *ipif = arg;
1572 
1573 	ipif->ipif_recovery_id = 0;
1574 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1575 		return;
1576 
1577 	/*
1578 	 * No lock, because this is just an optimization.
1579 	 */
1580 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1581 		return;
1582 
1583 	/* If the link is down, we'll retry this later */
1584 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1585 		return;
1586 
1587 	ndp_do_recovery(ipif);
1588 }
1589 
1590 /*
1591  * Perform interface recovery by forcing the duplicate interfaces up and
1592  * allowing the system to determine which ones should stay up.
1593  *
1594  * Called both by recovery timer expiry and link-up notification.
1595  */
1596 void
1597 ndp_do_recovery(ipif_t *ipif)
1598 {
1599 	ill_t *ill = ipif->ipif_ill;
1600 	mblk_t *mp;
1601 	ip_stack_t *ipst = ill->ill_ipst;
1602 
1603 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1604 	if (mp == NULL) {
1605 		mutex_enter(&ill->ill_lock);
1606 		if (ipif->ipif_recovery_id == 0 &&
1607 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1608 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1609 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1610 		}
1611 		mutex_exit(&ill->ill_lock);
1612 	} else {
1613 		/*
1614 		 * A recovery timer may still be running if we got here from
1615 		 * ill_restart_dad(); cancel that timer.
1616 		 */
1617 		if (ipif->ipif_recovery_id != 0)
1618 			(void) untimeout(ipif->ipif_recovery_id);
1619 		ipif->ipif_recovery_id = 0;
1620 
1621 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1622 		    sizeof (ipif->ipif_v6lcl_addr));
1623 		ill_refhold(ill);
1624 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1625 		    B_FALSE);
1626 	}
1627 }
1628 
1629 /*
1630  * Find the MAC and IP addresses in an NA/NS message.
1631  */
1632 static void
1633 ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp,
1634     uchar_t **haddr, uint_t *haddrlenp)
1635 {
1636 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1637 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1638 	nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
1639 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1640 	uchar_t *addr;
1641 	int alen = 0;
1642 
1643 	if (dl_mp == NULL) {
1644 		nd_opt_hdr_t *opt = NULL;
1645 		int len;
1646 
1647 		/*
1648 		 * If it's from the fast-path, then it can't be a probe
1649 		 * message, and thus must include a linkaddr option.
1650 		 * Extract that here.
1651 		 */
1652 		switch (icmp6->icmp6_type) {
1653 		case ND_NEIGHBOR_SOLICIT:
1654 			len = mp->b_wptr - (uchar_t *)ns;
1655 			if ((len -= sizeof (*ns)) > 0) {
1656 				opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1),
1657 				    len, ND_OPT_SOURCE_LINKADDR);
1658 			}
1659 			break;
1660 		case ND_NEIGHBOR_ADVERT:
1661 			len = mp->b_wptr - (uchar_t *)na;
1662 			if ((len -= sizeof (*na)) > 0) {
1663 				opt = ndp_get_option((nd_opt_hdr_t *)(na + 1),
1664 				    len, ND_OPT_TARGET_LINKADDR);
1665 			}
1666 			break;
1667 		}
1668 
1669 		if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >=
1670 		    ill->ill_nd_lla_len) {
1671 			addr = (uchar_t *)(opt + 1);
1672 			alen = ill->ill_nd_lla_len;
1673 		}
1674 
1675 		/*
1676 		 * We cheat a bit here for the sake of printing usable log
1677 		 * messages in the rare case where the reply we got was unicast
1678 		 * without a source linkaddr option, and the interface is in
1679 		 * fastpath mode.  (Sigh.)
1680 		 */
1681 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1682 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1683 			struct ether_header *pether;
1684 
1685 			pether = (struct ether_header *)((char *)ip6h -
1686 			    sizeof (*pether));
1687 			addr = pether->ether_shost.ether_addr_octet;
1688 			alen = ETHERADDRL;
1689 		}
1690 	} else {
1691 		dl_unitdata_ind_t *dlu;
1692 
1693 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1694 		alen = dlu->dl_src_addr_length;
1695 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1696 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1697 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1698 			if (ill->ill_sap_length < 0) {
1699 				alen += ill->ill_sap_length;
1700 			} else {
1701 				addr += ill->ill_sap_length;
1702 				alen -= ill->ill_sap_length;
1703 			}
1704 		}
1705 	}
1706 
1707 	if (alen > 0) {
1708 		*haddr = addr;
1709 		*haddrlenp = alen;
1710 	} else {
1711 		*haddr = NULL;
1712 		*haddrlenp = 0;
1713 	}
1714 
1715 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1716 	*targp = ns->nd_ns_target;
1717 }
1718 
1719 /*
1720  * This is for exclusive changes due to NDP duplicate address detection
1721  * failure.
1722  */
1723 /* ARGSUSED */
1724 static void
1725 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1726 {
1727 	ill_t	*ill = rq->q_ptr;
1728 	ipif_t	*ipif;
1729 	mblk_t	*dl_mp = NULL;
1730 	uchar_t	*haddr;
1731 	uint_t	haddrlen;
1732 	ip_stack_t *ipst = ill->ill_ipst;
1733 	in6_addr_t targ;
1734 
1735 	if (DB_TYPE(mp) != M_DATA) {
1736 		dl_mp = mp;
1737 		mp = mp->b_cont;
1738 	}
1739 
1740 	ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1741 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1742 		/*
1743 		 * Ignore conflicts generated by misbehaving switches that
1744 		 * just reflect our own messages back to us.  For IPMP, we may
1745 		 * see reflections across any ill in the illgrp.
1746 		 */
1747 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1748 		    IS_UNDER_IPMP(ill) &&
1749 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL)
1750 			goto ignore_conflict;
1751 	}
1752 
1753 	/*
1754 	 * Look up the appropriate ipif.
1755 	 */
1756 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL,
1757 	    NULL, ipst);
1758 	if (ipif == NULL)
1759 		goto ignore_conflict;
1760 
1761 	/* Reload the ill to match the ipif */
1762 	ill = ipif->ipif_ill;
1763 
1764 	/* If it's already duplicate or ineligible, then don't do anything. */
1765 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1766 		ipif_refrele(ipif);
1767 		goto ignore_conflict;
1768 	}
1769 
1770 	/*
1771 	 * If this is a failure during duplicate recovery, then don't
1772 	 * complain.  It may take a long time to recover.
1773 	 */
1774 	if (!ipif->ipif_was_dup) {
1775 		char ibuf[LIFNAMSIZ];
1776 		char hbuf[MAC_STR_LEN];
1777 		char sbuf[INET6_ADDRSTRLEN];
1778 
1779 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1780 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1781 		    " disabled", ibuf,
1782 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1783 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1784 	}
1785 	mutex_enter(&ill->ill_lock);
1786 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1787 	ipif->ipif_flags |= IPIF_DUPLICATE;
1788 	ill->ill_ipif_dup_count++;
1789 	mutex_exit(&ill->ill_lock);
1790 	(void) ipif_down(ipif, NULL, NULL);
1791 	ipif_down_tail(ipif);
1792 	mutex_enter(&ill->ill_lock);
1793 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1794 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1795 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1796 	    ipst->ips_ip_dup_recovery > 0) {
1797 		ASSERT(ipif->ipif_recovery_id == 0);
1798 		ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1799 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1800 	}
1801 	mutex_exit(&ill->ill_lock);
1802 	ipif_refrele(ipif);
1803 ignore_conflict:
1804 	if (dl_mp != NULL)
1805 		freeb(dl_mp);
1806 	freemsg(mp);
1807 }
1808 
1809 /*
1810  * Handle failure by tearing down the ipifs with the specified address.  Note
1811  * that tearing down the ipif also means deleting the nce through ipif_down, so
1812  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1813  * we start a timer on the ipif.
1814  */
1815 static void
1816 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1817 {
1818 	if ((mp = copymsg(mp)) != NULL) {
1819 		if (dl_mp == NULL)
1820 			dl_mp = mp;
1821 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1822 			dl_mp->b_cont = mp;
1823 		if (dl_mp == NULL) {
1824 			freemsg(mp);
1825 		} else {
1826 			ill_refhold(ill);
1827 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1828 			    B_FALSE);
1829 		}
1830 	}
1831 }
1832 
1833 /*
1834  * Handle a discovered conflict: some other system is advertising that it owns
1835  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1836  * interface.
1837  */
1838 static void
1839 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1840 {
1841 	ipif_t *ipif;
1842 	uint32_t now;
1843 	uint_t maxdefense;
1844 	uint_t defs;
1845 	ip_stack_t *ipst = ill->ill_ipst;
1846 
1847 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1848 	    NULL, NULL, ipst);
1849 	if (ipif == NULL)
1850 		return;
1851 
1852 	/*
1853 	 * First, figure out if this address is disposable.
1854 	 */
1855 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1856 		maxdefense = ipst->ips_ip_max_temp_defend;
1857 	else
1858 		maxdefense = ipst->ips_ip_max_defend;
1859 
1860 	/*
1861 	 * Now figure out how many times we've defended ourselves.  Ignore
1862 	 * defenses that happened long in the past.
1863 	 */
1864 	now = gethrestime_sec();
1865 	mutex_enter(&nce->nce_lock);
1866 	if ((defs = nce->nce_defense_count) > 0 &&
1867 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1868 		nce->nce_defense_count = defs = 0;
1869 	}
1870 	nce->nce_defense_count++;
1871 	nce->nce_defense_time = now;
1872 	mutex_exit(&nce->nce_lock);
1873 	ipif_refrele(ipif);
1874 
1875 	/*
1876 	 * If we've defended ourselves too many times already, then give up and
1877 	 * tear down the interface(s) using this address.  Otherwise, defend by
1878 	 * sending out an unsolicited Neighbor Advertisement.
1879 	 */
1880 	if (defs >= maxdefense) {
1881 		ip_ndp_failure(ill, mp, dl_mp);
1882 	} else {
1883 		char hbuf[MAC_STR_LEN];
1884 		char sbuf[INET6_ADDRSTRLEN];
1885 		uchar_t *haddr;
1886 		uint_t haddrlen;
1887 		in6_addr_t targ;
1888 
1889 		ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1890 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1891 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)),
1892 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1893 		    ill->ill_name);
1894 
1895 		(void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0);
1896 	}
1897 }
1898 
1899 static void
1900 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1901 {
1902 	nd_neighbor_solicit_t *ns;
1903 	uint32_t	hlen = ill->ill_nd_lla_len;
1904 	uchar_t		*haddr = NULL;
1905 	icmp6_t		*icmp_nd;
1906 	ip6_t		*ip6h;
1907 	nce_t		*our_nce = NULL;
1908 	in6_addr_t	target;
1909 	in6_addr_t	src;
1910 	int		len;
1911 	int		flag = 0;
1912 	nd_opt_hdr_t	*opt = NULL;
1913 	boolean_t	bad_solicit = B_FALSE;
1914 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1915 
1916 	ip6h = (ip6_t *)mp->b_rptr;
1917 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1918 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1919 	src = ip6h->ip6_src;
1920 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1921 	target = ns->nd_ns_target;
1922 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1923 		if (ip_debug > 2) {
1924 			/* ip1dbg */
1925 			pr_addr_dbg("ndp_input_solicit: Target is"
1926 			    " multicast! %s\n", AF_INET6, &target);
1927 		}
1928 		bad_solicit = B_TRUE;
1929 		goto done;
1930 	}
1931 	if (len > sizeof (nd_neighbor_solicit_t)) {
1932 		/* Options present */
1933 		opt = (nd_opt_hdr_t *)&ns[1];
1934 		len -= sizeof (nd_neighbor_solicit_t);
1935 		if (!ndp_verify_optlen(opt, len)) {
1936 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1937 			bad_solicit = B_TRUE;
1938 			goto done;
1939 		}
1940 
1941 	}
1942 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1943 		/* Check to see if this is a valid DAD solicitation */
1944 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1945 			if (ip_debug > 2) {
1946 				/* ip1dbg */
1947 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1948 				    "Destination is not solicited node "
1949 				    "multicast %s\n", AF_INET6,
1950 				    &ip6h->ip6_dst);
1951 			}
1952 			bad_solicit = B_TRUE;
1953 			goto done;
1954 		}
1955 	}
1956 
1957 	/*
1958 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1959 	 * received this packet if it's multicast) is not the ill tied to
1960 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1961 	 * to ensure we find the associated NCE.
1962 	 */
1963 	our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE);
1964 	/*
1965 	 * If this is a valid Solicitation, a permanent
1966 	 * entry should exist in the cache
1967 	 */
1968 	if (our_nce == NULL ||
1969 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1970 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1971 		    "ifname=%s ", ill->ill_name));
1972 		if (ip_debug > 2) {
1973 			/* ip1dbg */
1974 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1975 		}
1976 		bad_solicit = B_TRUE;
1977 		goto done;
1978 	}
1979 
1980 	/* At this point we should have a verified NS per spec */
1981 	if (opt != NULL) {
1982 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1983 		if (opt != NULL) {
1984 			haddr = (uchar_t *)&opt[1];
1985 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1986 			    hlen == 0) {
1987 				ip1dbg(("ndp_input_solicit: bad SLLA\n"));
1988 				bad_solicit = B_TRUE;
1989 				goto done;
1990 			}
1991 		}
1992 	}
1993 
1994 	/* If sending directly to peer, set the unicast flag */
1995 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1996 		flag |= NDP_UNICAST;
1997 
1998 	/*
1999 	 * Create/update the entry for the soliciting node.
2000 	 * or respond to outstanding queries, don't if
2001 	 * the source is unspecified address.
2002 	 */
2003 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
2004 		int	err;
2005 		nce_t	*nnce;
2006 
2007 		ASSERT(ill->ill_isv6);
2008 		/*
2009 		 * Regular solicitations *must* include the Source Link-Layer
2010 		 * Address option.  Ignore messages that do not.
2011 		 */
2012 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
2013 			ip1dbg(("ndp_input_solicit: source link-layer address "
2014 			    "option missing with a specified source.\n"));
2015 			bad_solicit = B_TRUE;
2016 			goto done;
2017 		}
2018 
2019 		/*
2020 		 * This is a regular solicitation.  If we're still in the
2021 		 * process of verifying the address, then don't respond at all
2022 		 * and don't keep track of the sender.
2023 		 */
2024 		if (our_nce->nce_state == ND_PROBE)
2025 			goto done;
2026 
2027 		/*
2028 		 * If the solicitation doesn't have sender hardware address
2029 		 * (legal for unicast solicitation), then process without
2030 		 * installing the return NCE.  Either we already know it, or
2031 		 * we'll be forced to look it up when (and if) we reply to the
2032 		 * packet.
2033 		 */
2034 		if (haddr == NULL)
2035 			goto no_source;
2036 
2037 		err = ndp_lookup_then_add_v6(ill,
2038 		    B_FALSE,
2039 		    haddr,
2040 		    &src,	/* Soliciting nodes address */
2041 		    &ipv6_all_ones,
2042 		    &ipv6_all_zeros,
2043 		    0,
2044 		    0,
2045 		    ND_STALE,
2046 		    &nnce);
2047 		switch (err) {
2048 		case 0:
2049 			/* done with this entry */
2050 			NCE_REFRELE(nnce);
2051 			break;
2052 		case EEXIST:
2053 			/*
2054 			 * B_FALSE indicates this is not an an advertisement.
2055 			 */
2056 			ndp_process(nnce, haddr, 0, B_FALSE);
2057 			NCE_REFRELE(nnce);
2058 			break;
2059 		default:
2060 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2061 			    err));
2062 			goto done;
2063 		}
2064 no_source:
2065 		flag |= NDP_SOLICITED;
2066 	} else {
2067 		/*
2068 		 * No source link layer address option should be present in a
2069 		 * valid DAD request.
2070 		 */
2071 		if (haddr != NULL) {
2072 			ip1dbg(("ndp_input_solicit: source link-layer address "
2073 			    "option present with an unspecified source.\n"));
2074 			bad_solicit = B_TRUE;
2075 			goto done;
2076 		}
2077 		if (our_nce->nce_state == ND_PROBE) {
2078 			/*
2079 			 * Internally looped-back probes won't have DLPI
2080 			 * attached to them.  External ones (which are sent by
2081 			 * multicast) always will.  Just ignore our own
2082 			 * transmissions.
2083 			 */
2084 			if (dl_mp != NULL) {
2085 				/*
2086 				 * If someone else is probing our address, then
2087 				 * we've crossed wires.  Declare failure.
2088 				 */
2089 				ip_ndp_failure(ill, mp, dl_mp);
2090 			}
2091 			goto done;
2092 		}
2093 		/*
2094 		 * This is a DAD probe.  Multicast the advertisement to the
2095 		 * all-nodes address.
2096 		 */
2097 		src = ipv6_all_hosts_mcast;
2098 	}
2099 	/* Response to a solicitation */
2100 	(void) nce_xmit_advert(our_nce, B_TRUE, &src, flag);
2101 done:
2102 	if (bad_solicit)
2103 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2104 	if (our_nce != NULL)
2105 		NCE_REFRELE(our_nce);
2106 }
2107 
2108 void
2109 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2110 {
2111 	nd_neighbor_advert_t *na;
2112 	uint32_t	hlen = ill->ill_nd_lla_len;
2113 	uchar_t		*haddr = NULL;
2114 	icmp6_t		*icmp_nd;
2115 	ip6_t		*ip6h;
2116 	nce_t		*dst_nce = NULL;
2117 	in6_addr_t	target;
2118 	nd_opt_hdr_t	*opt = NULL;
2119 	int		len;
2120 	ip_stack_t	*ipst = ill->ill_ipst;
2121 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2122 
2123 	ip6h = (ip6_t *)mp->b_rptr;
2124 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2125 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2126 	na = (nd_neighbor_advert_t *)icmp_nd;
2127 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2128 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2129 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2130 		    "solicited flag is not zero\n"));
2131 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2132 		return;
2133 	}
2134 	target = na->nd_na_target;
2135 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2136 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2137 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2138 		return;
2139 	}
2140 	if (len > sizeof (nd_neighbor_advert_t)) {
2141 		opt = (nd_opt_hdr_t *)&na[1];
2142 		if (!ndp_verify_optlen(opt,
2143 		    len - sizeof (nd_neighbor_advert_t))) {
2144 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2145 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2146 			return;
2147 		}
2148 		/* At this point we have a verified NA per spec */
2149 		len -= sizeof (nd_neighbor_advert_t);
2150 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2151 		if (opt != NULL) {
2152 			haddr = (uchar_t *)&opt[1];
2153 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2154 			    hlen == 0) {
2155 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2156 				BUMP_MIB(mib,
2157 				    ipv6IfIcmpInBadNeighborAdvertisements);
2158 				return;
2159 			}
2160 		}
2161 	}
2162 
2163 	/*
2164 	 * NOTE: we match across the illgrp since we need to do DAD for all of
2165 	 * our local addresses, and those are spread across all the active
2166 	 * ills in the group.
2167 	 */
2168 	if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL)
2169 		return;
2170 
2171 	if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2172 		/*
2173 		 * Someone just advertised one of our local addresses.	First,
2174 		 * check it it was us -- if so, we can safely ignore it.
2175 		 */
2176 		if (haddr != NULL) {
2177 			if (!nce_cmp_ll_addr(dst_nce, haddr, hlen))
2178 				goto out;	/* from us -- no conflict */
2179 
2180 			/*
2181 			 * If we're in an IPMP group, check if this is an echo
2182 			 * from another ill in the group.  Use the double-
2183 			 * checked locking pattern to avoid grabbing
2184 			 * ill_g_lock in the non-IPMP case.
2185 			 */
2186 			if (IS_UNDER_IPMP(ill)) {
2187 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2188 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2189 				    ill->ill_grp, haddr, hlen) != NULL) {
2190 					rw_exit(&ipst->ips_ill_g_lock);
2191 					goto out;
2192 				}
2193 				rw_exit(&ipst->ips_ill_g_lock);
2194 			}
2195 		}
2196 
2197 		/*
2198 		 * Our own (looped-back) unsolicited neighbor advertisements
2199 		 * will get here with dl_mp == NULL.  (These will usually be
2200 		 * filtered by the `haddr' checks above, but point-to-point
2201 		 * links have no hardware address and thus make it here.)
2202 		 */
2203 		if (dl_mp == NULL && dst_nce->nce_state != ND_PROBE)
2204 			goto out;
2205 
2206 		/*
2207 		 * This appears to be a real conflict.  If we're trying to
2208 		 * configure this NCE (ND_PROBE), then shut it down.
2209 		 * Otherwise, handle the discovered conflict.
2210 		 *
2211 		 * In the ND_PROBE case, dl_mp might be NULL if we're getting
2212 		 * a unicast reply.  This isn't typically done (multicast is
2213 		 * the norm in response to a probe), but we can handle it.
2214 		 */
2215 		if (dst_nce->nce_state == ND_PROBE)
2216 			ip_ndp_failure(ill, mp, dl_mp);
2217 		else
2218 			ip_ndp_conflict(ill, mp, dl_mp, dst_nce);
2219 	} else {
2220 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2221 			dst_nce->nce_flags |= NCE_F_ISROUTER;
2222 
2223 		/* B_TRUE indicates this an advertisement */
2224 		ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE);
2225 	}
2226 out:
2227 	NCE_REFRELE(dst_nce);
2228 }
2229 
2230 /*
2231  * Process NDP neighbor solicitation/advertisement messages.
2232  * The checksum has already checked o.k before reaching here.
2233  */
2234 void
2235 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2236 {
2237 	icmp6_t		*icmp_nd;
2238 	ip6_t		*ip6h;
2239 	int		len;
2240 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2241 
2242 
2243 	if (!pullupmsg(mp, -1)) {
2244 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2245 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2246 		goto done;
2247 	}
2248 	ip6h = (ip6_t *)mp->b_rptr;
2249 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2250 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2251 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2252 		goto done;
2253 	}
2254 	/*
2255 	 * NDP does not accept any extension headers between the
2256 	 * IP header and the ICMP header since e.g. a routing
2257 	 * header could be dangerous.
2258 	 * This assumes that any AH or ESP headers are removed
2259 	 * by ip prior to passing the packet to ndp_input.
2260 	 */
2261 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2262 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2263 		    ip6h->ip6_nxt));
2264 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2265 		goto done;
2266 	}
2267 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2268 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2269 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2270 	if (icmp_nd->icmp6_code != 0) {
2271 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2272 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2273 		goto done;
2274 	}
2275 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2276 	/*
2277 	 * Make sure packet length is large enough for either
2278 	 * a NS or a NA icmp packet.
2279 	 */
2280 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2281 		ip1dbg(("ndp_input: packet too short\n"));
2282 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2283 		goto done;
2284 	}
2285 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2286 		ndp_input_solicit(ill, mp, dl_mp);
2287 	} else {
2288 		ndp_input_advert(ill, mp, dl_mp);
2289 	}
2290 done:
2291 	freemsg(mp);
2292 }
2293 
2294 /*
2295  * Utility routine to send an advertisement.  Assumes that the NCE cannot
2296  * go away (e.g., because it's refheld).
2297  */
2298 static boolean_t
2299 nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target,
2300     uint_t flags)
2301 {
2302 	ASSERT((flags & NDP_PROBE) == 0);
2303 
2304 	if (nce->nce_flags & NCE_F_ISROUTER)
2305 		flags |= NDP_ISROUTER;
2306 	if (!(nce->nce_flags & NCE_F_ANYCAST))
2307 		flags |= NDP_ORIDE;
2308 
2309 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla,
2310 	    &nce->nce_addr, target, flags));
2311 }
2312 
2313 /*
2314  * Utility routine to send a solicitation.  Assumes that the NCE cannot
2315  * go away (e.g., because it's refheld).
2316  */
2317 static boolean_t
2318 nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
2319     uint_t flags)
2320 {
2321 	if (flags & NDP_PROBE)
2322 		sender = &ipv6_all_zeros;
2323 
2324 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla,
2325 	    sender, &nce->nce_addr, flags));
2326 }
2327 
2328 /*
2329  * nce_xmit is called to form and transmit a ND solicitation or
2330  * advertisement ICMP packet.
2331  *
2332  * If the source address is unspecified and this isn't a probe (used for
2333  * duplicate address detection), an appropriate source address and link layer
2334  * address will be chosen here.  The link layer address option is included if
2335  * the source is specified (i.e., all non-probe packets), and omitted (per the
2336  * specification) otherwise.
2337  *
2338  * It returns B_FALSE only if it does a successful put() to the
2339  * corresponding ill's ill_wq otherwise returns B_TRUE.
2340  */
2341 static boolean_t
2342 nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
2343     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2344 {
2345 	ill_t		*hwaddr_ill;
2346 	uint32_t	len;
2347 	icmp6_t 	*icmp6;
2348 	mblk_t		*mp;
2349 	ip6_t		*ip6h;
2350 	nd_opt_hdr_t	*opt;
2351 	uint_t		plen, maxplen;
2352 	ip6i_t		*ip6i;
2353 	ipif_t		*src_ipif = NULL;
2354 	uint8_t		*hw_addr;
2355 	zoneid_t	zoneid = GLOBAL_ZONEID;
2356 	char		buf[INET6_ADDRSTRLEN];
2357 
2358 	ASSERT(!IS_IPMP(ill));
2359 
2360 	/*
2361 	 * Check that the sender is actually a usable address on `ill', and if
2362 	 * so, track that as the src_ipif.  If not, for solicitations, set the
2363 	 * sender to :: so that a new one will be picked below; for adverts,
2364 	 * drop the packet since we expect nce_xmit_advert() to always provide
2365 	 * a valid sender.
2366 	 */
2367 	if (!IN6_IS_ADDR_UNSPECIFIED(sender)) {
2368 		if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL ||
2369 		    !src_ipif->ipif_addr_ready) {
2370 			if (src_ipif != NULL) {
2371 				ipif_refrele(src_ipif);
2372 				src_ipif = NULL;
2373 			}
2374 			if (type == ND_NEIGHBOR_ADVERT) {
2375 				ip1dbg(("nce_xmit: No source ipif for src %s\n",
2376 				    inet_ntop(AF_INET6, sender, buf,
2377 				    sizeof (buf))));
2378 				return (B_TRUE);
2379 			}
2380 			sender = &ipv6_all_zeros;
2381 		}
2382 	}
2383 
2384 	/*
2385 	 * If we still have an unspecified source (sender) address and this
2386 	 * isn't a probe, select a source address from `ill'.
2387 	 */
2388 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2389 		ASSERT(type != ND_NEIGHBOR_ADVERT);
2390 		/*
2391 		 * Pick a source address for this solicitation, but restrict
2392 		 * the selection to addresses assigned to the output
2393 		 * interface.  We do this because the destination will create
2394 		 * a neighbor cache entry for the source address of this
2395 		 * packet, so the source address needs to be a valid neighbor.
2396 		 */
2397 		src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
2398 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2399 		if (src_ipif == NULL) {
2400 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2401 			    inet_ntop(AF_INET6, target, buf, sizeof (buf))));
2402 			return (B_TRUE);
2403 		}
2404 		sender = &src_ipif->ipif_v6src_addr;
2405 	}
2406 
2407 	/*
2408 	 * We're either sending a probe or we have a source address.
2409 	 */
2410 	ASSERT((flag & NDP_PROBE) || src_ipif != NULL);
2411 
2412 	maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8);
2413 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2414 	    maxplen;
2415 	mp = allocb(len,  BPRI_LO);
2416 	if (mp == NULL) {
2417 		if (src_ipif != NULL)
2418 			ipif_refrele(src_ipif);
2419 		return (B_TRUE);
2420 	}
2421 	bzero((char *)mp->b_rptr, len);
2422 	mp->b_wptr = mp->b_rptr + len;
2423 
2424 	ip6i = (ip6i_t *)mp->b_rptr;
2425 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2426 	ip6i->ip6i_nxt = IPPROTO_RAW;
2427 	ip6i->ip6i_flags = IP6I_HOPLIMIT;
2428 	if (flag & NDP_PROBE)
2429 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2430 
2431 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2432 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2433 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2434 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2435 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2436 	ip6h->ip6_src = *sender;
2437 	ip6h->ip6_dst = *target;
2438 	icmp6 = (icmp6_t *)&ip6h[1];
2439 
2440 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2441 	    sizeof (nd_neighbor_advert_t));
2442 
2443 	if (type == ND_NEIGHBOR_SOLICIT) {
2444 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2445 
2446 		if (!(flag & NDP_PROBE))
2447 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2448 		ns->nd_ns_target = *target;
2449 		if (!(flag & NDP_UNICAST)) {
2450 			/* Form multicast address of the target */
2451 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2452 			ip6h->ip6_dst.s6_addr32[3] |=
2453 			    ns->nd_ns_target.s6_addr32[3];
2454 		}
2455 	} else {
2456 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2457 
2458 		ASSERT(!(flag & NDP_PROBE));
2459 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2460 		na->nd_na_target = *sender;
2461 		if (flag & NDP_ISROUTER)
2462 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2463 		if (flag & NDP_SOLICITED)
2464 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2465 		if (flag & NDP_ORIDE)
2466 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2467 	}
2468 
2469 	hw_addr = NULL;
2470 	if (!(flag & NDP_PROBE)) {
2471 		/*
2472 		 * Use our source address to find the hardware address to put
2473 		 * in the packet, so that the hardware address and IP address
2474 		 * will match up -- even if that hardware address doesn't
2475 		 * match the ill we actually transmit the packet through.
2476 		 */
2477 		if (IS_IPMP(src_ipif->ipif_ill)) {
2478 			hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif);
2479 			if (hwaddr_ill == NULL) {
2480 				ip1dbg(("nce_xmit: no bound ill!\n"));
2481 				ipif_refrele(src_ipif);
2482 				freemsg(mp);
2483 				return (B_TRUE);
2484 			}
2485 		} else {
2486 			hwaddr_ill = src_ipif->ipif_ill;
2487 			ill_refhold(hwaddr_ill);	/* for symmetry */
2488 		}
2489 
2490 		plen = roundup(sizeof (nd_opt_hdr_t) +
2491 		    hwaddr_ill->ill_nd_lla_len, 8);
2492 
2493 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2494 		    hwaddr_ill->ill_phys_addr;
2495 		if (hw_addr != NULL) {
2496 			/* Fill in link layer address and option len */
2497 			opt->nd_opt_len = (uint8_t)(plen / 8);
2498 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2499 		}
2500 
2501 		ill_refrele(hwaddr_ill);
2502 	}
2503 
2504 	if (hw_addr == NULL)
2505 		plen = 0;
2506 
2507 	/* Fix up the length of the packet now that plen is known */
2508 	len -= (maxplen - plen);
2509 	mp->b_wptr = mp->b_rptr + len;
2510 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2511 
2512 	icmp6->icmp6_type = type;
2513 	icmp6->icmp6_code = 0;
2514 	/*
2515 	 * Prepare for checksum by putting icmp length in the icmp
2516 	 * checksum field. The checksum is calculated in ip_wput_v6.
2517 	 */
2518 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2519 
2520 	/*
2521 	 * Before we toss the src_ipif, look up the zoneid to pass to
2522 	 * ip_output_v6().  This is to ensure unicast ND_NEIGHBOR_ADVERT
2523 	 * packets to be routed correctly by IP (we cannot guarantee that the
2524 	 * global zone has an interface route to the destination).
2525 	 */
2526 	if (src_ipif != NULL) {
2527 		if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES)
2528 			zoneid = GLOBAL_ZONEID;
2529 		ipif_refrele(src_ipif);
2530 	}
2531 
2532 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2533 	return (B_FALSE);
2534 }
2535 
2536 /*
2537  * Make a link layer address (does not include the SAP) from an nce.
2538  * To form the link layer address, use the last four bytes of ipv6
2539  * address passed in and the fixed offset stored in nce.
2540  */
2541 static void
2542 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2543 {
2544 	uchar_t *mask, *to;
2545 	ill_t	*ill = nce->nce_ill;
2546 	int 	len;
2547 
2548 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2549 		return;
2550 	ASSERT(nce->nce_res_mp != NULL);
2551 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2552 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2553 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2554 	ASSERT(addr != NULL);
2555 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2556 	    addrpos, ill->ill_nd_lla_len);
2557 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2558 	    IPV6_ADDR_LEN);
2559 	mask = (uchar_t *)&nce->nce_extract_mask;
2560 	mask += (IPV6_ADDR_LEN - len);
2561 	addr += (IPV6_ADDR_LEN - len);
2562 	to = addrpos + nce->nce_ll_extract_start;
2563 	while (len-- > 0)
2564 		*to++ |= *mask++ & *addr++;
2565 }
2566 
2567 mblk_t *
2568 nce_udreq_alloc(ill_t *ill)
2569 {
2570 	mblk_t	*template_mp = NULL;
2571 	dl_unitdata_req_t *dlur;
2572 	int	sap_length;
2573 
2574 	ASSERT(ill->ill_isv6);
2575 
2576 	sap_length = ill->ill_sap_length;
2577 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2578 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2579 	if (template_mp == NULL)
2580 		return (NULL);
2581 
2582 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2583 	dlur->dl_priority.dl_min = 0;
2584 	dlur->dl_priority.dl_max = 0;
2585 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2586 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2587 
2588 	/* Copy in the SAP value. */
2589 	NCE_LL_SAP_COPY(ill, template_mp);
2590 
2591 	return (template_mp);
2592 }
2593 
2594 /*
2595  * NDP retransmit timer.
2596  * This timer goes off when:
2597  * a. It is time to retransmit NS for resolver.
2598  * b. It is time to send reachability probes.
2599  */
2600 void
2601 ndp_timer(void *arg)
2602 {
2603 	nce_t		*nce = arg;
2604 	ill_t		*ill = nce->nce_ill;
2605 	char		addrbuf[INET6_ADDRSTRLEN];
2606 	boolean_t	dropped = B_FALSE;
2607 	ip_stack_t	*ipst = ill->ill_ipst;
2608 
2609 	/*
2610 	 * The timer has to be cancelled by ndp_delete before doing the final
2611 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2612 	 * until it clears the timeout_id. Before clearing the timeout_id
2613 	 * bump up the refcnt so that we can continue to use the nce
2614 	 */
2615 	ASSERT(nce != NULL);
2616 
2617 	mutex_enter(&nce->nce_lock);
2618 	NCE_REFHOLD_LOCKED(nce);
2619 	nce->nce_timeout_id = 0;
2620 
2621 	/*
2622 	 * Check the reachability state first.
2623 	 */
2624 	switch (nce->nce_state) {
2625 	case ND_DELAY:
2626 		nce->nce_state = ND_PROBE;
2627 		mutex_exit(&nce->nce_lock);
2628 		(void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros,
2629 		    NDP_UNICAST);
2630 		if (ip_debug > 3) {
2631 			/* ip2dbg */
2632 			pr_addr_dbg("ndp_timer: state for %s changed "
2633 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2634 		}
2635 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2636 		NCE_REFRELE(nce);
2637 		return;
2638 	case ND_PROBE:
2639 		/* must be retransmit timer */
2640 		nce->nce_pcnt--;
2641 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2642 		    nce->nce_pcnt >= -1);
2643 		if (nce->nce_pcnt > 0) {
2644 			/*
2645 			 * As per RFC2461, the nce gets deleted after
2646 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2647 			 * Note that the first unicast solicitation is sent
2648 			 * during the DELAY state.
2649 			 */
2650 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2651 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2652 			    addrbuf, sizeof (addrbuf))));
2653 			mutex_exit(&nce->nce_lock);
2654 			dropped = nce_xmit_solicit(nce, B_FALSE,
2655 			    &ipv6_all_zeros,
2656 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2657 			    NDP_UNICAST);
2658 			if (dropped) {
2659 				mutex_enter(&nce->nce_lock);
2660 				nce->nce_pcnt++;
2661 				mutex_exit(&nce->nce_lock);
2662 			}
2663 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2664 		} else if (nce->nce_pcnt < 0) {
2665 			/* No hope, delete the nce */
2666 			nce->nce_state = ND_UNREACHABLE;
2667 			mutex_exit(&nce->nce_lock);
2668 			if (ip_debug > 2) {
2669 				/* ip1dbg */
2670 				pr_addr_dbg("ndp_timer: Delete IRE for"
2671 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2672 			}
2673 			ndp_delete(nce);
2674 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2675 			/* Wait RetransTimer, before deleting the entry */
2676 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2677 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2678 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2679 			mutex_exit(&nce->nce_lock);
2680 			/* Wait one interval before killing */
2681 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2682 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2683 			ipif_t *ipif;
2684 
2685 			/*
2686 			 * We're done probing, and we can now declare this
2687 			 * address to be usable.  Let IP know that it's ok to
2688 			 * use.
2689 			 */
2690 			nce->nce_state = ND_REACHABLE;
2691 			mutex_exit(&nce->nce_lock);
2692 			ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr,
2693 			    nce->nce_ill);
2694 			if (ipif != NULL) {
2695 				if (ipif->ipif_was_dup) {
2696 					char ibuf[LIFNAMSIZ + 10];
2697 					char sbuf[INET6_ADDRSTRLEN];
2698 
2699 					ipif->ipif_was_dup = B_FALSE;
2700 					(void) inet_ntop(AF_INET6,
2701 					    &ipif->ipif_v6lcl_addr,
2702 					    sbuf, sizeof (sbuf));
2703 					ipif_get_name(ipif, ibuf,
2704 					    sizeof (ibuf));
2705 					cmn_err(CE_NOTE, "recovered address "
2706 					    "%s on %s", sbuf, ibuf);
2707 				}
2708 				if ((ipif->ipif_flags & IPIF_UP) &&
2709 				    !ipif->ipif_addr_ready)
2710 					ipif_up_notify(ipif);
2711 				ipif->ipif_addr_ready = 1;
2712 				ipif_refrele(ipif);
2713 			}
2714 			/* Begin defending our new address */
2715 			nce->nce_unsolicit_count = 0;
2716 			dropped = nce_xmit_advert(nce, B_FALSE,
2717 			    &ipv6_all_hosts_mcast, 0);
2718 			if (dropped) {
2719 				nce->nce_unsolicit_count = 1;
2720 				NDP_RESTART_TIMER(nce,
2721 				    ipst->ips_ip_ndp_unsolicit_interval);
2722 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2723 				NDP_RESTART_TIMER(nce,
2724 				    ipst->ips_ip_ndp_defense_interval);
2725 			}
2726 		} else {
2727 			/*
2728 			 * This is an address we're probing to be our own, but
2729 			 * the ill is down.  Wait until it comes back before
2730 			 * doing anything, but switch to reachable state so
2731 			 * that the restart will work.
2732 			 */
2733 			nce->nce_state = ND_REACHABLE;
2734 			mutex_exit(&nce->nce_lock);
2735 		}
2736 		NCE_REFRELE(nce);
2737 		return;
2738 	case ND_INCOMPLETE: {
2739 		ip6_t	*ip6h;
2740 		ip6i_t	*ip6i;
2741 		mblk_t	*mp, *datamp, *nextmp, **prevmpp;
2742 
2743 		/*
2744 		 * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp
2745 		 * for any IPMP probe packets, and toss 'em.  IPMP probe
2746 		 * packets will always be at the head of nce_qd_mp and always
2747 		 * have an ip6i_t header, so we can stop at the first queued
2748 		 * ND packet without an ip6i_t.
2749 		 */
2750 		prevmpp = &nce->nce_qd_mp;
2751 		for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) {
2752 			nextmp = mp->b_next;
2753 			datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp;
2754 			ip6h = (ip6_t *)datamp->b_rptr;
2755 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2756 				break;
2757 
2758 			ip6i = (ip6i_t *)ip6h;
2759 			if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) {
2760 				inet_freemsg(mp);
2761 				*prevmpp = nextmp;
2762 			} else {
2763 				prevmpp = &mp->b_next;
2764 			}
2765 		}
2766 		ip_ndp_resolve(nce);
2767 		mutex_exit(&nce->nce_lock);
2768 		NCE_REFRELE(nce);
2769 		break;
2770 	}
2771 	case ND_REACHABLE:
2772 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2773 		    nce->nce_unsolicit_count != 0) ||
2774 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2775 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2776 			if (nce->nce_unsolicit_count > 0)
2777 				nce->nce_unsolicit_count--;
2778 			mutex_exit(&nce->nce_lock);
2779 			dropped = nce_xmit_advert(nce, B_FALSE,
2780 			    &ipv6_all_hosts_mcast, 0);
2781 			if (dropped) {
2782 				mutex_enter(&nce->nce_lock);
2783 				nce->nce_unsolicit_count++;
2784 				mutex_exit(&nce->nce_lock);
2785 			}
2786 			if (nce->nce_unsolicit_count != 0) {
2787 				NDP_RESTART_TIMER(nce,
2788 				    ipst->ips_ip_ndp_unsolicit_interval);
2789 			} else {
2790 				NDP_RESTART_TIMER(nce,
2791 				    ipst->ips_ip_ndp_defense_interval);
2792 			}
2793 		} else {
2794 			mutex_exit(&nce->nce_lock);
2795 		}
2796 		NCE_REFRELE(nce);
2797 		break;
2798 	default:
2799 		mutex_exit(&nce->nce_lock);
2800 		NCE_REFRELE(nce);
2801 		break;
2802 	}
2803 }
2804 
2805 /*
2806  * Set a link layer address from the ll_addr passed in.
2807  * Copy SAP from ill.
2808  */
2809 static void
2810 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2811 {
2812 	ill_t	*ill = nce->nce_ill;
2813 	uchar_t	*woffset;
2814 
2815 	ASSERT(ll_addr != NULL);
2816 	/* Always called before fast_path_probe */
2817 	ASSERT(nce->nce_fp_mp == NULL);
2818 	if (ill->ill_sap_length != 0) {
2819 		/*
2820 		 * Copy the SAP type specified in the
2821 		 * request into the xmit template.
2822 		 */
2823 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2824 	}
2825 	if (ill->ill_phys_addr_length > 0) {
2826 		/*
2827 		 * The bcopy() below used to be called for the physical address
2828 		 * length rather than the link layer address length. For
2829 		 * ethernet and many other media, the phys_addr and lla are
2830 		 * identical.
2831 		 * However, with xresolv interfaces being introduced, the
2832 		 * phys_addr and lla are no longer the same, and the physical
2833 		 * address may not have any useful meaning, so we use the lla
2834 		 * for IPv6 address resolution and destination addressing.
2835 		 *
2836 		 * For PPP or other interfaces with a zero length
2837 		 * physical address, don't do anything here.
2838 		 * The bcopy() with a zero phys_addr length was previously
2839 		 * a no-op for interfaces with a zero-length physical address.
2840 		 * Using the lla for them would change the way they operate.
2841 		 * Doing nothing in such cases preserves expected behavior.
2842 		 */
2843 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2844 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2845 	}
2846 }
2847 
2848 static boolean_t
2849 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2850 {
2851 	ill_t	*ill = nce->nce_ill;
2852 	uchar_t	*ll_offset;
2853 
2854 	ASSERT(nce->nce_res_mp != NULL);
2855 	if (ll_addr == NULL)
2856 		return (B_FALSE);
2857 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2858 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2859 		return (B_TRUE);
2860 	return (B_FALSE);
2861 }
2862 
2863 /*
2864  * Updates the link layer address or the reachability state of
2865  * a cache entry.  Reset probe counter if needed.
2866  */
2867 static void
2868 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2869 {
2870 	ill_t	*ill = nce->nce_ill;
2871 	boolean_t need_stop_timer = B_FALSE;
2872 	boolean_t need_fastpath_update = B_FALSE;
2873 
2874 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2875 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2876 	/*
2877 	 * If this interface does not do NUD, there is no point
2878 	 * in allowing an update to the cache entry.  Although
2879 	 * we will respond to NS.
2880 	 * The only time we accept an update for a resolver when
2881 	 * NUD is turned off is when it has just been created.
2882 	 * Non-Resolvers will always be created as REACHABLE.
2883 	 */
2884 	if (new_state != ND_UNCHANGED) {
2885 		if ((nce->nce_flags & NCE_F_NONUD) &&
2886 		    (nce->nce_state != ND_INCOMPLETE))
2887 			return;
2888 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2889 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2890 		need_stop_timer = B_TRUE;
2891 		if (new_state == ND_REACHABLE)
2892 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2893 		else {
2894 			/* We force NUD in this case */
2895 			nce->nce_last = 0;
2896 		}
2897 		nce->nce_state = new_state;
2898 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2899 	}
2900 	/*
2901 	 * In case of fast path we need to free the the fastpath
2902 	 * M_DATA and do another probe.  Otherwise we can just
2903 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2904 	 * whatever packets that happens to be transmitting at the time.
2905 	 */
2906 	if (new_ll_addr != NULL) {
2907 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2908 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2909 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2910 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2911 		if (nce->nce_fp_mp != NULL) {
2912 			freemsg(nce->nce_fp_mp);
2913 			nce->nce_fp_mp = NULL;
2914 		}
2915 		need_fastpath_update = B_TRUE;
2916 	}
2917 	mutex_exit(&nce->nce_lock);
2918 	if (need_stop_timer) {
2919 		(void) untimeout(nce->nce_timeout_id);
2920 		nce->nce_timeout_id = 0;
2921 	}
2922 	if (need_fastpath_update)
2923 		nce_fastpath(nce);
2924 	mutex_enter(&nce->nce_lock);
2925 }
2926 
2927 void
2928 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2929 {
2930 	uint_t	count = 0;
2931 	mblk_t  **mpp, *tmp;
2932 
2933 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2934 
2935 	for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2936 		if (++count > nce->nce_ill->ill_max_buf) {
2937 			tmp = nce->nce_qd_mp->b_next;
2938 			nce->nce_qd_mp->b_next = NULL;
2939 			nce->nce_qd_mp->b_prev = NULL;
2940 			freemsg(nce->nce_qd_mp);
2941 			nce->nce_qd_mp = tmp;
2942 		}
2943 	}
2944 
2945 	if (head_insert) {
2946 		mp->b_next = nce->nce_qd_mp;
2947 		nce->nce_qd_mp = mp;
2948 	} else {
2949 		*mpp = mp;
2950 	}
2951 }
2952 
2953 static void
2954 nce_queue_mp(nce_t *nce, mblk_t *mp)
2955 {
2956 	boolean_t head_insert = B_FALSE;
2957 	ip6_t	*ip6h;
2958 	ip6i_t  *ip6i;
2959 	mblk_t	*data_mp;
2960 
2961 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2962 
2963 	if (mp->b_datap->db_type == M_CTL)
2964 		data_mp = mp->b_cont;
2965 	else
2966 		data_mp = mp;
2967 	ip6h = (ip6_t *)data_mp->b_rptr;
2968 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2969 		/*
2970 		 * This message should have been pulled up already in
2971 		 * ip_wput_v6. We can't do pullups here because the message
2972 		 * could be from the nce_qd_mp which could have b_next/b_prev
2973 		 * non-NULL.
2974 		 */
2975 		ip6i = (ip6i_t *)ip6h;
2976 		ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
2977 
2978 		/*
2979 		 * If this packet is marked IP6I_IPMP_PROBE, then we need to:
2980 		 *
2981 		 *   1. Insert it at the head of the nce_qd_mp list.  Consider
2982 		 *	the normal (non-probe) load-speading case where the
2983 		 *	source address of the ND packet is not tied to nce_ill.
2984 		 *	If the ill bound to the source address cannot receive,
2985 		 *	the response to the ND packet will not be received.
2986 		 *	However, if ND packets for nce_ill's probes are queued
2987 		 *	behind that ND packet, those probes will also fail to
2988 		 *	be sent, and thus in.mpathd will erroneously conclude
2989 		 *	that nce_ill has also failed.
2990 		 *
2991 		 *   2. Drop the probe packet in ndp_timer() if the ND did
2992 		 *	not succeed on the first attempt.  This ensures that
2993 		 *	ND problems do not manifest as probe RTT spikes.
2994 		 */
2995 		if (ip6i->ip6i_flags & IP6I_IPMP_PROBE)
2996 			head_insert = B_TRUE;
2997 	}
2998 	nce_queue_mp_common(nce, mp, head_insert);
2999 }
3000 
3001 /*
3002  * Called when address resolution failed due to a timeout.
3003  * Send an ICMP unreachable in response to all queued packets.
3004  */
3005 void
3006 nce_resolv_failed(nce_t *nce)
3007 {
3008 	mblk_t	*mp, *nxt_mp, *first_mp;
3009 	char	buf[INET6_ADDRSTRLEN];
3010 	ip6_t *ip6h;
3011 	zoneid_t zoneid = GLOBAL_ZONEID;
3012 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
3013 
3014 	ip1dbg(("nce_resolv_failed: dst %s\n",
3015 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3016 	mutex_enter(&nce->nce_lock);
3017 	mp = nce->nce_qd_mp;
3018 	nce->nce_qd_mp = NULL;
3019 	mutex_exit(&nce->nce_lock);
3020 	while (mp != NULL) {
3021 		nxt_mp = mp->b_next;
3022 		mp->b_next = NULL;
3023 		mp->b_prev = NULL;
3024 
3025 		first_mp = mp;
3026 		if (mp->b_datap->db_type == M_CTL) {
3027 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3028 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3029 			zoneid = io->ipsec_out_zoneid;
3030 			ASSERT(zoneid != ALL_ZONES);
3031 			mp = mp->b_cont;
3032 			mp->b_next = NULL;
3033 			mp->b_prev = NULL;
3034 		}
3035 
3036 		ip6h = (ip6_t *)mp->b_rptr;
3037 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3038 			ip6i_t *ip6i;
3039 			/*
3040 			 * This message should have been pulled up already
3041 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3042 			 * the header is pulled up.
3043 			 */
3044 			ip6i = (ip6i_t *)ip6h;
3045 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3046 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3047 			mp->b_rptr += sizeof (ip6i_t);
3048 		}
3049 		/*
3050 		 * Ignore failure since icmp_unreachable_v6 will silently
3051 		 * drop packets with an unspecified source address.
3052 		 */
3053 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
3054 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3055 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
3056 		mp = nxt_mp;
3057 	}
3058 	nce_cb_dispatch(nce);
3059 }
3060 
/*
 * Called by SIOCSNDP* ioctl to add/change an nce entry
 * and the corresponding attributes.
 * Disallow states other than ND_REACHABLE or ND_STALE.
 *
 * `ill' must be an IPv6 ill; `lnr' carries the address, link-layer address,
 * requested creation state and NDF_* flag changes.
 *
 * Returns 0 on success, EINVAL for a disallowed state, an over-long
 * hardware address or a contradictory ON/OFF flag pair, or the error
 * returned by ndp_add_v6() when a new entry cannot be created.
 */
int
ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
{
	sin6_t		*sin6;
	in6_addr_t	*addr;
	nce_t		*nce;
	int		err;
	uint16_t	new_flags = 0;
	uint16_t	old_flags = 0;
	int		inflags = lnr->lnr_flags;
	ip_stack_t	*ipst = ill->ill_ipst;

	ASSERT(ill->ill_isv6);
	if ((lnr->lnr_state_create != ND_REACHABLE) &&
	    (lnr->lnr_state_create != ND_STALE))
		return (EINVAL);

	if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN)
		return (EINVAL);

	sin6 = (sin6_t *)&lnr->lnr_addr;
	addr = &sin6->sin6_addr;

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	/* We know it can not be mapping so just look in the hash table */
	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
	/* See comment in ndp_query() regarding IS_IPMP(ill) usage */
	nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce);
	/* Start from the existing entry's flags, if there is one. */
	if (nce != NULL)
		new_flags = nce->nce_flags;

	/* Apply the router on/off request; asking for both is an error. */
	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
	case NDF_ISROUTER_ON:
		new_flags |= NCE_F_ISROUTER;
		break;
	case NDF_ISROUTER_OFF:
		new_flags &= ~NCE_F_ISROUTER;
		break;
	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (nce != NULL)
			NCE_REFRELE(nce);
		return (EINVAL);
	}

	/* Apply the anycast on/off request; asking for both is an error. */
	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
	case NDF_ANYCAST_ON:
		new_flags |= NCE_F_ANYCAST;
		break;
	case NDF_ANYCAST_OFF:
		new_flags &= ~NCE_F_ANYCAST;
		break;
	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (nce != NULL)
			NCE_REFRELE(nce);
		return (EINVAL);
	}

	/* No existing entry: create one, still holding ndp_g_lock. */
	if (nce == NULL) {
		err = ndp_add_v6(ill,
		    (uchar_t *)lnr->lnr_hdw_addr,
		    addr,
		    &ipv6_all_ones,
		    &ipv6_all_zeros,
		    0,
		    new_flags,
		    lnr->lnr_state_create,
		    &nce);
		if (err != 0) {
			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
			return (err);
		}
	}
	old_flags = nce->nce_flags;
	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
		/*
		 * Router turned to host, delete all ires.
		 * XXX Just delete the entry, but we need to add too.
		 */
		nce->nce_flags &= ~NCE_F_ISROUTER;
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		ndp_delete(nce);
		NCE_REFRELE(nce);
		return (0);
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);

	/* Install the merged flags under the per-entry lock. */
	mutex_enter(&nce->nce_lock);
	nce->nce_flags = new_flags;
	mutex_exit(&nce->nce_lock);
	/*
	 * Note that we ignore the state at this point, which
	 * should be either STALE or REACHABLE.  Instead we let
	 * the link layer address passed in to determine the state
	 * much like incoming packets.
	 */
	nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
	/* Drop the reference this function has been using. */
	NCE_REFRELE(nce);
	return (0);
}
3168 
3169 /*
3170  * If the device driver supports it, we make nce_fp_mp to have
3171  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3172  * The caller ensures there is hold on nce for this function.
3173  * Note that since ill_fastpath_probe() copies the mblk there is
3174  * no need for the hold beyond this function.
3175  */
3176 void
3177 nce_fastpath(nce_t *nce)
3178 {
3179 	ill_t	*ill = nce->nce_ill;
3180 	int res;
3181 
3182 	ASSERT(ill != NULL);
3183 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3184 
3185 	if (nce->nce_fp_mp != NULL) {
3186 		/* Already contains fastpath info */
3187 		return;
3188 	}
3189 	if (nce->nce_res_mp != NULL) {
3190 		nce_fastpath_list_add(nce);
3191 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3192 		/*
3193 		 * EAGAIN is an indication of a transient error
3194 		 * i.e. allocation failure etc. leave the nce in the list it
3195 		 * will be updated when another probe happens for another ire
3196 		 * if not it will be taken out of the list when the ire is
3197 		 * deleted.
3198 		 */
3199 
3200 		if (res != 0 && res != EAGAIN)
3201 			nce_fastpath_list_delete(nce);
3202 	}
3203 }
3204 
3205 /*
3206  * Drain the list of nce's waiting for fastpath response.
3207  */
3208 void
3209 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3210     void *arg)
3211 {
3212 
3213 	nce_t *next_nce;
3214 	nce_t *current_nce;
3215 	nce_t *first_nce;
3216 	nce_t *prev_nce = NULL;
3217 
3218 	mutex_enter(&ill->ill_lock);
3219 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3220 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3221 		next_nce = current_nce->nce_fastpath;
3222 		/*
3223 		 * Take it off the list if we're flushing, or if the callback
3224 		 * routine tells us to do so.  Otherwise, leave the nce in the
3225 		 * fastpath list to handle any pending response from the lower
3226 		 * layer.  We can't drain the list when the callback routine
3227 		 * comparison failed, because the response is asynchronous in
3228 		 * nature, and may not arrive in the same order as the list
3229 		 * insertion.
3230 		 */
3231 		if (func == NULL || func(current_nce, arg)) {
3232 			current_nce->nce_fastpath = NULL;
3233 			if (current_nce == first_nce)
3234 				ill->ill_fastpath_list = first_nce = next_nce;
3235 			else
3236 				prev_nce->nce_fastpath = next_nce;
3237 		} else {
3238 			/* previous element that is still in the list */
3239 			prev_nce = current_nce;
3240 		}
3241 		current_nce = next_nce;
3242 	}
3243 	mutex_exit(&ill->ill_lock);
3244 }
3245 
3246 /*
3247  * Add nce to the nce fastpath list.
3248  */
3249 void
3250 nce_fastpath_list_add(nce_t *nce)
3251 {
3252 	ill_t *ill;
3253 
3254 	ill = nce->nce_ill;
3255 
3256 	mutex_enter(&ill->ill_lock);
3257 	mutex_enter(&nce->nce_lock);
3258 
3259 	/*
3260 	 * if nce has not been deleted and
3261 	 * is not already in the list add it.
3262 	 */
3263 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3264 	    (nce->nce_fastpath == NULL)) {
3265 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3266 		ill->ill_fastpath_list = nce;
3267 	}
3268 
3269 	mutex_exit(&nce->nce_lock);
3270 	mutex_exit(&ill->ill_lock);
3271 }
3272 
3273 /*
3274  * remove nce from the nce fastpath list.
3275  */
3276 void
3277 nce_fastpath_list_delete(nce_t *nce)
3278 {
3279 	nce_t *nce_ptr;
3280 
3281 	ill_t *ill;
3282 
3283 	ill = nce->nce_ill;
3284 	ASSERT(ill != NULL);
3285 
3286 	mutex_enter(&ill->ill_lock);
3287 	if (nce->nce_fastpath == NULL)
3288 		goto done;
3289 
3290 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3291 
3292 	if (ill->ill_fastpath_list == nce) {
3293 		ill->ill_fastpath_list = nce->nce_fastpath;
3294 	} else {
3295 		nce_ptr = ill->ill_fastpath_list;
3296 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3297 			if (nce_ptr->nce_fastpath == nce) {
3298 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3299 				break;
3300 			}
3301 			nce_ptr = nce_ptr->nce_fastpath;
3302 		}
3303 	}
3304 
3305 	nce->nce_fastpath = NULL;
3306 done:
3307 	mutex_exit(&ill->ill_lock);
3308 }
3309 
3310 /*
3311  * Update all NCE's that are not in fastpath mode and
3312  * have an nce_fp_mp that matches mp. mp->b_cont contains
3313  * the fastpath header.
3314  *
3315  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3316  */
3317 boolean_t
3318 ndp_fastpath_update(nce_t *nce, void *arg)
3319 {
3320 	mblk_t 	*mp, *fp_mp;
3321 	uchar_t	*mp_rptr, *ud_mp_rptr;
3322 	mblk_t	*ud_mp = nce->nce_res_mp;
3323 	ptrdiff_t	cmplen;
3324 
3325 	if (nce->nce_flags & NCE_F_MAPPING)
3326 		return (B_TRUE);
3327 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3328 		return (B_TRUE);
3329 
3330 	ip2dbg(("ndp_fastpath_update: trying\n"));
3331 	mp = (mblk_t *)arg;
3332 	mp_rptr = mp->b_rptr;
3333 	cmplen = mp->b_wptr - mp_rptr;
3334 	ASSERT(cmplen >= 0);
3335 	ud_mp_rptr = ud_mp->b_rptr;
3336 	/*
3337 	 * The nce is locked here to prevent any other threads
3338 	 * from accessing and changing nce_res_mp when the IPv6 address
3339 	 * becomes resolved to an lla while we're in the middle
3340 	 * of looking at and comparing the hardware address (lla).
3341 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3342 	 * from examining nce_res_mp atthe same time.
3343 	 */
3344 	mutex_enter(&nce->nce_lock);
3345 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3346 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3347 		mutex_exit(&nce->nce_lock);
3348 		/*
3349 		 * Don't take the ire off the fastpath list yet,
3350 		 * since the response may come later.
3351 		 */
3352 		return (B_FALSE);
3353 	}
3354 	/* Matched - install mp as the fastpath mp */
3355 	ip1dbg(("ndp_fastpath_update: match\n"));
3356 	fp_mp = dupb(mp->b_cont);
3357 	if (fp_mp != NULL) {
3358 		nce->nce_fp_mp = fp_mp;
3359 	}
3360 	mutex_exit(&nce->nce_lock);
3361 	return (B_TRUE);
3362 }
3363 
3364 /*
3365  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3366  * driver.  Note that it assumes IP is exclusive...
3367  */
3368 /* ARGSUSED */
3369 void
3370 ndp_fastpath_flush(nce_t *nce, char *arg)
3371 {
3372 	if (nce->nce_flags & NCE_F_MAPPING)
3373 		return;
3374 	/* No fastpath info? */
3375 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3376 		return;
3377 
3378 	if (nce->nce_ipversion == IPV4_VERSION &&
3379 	    nce->nce_flags & NCE_F_BCAST) {
3380 		/*
3381 		 * IPv4 BROADCAST entries:
3382 		 * We can't delete the nce since it is difficult to
3383 		 * recreate these without going through the
3384 		 * ipif down/up dance.
3385 		 *
3386 		 * All access to nce->nce_fp_mp in the case of these
3387 		 * is protected by nce_lock.
3388 		 */
3389 		mutex_enter(&nce->nce_lock);
3390 		if (nce->nce_fp_mp != NULL) {
3391 			freeb(nce->nce_fp_mp);
3392 			nce->nce_fp_mp = NULL;
3393 			mutex_exit(&nce->nce_lock);
3394 			nce_fastpath(nce);
3395 		} else {
3396 			mutex_exit(&nce->nce_lock);
3397 		}
3398 	} else {
3399 		/* Just delete the NCE... */
3400 		ndp_delete(nce);
3401 	}
3402 }
3403 
3404 /*
3405  * Return a pointer to a given option in the packet.
3406  * Assumes that option part of the packet have already been validated.
3407  */
3408 nd_opt_hdr_t *
3409 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3410 {
3411 	while (optlen > 0) {
3412 		if (opt->nd_opt_type == opt_type)
3413 			return (opt);
3414 		optlen -= 8 * opt->nd_opt_len;
3415 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3416 	}
3417 	return (NULL);
3418 }
3419 
3420 /*
3421  * Verify all option lengths present are > 0, also check to see
3422  * if the option lengths and packet length are consistent.
3423  */
3424 boolean_t
3425 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3426 {
3427 	ASSERT(opt != NULL);
3428 	while (optlen > 0) {
3429 		if (opt->nd_opt_len == 0)
3430 			return (B_FALSE);
3431 		optlen -= 8 * opt->nd_opt_len;
3432 		if (optlen < 0)
3433 			return (B_FALSE);
3434 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3435 	}
3436 	return (B_TRUE);
3437 }
3438 
3439 /*
3440  * ndp_walk function.
3441  * Free a fraction of the NCE cache entries.
3442  * A fraction of zero means to not free any in that category.
3443  */
3444 void
3445 ndp_cache_reclaim(nce_t *nce, char *arg)
3446 {
3447 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3448 	uint_t	rand;
3449 
3450 	if (nce->nce_flags & NCE_F_PERMANENT)
3451 		return;
3452 
3453 	rand = (uint_t)lbolt +
3454 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3455 	if (ncr->ncr_host != 0 &&
3456 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3457 		ndp_delete(nce);
3458 		return;
3459 	}
3460 }
3461 
3462 /*
3463  * ndp_walk function.
3464  * Count the number of NCEs that can be deleted.
3465  * These would be hosts but not routers.
3466  */
3467 void
3468 ndp_cache_count(nce_t *nce, char *arg)
3469 {
3470 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3471 
3472 	if (nce->nce_flags & NCE_F_PERMANENT)
3473 		return;
3474 
3475 	ncc->ncc_total++;
3476 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3477 		ncc->ncc_host++;
3478 }
3479 
#ifdef DEBUG
/*
 * Record a reference trace for `nce' via th_trace_ref(), unless tracing has
 * been disabled for it.  If th_trace_ref() returns false, tracing is
 * disabled for this nce and nce_trace_cleanup() is called.
 * The caller must hold nce_lock.
 */
void
nce_trace_ref(nce_t *nce)
{
	ASSERT(MUTEX_HELD(&nce->nce_lock));

	if (!nce->nce_trace_disable &&
	    !th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
		nce->nce_trace_disable = B_TRUE;
		nce_trace_cleanup(nce);
	}
}

/*
 * Counterpart of nce_trace_ref(): calls th_trace_unref() unless tracing
 * has been disabled for `nce'.  The caller must hold nce_lock.
 */
void
nce_untrace_ref(nce_t *nce)
{
	ASSERT(MUTEX_HELD(&nce->nce_lock));

	if (nce->nce_trace_disable)
		return;
	th_trace_unref(nce);
}

/*
 * Hand `nce' and its nce_trace_disable setting to th_trace_cleanup().
 */
static void
nce_trace_cleanup(const nce_t *nce)
{
	th_trace_cleanup(nce, nce->nce_trace_disable);
}
#endif
3510 
3511 /*
3512  * Called when address resolution fails due to a timeout.
3513  * Send an ICMP unreachable in response to all queued packets.
3514  */
3515 void
3516 arp_resolv_failed(nce_t *nce)
3517 {
3518 	mblk_t	*mp, *nxt_mp, *first_mp;
3519 	char	buf[INET6_ADDRSTRLEN];
3520 	zoneid_t zoneid = GLOBAL_ZONEID;
3521 	struct in_addr ipv4addr;
3522 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3523 
3524 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3525 	ip3dbg(("arp_resolv_failed: dst %s\n",
3526 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3527 	mutex_enter(&nce->nce_lock);
3528 	mp = nce->nce_qd_mp;
3529 	nce->nce_qd_mp = NULL;
3530 	mutex_exit(&nce->nce_lock);
3531 
3532 	while (mp != NULL) {
3533 		nxt_mp = mp->b_next;
3534 		mp->b_next = NULL;
3535 		mp->b_prev = NULL;
3536 
3537 		first_mp = mp;
3538 		/*
3539 		 * Send icmp unreachable messages
3540 		 * to the hosts.
3541 		 */
3542 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3543 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3544 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3545 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3546 		mp = nxt_mp;
3547 	}
3548 }
3549 
3550 int
3551 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3552     nce_t **newnce, nce_t *src_nce)
3553 {
3554 	int	err;
3555 	nce_t	*nce;
3556 	in6_addr_t addr6;
3557 	ip_stack_t *ipst = ill->ill_ipst;
3558 
3559 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3560 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3561 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3562 	/*
3563 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
3564 	 * looking up have fastpath headers that are inherently per-ill.
3565 	 */
3566 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
3567 	if (nce == NULL) {
3568 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3569 	} else {
3570 		*newnce = nce;
3571 		err = EEXIST;
3572 	}
3573 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3574 	return (err);
3575 }
3576 
3577 /*
3578  * NDP Cache Entry creation routine for IPv4.
3579  * Mapped entries are handled in arp.
3580  * This routine must always be called with ndp4->ndp_g_lock held.
3581  * Prior to return, nce_refcnt is incremented.
3582  */
3583 static int
3584 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3585     nce_t **newnce, nce_t *src_nce)
3586 {
3587 	static	nce_t		nce_nil;
3588 	nce_t		*nce;
3589 	mblk_t		*mp;
3590 	mblk_t		*template = NULL;
3591 	nce_t		**ncep;
3592 	ip_stack_t	*ipst = ill->ill_ipst;
3593 	uint16_t	state = ND_INITIAL;
3594 	int		err;
3595 
3596 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3597 	ASSERT(!ill->ill_isv6);
3598 	ASSERT((flags & NCE_F_MAPPING) == 0);
3599 
3600 	if (ill->ill_resolver_mp == NULL)
3601 		return (EINVAL);
3602 	/*
3603 	 * Allocate the mblk to hold the nce.
3604 	 */
3605 	mp = allocb(sizeof (nce_t), BPRI_MED);
3606 	if (mp == NULL)
3607 		return (ENOMEM);
3608 
3609 	nce = (nce_t *)mp->b_rptr;
3610 	mp->b_wptr = (uchar_t *)&nce[1];
3611 	*nce = nce_nil;
3612 	nce->nce_ill = ill;
3613 	nce->nce_ipversion = IPV4_VERSION;
3614 	nce->nce_flags = flags;
3615 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3616 	nce->nce_rcnt = ill->ill_xmit_count;
3617 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3618 	nce->nce_mask = ipv6_all_ones;
3619 	nce->nce_extract_mask = ipv6_all_zeros;
3620 	nce->nce_ll_extract_start = 0;
3621 	nce->nce_qd_mp = NULL;
3622 	nce->nce_mp = mp;
3623 	/* This one is for nce getting created */
3624 	nce->nce_refcnt = 1;
3625 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3626 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3627 
3628 	nce->nce_trace_disable = B_FALSE;
3629 
3630 	if (src_nce != NULL) {
3631 		/*
3632 		 * src_nce has been provided by the caller. The only
3633 		 * caller who provides a non-null, non-broadcast
3634 		 * src_nce is from ip_newroute() which must pass in
3635 		 * a ND_REACHABLE src_nce (this condition is verified
3636 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3637 		 */
3638 		mutex_enter(&src_nce->nce_lock);
3639 		state = src_nce->nce_state;
3640 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3641 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3642 			/*
3643 			 * src_nce has been deleted, or
3644 			 * ip_arp_news is in the middle of
3645 			 * flushing entries in the the nce.
3646 			 * Fail the add, since we don't know
3647 			 * if it is safe to copy the contents of
3648 			 * src_nce
3649 			 */
3650 			DTRACE_PROBE2(nce__bad__src__nce,
3651 			    nce_t *, src_nce, ill_t *, ill);
3652 			mutex_exit(&src_nce->nce_lock);
3653 			err = EINVAL;
3654 			goto err_ret;
3655 		}
3656 		template = copyb(src_nce->nce_res_mp);
3657 		mutex_exit(&src_nce->nce_lock);
3658 		if (template == NULL) {
3659 			err = ENOMEM;
3660 			goto err_ret;
3661 		}
3662 	} else if (flags & NCE_F_BCAST) {
3663 		/*
3664 		 * broadcast nce.
3665 		 */
3666 		template = copyb(ill->ill_bcast_mp);
3667 		if (template == NULL) {
3668 			err = ENOMEM;
3669 			goto err_ret;
3670 		}
3671 		state = ND_REACHABLE;
3672 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3673 		/*
3674 		 * NORESOLVER entries are always created in the REACHABLE
3675 		 * state. We create a nce_res_mp with the IP nexthop address
3676 		 * in the destination address in the DLPI hdr if the
3677 		 * physical length is exactly 4 bytes.
3678 		 *
3679 		 * XXX not clear which drivers set ill_phys_addr_length to
3680 		 * IP_ADDR_LEN.
3681 		 */
3682 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3683 			template = ill_dlur_gen((uchar_t *)addr,
3684 			    ill->ill_phys_addr_length,
3685 			    ill->ill_sap, ill->ill_sap_length);
3686 		} else {
3687 			template = copyb(ill->ill_resolver_mp);
3688 		}
3689 		if (template == NULL) {
3690 			err = ENOMEM;
3691 			goto err_ret;
3692 		}
3693 		state = ND_REACHABLE;
3694 	}
3695 	nce->nce_fp_mp = NULL;
3696 	nce->nce_res_mp = template;
3697 	nce->nce_state = state;
3698 	if (state == ND_REACHABLE) {
3699 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3700 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3701 	} else {
3702 		nce->nce_last = 0;
3703 		if (state == ND_INITIAL)
3704 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3705 	}
3706 
3707 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3708 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3709 	/*
3710 	 * Atomically ensure that the ill is not CONDEMNED, before
3711 	 * adding the NCE.
3712 	 */
3713 	mutex_enter(&ill->ill_lock);
3714 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3715 		mutex_exit(&ill->ill_lock);
3716 		err = EINVAL;
3717 		goto err_ret;
3718 	}
3719 	if ((nce->nce_next = *ncep) != NULL)
3720 		nce->nce_next->nce_ptpn = &nce->nce_next;
3721 	*ncep = nce;
3722 	nce->nce_ptpn = ncep;
3723 	*newnce = nce;
3724 	/* This one is for nce being used by an active thread */
3725 	NCE_REFHOLD(*newnce);
3726 
3727 	/* Bump up the number of nce's referencing this ill */
3728 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
3729 	    (char *), "nce", (void *), nce);
3730 	ill->ill_nce_cnt++;
3731 	mutex_exit(&ill->ill_lock);
3732 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3733 	return (0);
3734 err_ret:
3735 	freeb(mp);
3736 	freemsg(template);
3737 	return (err);
3738 }
3739 
3740 /*
3741  * ndp_walk routine to delete all entries that have a given destination or
3742  * gateway address and cached link layer (MAC) address.  This is used when ARP
3743  * informs us that a network-to-link-layer mapping may have changed.
3744  */
3745 void
3746 nce_delete_hw_changed(nce_t *nce, void *arg)
3747 {
3748 	nce_hw_map_t *hwm = arg;
3749 	mblk_t *mp;
3750 	dl_unitdata_req_t *dlu;
3751 	uchar_t *macaddr;
3752 	ill_t *ill;
3753 	int saplen;
3754 	ipaddr_t nce_addr;
3755 
3756 	if (nce->nce_state != ND_REACHABLE)
3757 		return;
3758 
3759 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3760 	if (nce_addr != hwm->hwm_addr)
3761 		return;
3762 
3763 	mutex_enter(&nce->nce_lock);
3764 	if ((mp = nce->nce_res_mp) == NULL) {
3765 		mutex_exit(&nce->nce_lock);
3766 		return;
3767 	}
3768 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3769 	macaddr = (uchar_t *)(dlu + 1);
3770 	ill = nce->nce_ill;
3771 	if ((saplen = ill->ill_sap_length) > 0)
3772 		macaddr += saplen;
3773 	else
3774 		saplen = -saplen;
3775 
3776 	/*
3777 	 * If the hardware address is unchanged, then leave this one alone.
3778 	 * Note that saplen == abs(saplen) now.
3779 	 */
3780 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3781 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3782 		mutex_exit(&nce->nce_lock);
3783 		return;
3784 	}
3785 	mutex_exit(&nce->nce_lock);
3786 
3787 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3788 	ndp_delete(nce);
3789 }
3790 
3791 /*
3792  * This function verifies whether a given IPv4 address is potentially known to
3793  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3794  * so that it can continue to look for hardware changes on that address.
3795  */
3796 boolean_t
3797 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3798 {
3799 	nce_t		*nce;
3800 	struct in_addr	nceaddr;
3801 	ip_stack_t	*ipst = ns->netstack_ip;
3802 
3803 	if (addr == INADDR_ANY)
3804 		return (B_FALSE);
3805 
3806 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3807 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3808 	for (; nce != NULL; nce = nce->nce_next) {
3809 		/* Note that only v4 mapped entries are in the table. */
3810 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3811 		if (addr == nceaddr.s_addr &&
3812 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3813 			/* Single flag check; no lock needed */
3814 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3815 				break;
3816 		}
3817 	}
3818 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3819 	return (nce != NULL);
3820 }
3821 
3822 /*
3823  * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly
3824  * with IPMP.  Specifically, since neighbor discovery is always done on
3825  * underlying interfaces (even for addresses owned by an IPMP interface), we
3826  * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface
3827  * associated with `ill' (if it exists).
3828  */
3829 static ipif_t *
3830 ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill)
3831 {
3832 	ipif_t *ipif;
3833 	ip_stack_t *ipst = ill->ill_ipst;
3834 
3835 	ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3836 	if (ipif == NULL && IS_UNDER_IPMP(ill)) {
3837 		if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
3838 			ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3839 			ill_refrele(ill);
3840 		}
3841 	}
3842 	return (ipif);
3843 }
3844