xref: /titanic_44/usr/src/uts/common/inet/ip/ip_ndp.c (revision 2d6b5ea734bb47d251c82670646fde46af15fd69)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ipsec_info.h>
66 #include <inet/sctp_ip.h>
67 
68 /*
69  * Function names with nce_ prefix are static while function
70  * names with ndp_ prefix are used by rest of the IP.
71  *
72  * Lock ordering:
73  *
74  *	ndp_g_lock -> ill_lock -> nce_lock
75  *
76  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
77  * nce_next.  Nce_lock protects the contents of the NCE (particularly
78  * nce_refcnt).
79  */
80 
/*
 * Forward declarations for the file-local helpers used below.
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *,
    nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *, const in6_addr_t *);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint8_t type,
    boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static boolean_t	nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla,
    const in6_addr_t *target, uint_t flags);
static boolean_t	nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla,
    const in6_addr_t *src, uint_t flags);
static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
    nce_t **, nce_t *);
static ipif_t	*ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill);

/* The trace cleanup helper is only declared for DEBUG builds. */
#ifdef DEBUG
static void	nce_trace_cleanup(const nce_t *);
#endif

/*
 * Address of the IPv4 NCE hash bucket for `addr' in the given IP stack:
 * the slot of ips_ndp4->nce_hash_tbl selected by IRE_ADDR_HASH().
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Address of the IPv6 NCE hash bucket for `addr' in the given IP stack:
 * the slot of ips_ndp6->nce_hash_tbl selected by NCE_ADDR_HASH_V6().
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))

/*
 * Non-tunable probe interval, based on link capabilities: 150 when the
 * ill has ill_note_link set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER(); presumably milliseconds -- TODO confirm against
 * the NDP_RESTART_TIMER definition.
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
121 
122 /*
123  * NDP Cache Entry creation routine.
124  * Mapped entries will never do NUD .
125  * This routine must always be called with ndp6->ndp_g_lock held.
126  * Prior to return, nce_refcnt is incremented.
127  */
128 int
129 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
130     const in6_addr_t *mask, const in6_addr_t *extract_mask,
131     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
132     nce_t **newnce)
133 {
134 	static	nce_t		nce_nil;
135 	nce_t		*nce;
136 	mblk_t		*mp;
137 	mblk_t		*template;
138 	nce_t		**ncep;
139 	int		err;
140 	boolean_t	dropped = B_FALSE;
141 	ip_stack_t	*ipst = ill->ill_ipst;
142 
143 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
144 	ASSERT(ill != NULL && ill->ill_isv6);
145 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
146 		ip0dbg(("ndp_add_v6: no addr\n"));
147 		return (EINVAL);
148 	}
149 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
150 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
151 		return (EINVAL);
152 	}
153 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
154 	    (flags & NCE_F_MAPPING)) {
155 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
156 		return (EINVAL);
157 	}
158 	/*
159 	 * Allocate the mblk to hold the nce.
160 	 *
161 	 * XXX This can come out of a separate cache - nce_cache.
162 	 * We don't need the mp anymore as there are no more
163 	 * "qwriter"s
164 	 */
165 	mp = allocb(sizeof (nce_t), BPRI_MED);
166 	if (mp == NULL)
167 		return (ENOMEM);
168 
169 	nce = (nce_t *)mp->b_rptr;
170 	mp->b_wptr = (uchar_t *)&nce[1];
171 	*nce = nce_nil;
172 
173 	/*
174 	 * This one holds link layer address
175 	 */
176 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
177 		template = nce_udreq_alloc(ill);
178 	} else {
179 		if (ill->ill_resolver_mp == NULL) {
180 			freeb(mp);
181 			return (EINVAL);
182 		}
183 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
184 		template = copyb(ill->ill_resolver_mp);
185 	}
186 	if (template == NULL) {
187 		freeb(mp);
188 		return (ENOMEM);
189 	}
190 	nce->nce_ill = ill;
191 	nce->nce_ipversion = IPV6_VERSION;
192 	nce->nce_flags = flags;
193 	nce->nce_state = state;
194 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
195 	nce->nce_rcnt = ill->ill_xmit_count;
196 	nce->nce_addr = *addr;
197 	nce->nce_mask = *mask;
198 	nce->nce_extract_mask = *extract_mask;
199 	nce->nce_ll_extract_start = hw_extract_start;
200 	nce->nce_fp_mp = NULL;
201 	nce->nce_res_mp = template;
202 	if (state == ND_REACHABLE)
203 		nce->nce_last = TICK_TO_MSEC(lbolt64);
204 	else
205 		nce->nce_last = 0;
206 	nce->nce_qd_mp = NULL;
207 	nce->nce_mp = mp;
208 	if (hw_addr != NULL)
209 		nce_set_ll(nce, hw_addr);
210 	/* This one is for nce getting created */
211 	nce->nce_refcnt = 1;
212 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
213 	if (nce->nce_flags & NCE_F_MAPPING) {
214 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
215 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
216 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
217 		ncep = &ipst->ips_ndp6->nce_mask_entries;
218 	} else {
219 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
220 	}
221 
222 	nce->nce_trace_disable = B_FALSE;
223 
224 	/*
225 	 * Atomically ensure that the ill is not CONDEMNED, before
226 	 * adding the NCE.
227 	 */
228 	mutex_enter(&ill->ill_lock);
229 	if (ill->ill_state_flags & ILL_CONDEMNED) {
230 		mutex_exit(&ill->ill_lock);
231 		freeb(mp);
232 		freeb(template);
233 		return (EINVAL);
234 	}
235 	if ((nce->nce_next = *ncep) != NULL)
236 		nce->nce_next->nce_ptpn = &nce->nce_next;
237 	*ncep = nce;
238 	nce->nce_ptpn = ncep;
239 	*newnce = nce;
240 	/* This one is for nce being used by an active thread */
241 	NCE_REFHOLD(*newnce);
242 
243 	/* Bump up the number of nce's referencing this ill */
244 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
245 	    (char *), "nce", (void *), nce);
246 	ill->ill_nce_cnt++;
247 	mutex_exit(&ill->ill_lock);
248 
249 	err = 0;
250 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
251 		mutex_enter(&nce->nce_lock);
252 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
253 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
254 		mutex_exit(&nce->nce_lock);
255 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
256 		if (dropped) {
257 			mutex_enter(&nce->nce_lock);
258 			nce->nce_pcnt++;
259 			mutex_exit(&nce->nce_lock);
260 		}
261 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
262 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
263 		err = EINPROGRESS;
264 	} else if (flags & NCE_F_UNSOL_ADV) {
265 		/*
266 		 * We account for the transmit below by assigning one
267 		 * less than the ndd variable. Subsequent decrements
268 		 * are done in ndp_timer.
269 		 */
270 		mutex_enter(&nce->nce_lock);
271 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
272 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
273 		mutex_exit(&nce->nce_lock);
274 		dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast,
275 		    0);
276 		mutex_enter(&nce->nce_lock);
277 		if (dropped)
278 			nce->nce_unsolicit_count++;
279 		if (nce->nce_unsolicit_count != 0) {
280 			ASSERT(nce->nce_timeout_id == 0);
281 			nce->nce_timeout_id = timeout(ndp_timer, nce,
282 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
283 		}
284 		mutex_exit(&nce->nce_lock);
285 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
286 	}
287 
288 	/*
289 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
290 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
291 	 * We call nce_fastpath from nce_update if the link layer address of
292 	 * the peer changes from nce_update
293 	 */
294 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
295 		nce_fastpath(nce);
296 	return (err);
297 }
298 
299 int
300 ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
301     const in6_addr_t *addr, const in6_addr_t *mask,
302     const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags,
303     uint16_t state, nce_t **newnce)
304 {
305 	int	err = 0;
306 	nce_t	*nce;
307 	ip_stack_t	*ipst = ill->ill_ipst;
308 
309 	ASSERT(ill->ill_isv6);
310 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
311 
312 	/* Get head of v6 hash table */
313 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
314 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
315 	if (nce == NULL) {
316 		err = ndp_add_v6(ill,
317 		    hw_addr,
318 		    addr,
319 		    mask,
320 		    extract_mask,
321 		    hw_extract_start,
322 		    flags,
323 		    state,
324 		    newnce);
325 	} else {
326 		*newnce = nce;
327 		err = EEXIST;
328 	}
329 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
330 	return (err);
331 }
332 
333 /*
334  * Remove all the CONDEMNED nces from the appropriate hash table.
335  * We create a private list of NCEs, these may have ires pointing
336  * to them, so the list will be passed through to clean up dependent
337  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
338  */
339 static void
340 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
341 {
342 	nce_t *nce1;
343 	nce_t **ptpn;
344 
345 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
346 	ASSERT(ndp->ndp_g_walker == 0);
347 	for (; nce; nce = nce1) {
348 		nce1 = nce->nce_next;
349 		mutex_enter(&nce->nce_lock);
350 		if (nce->nce_flags & NCE_F_CONDEMNED) {
351 			ptpn = nce->nce_ptpn;
352 			nce1 = nce->nce_next;
353 			if (nce1 != NULL)
354 				nce1->nce_ptpn = ptpn;
355 			*ptpn = nce1;
356 			nce->nce_ptpn = NULL;
357 			nce->nce_next = NULL;
358 			nce->nce_next = *free_nce_list;
359 			*free_nce_list = nce;
360 		}
361 		mutex_exit(&nce->nce_lock);
362 	}
363 }
364 
365 /*
366  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
367  *    will return this NCE. Also no new IREs will be created that
368  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
369  *    be started (See NDP_RESTART_TIMER).
370  * 2. Cancel any currently running timeouts.
371  * 3. If there is an ndp walker, return. The walker will do the cleanup.
372  *    This ensures that walkers see a consistent list of NCEs while walking.
373  * 4. Otherwise remove the NCE from the list of NCEs
374  * 5. Delete all IREs pointing to this NCE.
375  */
376 void
377 ndp_delete(nce_t *nce)
378 {
379 	nce_t	**ptpn;
380 	nce_t	*nce1;
381 	int	ipversion = nce->nce_ipversion;
382 	ndp_g_t *ndp;
383 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
384 
385 	if (ipversion == IPV4_VERSION)
386 		ndp = ipst->ips_ndp4;
387 	else
388 		ndp = ipst->ips_ndp6;
389 
390 	/* Serialize deletes */
391 	mutex_enter(&nce->nce_lock);
392 	if (nce->nce_flags & NCE_F_CONDEMNED) {
393 		/* Some other thread is doing the delete */
394 		mutex_exit(&nce->nce_lock);
395 		return;
396 	}
397 	/*
398 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
399 	 * refcnt has to be >= 2
400 	 */
401 	ASSERT(nce->nce_refcnt >= 2);
402 	nce->nce_flags |= NCE_F_CONDEMNED;
403 	mutex_exit(&nce->nce_lock);
404 
405 	nce_fastpath_list_delete(nce);
406 
407 	/*
408 	 * Cancel any running timer. Timeout can't be restarted
409 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
410 	 * Passing invalid timeout id is fine.
411 	 */
412 	if (nce->nce_timeout_id != 0) {
413 		(void) untimeout(nce->nce_timeout_id);
414 		nce->nce_timeout_id = 0;
415 	}
416 
417 	mutex_enter(&ndp->ndp_g_lock);
418 	if (nce->nce_ptpn == NULL) {
419 		/*
420 		 * The last ndp walker has already removed this nce from
421 		 * the list after we marked the nce CONDEMNED and before
422 		 * we grabbed the global lock.
423 		 */
424 		mutex_exit(&ndp->ndp_g_lock);
425 		return;
426 	}
427 	if (ndp->ndp_g_walker > 0) {
428 		/*
429 		 * Can't unlink. The walker will clean up
430 		 */
431 		ndp->ndp_g_walker_cleanup = B_TRUE;
432 		mutex_exit(&ndp->ndp_g_lock);
433 		return;
434 	}
435 
436 	/*
437 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
438 	 * the timer since it is marked CONDEMNED.
439 	 */
440 	ptpn = nce->nce_ptpn;
441 	nce1 = nce->nce_next;
442 	if (nce1 != NULL)
443 		nce1->nce_ptpn = ptpn;
444 	*ptpn = nce1;
445 	nce->nce_ptpn = NULL;
446 	nce->nce_next = NULL;
447 	mutex_exit(&ndp->ndp_g_lock);
448 
449 	nce_ire_delete(nce);
450 }
451 
452 void
453 ndp_inactive(nce_t *nce)
454 {
455 	mblk_t		**mpp;
456 	ill_t		*ill;
457 
458 	ASSERT(nce->nce_refcnt == 0);
459 	ASSERT(MUTEX_HELD(&nce->nce_lock));
460 	ASSERT(nce->nce_fastpath == NULL);
461 
462 	/* Free all nce allocated messages */
463 	mpp = &nce->nce_first_mp_to_free;
464 	do {
465 		while (*mpp != NULL) {
466 			mblk_t  *mp;
467 
468 			mp = *mpp;
469 			*mpp = mp->b_next;
470 
471 			inet_freemsg(mp);
472 		}
473 	} while (mpp++ != &nce->nce_last_mp_to_free);
474 
475 #ifdef DEBUG
476 	nce_trace_cleanup(nce);
477 #endif
478 
479 	ill = nce->nce_ill;
480 	mutex_enter(&ill->ill_lock);
481 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
482 	    (char *), "nce", (void *), nce);
483 	ill->ill_nce_cnt--;
484 	/*
485 	 * If the number of nce's associated with this ill have dropped
486 	 * to zero, check whether we need to restart any operation that
487 	 * is waiting for this to happen.
488 	 */
489 	if (ILL_DOWN_OK(ill)) {
490 		/* ipif_ill_refrele_tail drops the ill_lock */
491 		ipif_ill_refrele_tail(ill);
492 	} else {
493 		mutex_exit(&ill->ill_lock);
494 	}
495 	mutex_destroy(&nce->nce_lock);
496 	if (nce->nce_mp != NULL)
497 		inet_freemsg(nce->nce_mp);
498 }
499 
500 /*
501  * ndp_walk routine.  Delete the nce if it is associated with the ill
502  * that is going away.  Always called as a writer.
503  */
504 void
505 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
506 {
507 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
508 		ndp_delete(nce);
509 	}
510 }
511 
512 /*
513  * Walk a list of to be inactive NCEs and blow away all the ires.
514  */
515 static void
516 nce_ire_delete_list(nce_t *nce)
517 {
518 	nce_t *nce_next;
519 
520 	ASSERT(nce != NULL);
521 	while (nce != NULL) {
522 		nce_next = nce->nce_next;
523 		nce->nce_next = NULL;
524 
525 		/*
526 		 * It is possible for the last ndp walker (this thread)
527 		 * to come here after ndp_delete has marked the nce CONDEMNED
528 		 * and before it has removed the nce from the fastpath list
529 		 * or called untimeout. So we need to do it here. It is safe
530 		 * for both ndp_delete and this thread to do it twice or
531 		 * even simultaneously since each of the threads has a
532 		 * reference on the nce.
533 		 */
534 		nce_fastpath_list_delete(nce);
535 		/*
536 		 * Cancel any running timer. Timeout can't be restarted
537 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
538 		 * Passing invalid timeout id is fine.
539 		 */
540 		if (nce->nce_timeout_id != 0) {
541 			(void) untimeout(nce->nce_timeout_id);
542 			nce->nce_timeout_id = 0;
543 		}
544 		/*
545 		 * We might hit this func thus in the v4 case:
546 		 * ipif_down->ipif_ndp_down->ndp_walk
547 		 */
548 
549 		if (nce->nce_ipversion == IPV4_VERSION) {
550 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
551 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
552 		} else {
553 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
554 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
555 			    IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
556 		}
557 		NCE_REFRELE_NOTR(nce);
558 		nce = nce_next;
559 	}
560 }
561 
562 /*
563  * Delete an ire when the nce goes away.
564  */
565 /* ARGSUSED */
566 static void
567 nce_ire_delete(nce_t *nce)
568 {
569 	if (nce->nce_ipversion == IPV6_VERSION) {
570 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
571 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
572 		NCE_REFRELE_NOTR(nce);
573 	} else {
574 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
575 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
576 		NCE_REFRELE_NOTR(nce);
577 	}
578 }
579 
580 /*
581  * ire_walk routine used to delete every IRE that shares this nce
582  */
583 static void
584 nce_ire_delete1(ire_t *ire, char *nce_arg)
585 {
586 	nce_t	*nce = (nce_t *)nce_arg;
587 
588 	ASSERT(ire->ire_type == IRE_CACHE);
589 
590 	if (ire->ire_nce == nce) {
591 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
592 		ire_delete(ire);
593 	}
594 }
595 
596 /*
597  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
598  */
599 boolean_t
600 ndp_restart_dad(nce_t *nce)
601 {
602 	boolean_t started;
603 	boolean_t dropped;
604 
605 	if (nce == NULL)
606 		return (B_FALSE);
607 	mutex_enter(&nce->nce_lock);
608 	if (nce->nce_state == ND_PROBE) {
609 		mutex_exit(&nce->nce_lock);
610 		started = B_TRUE;
611 	} else if (nce->nce_state == ND_REACHABLE) {
612 		nce->nce_state = ND_PROBE;
613 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
614 		mutex_exit(&nce->nce_lock);
615 		dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
616 		if (dropped) {
617 			mutex_enter(&nce->nce_lock);
618 			nce->nce_pcnt++;
619 			mutex_exit(&nce->nce_lock);
620 		}
621 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
622 		started = B_TRUE;
623 	} else {
624 		mutex_exit(&nce->nce_lock);
625 		started = B_FALSE;
626 	}
627 	return (started);
628 }
629 
630 /*
631  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
632  * If one is found, the refcnt on the nce will be incremented.
633  */
634 nce_t *
635 ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
636     boolean_t caller_holds_lock)
637 {
638 	nce_t	*nce;
639 	ip_stack_t *ipst = ill->ill_ipst;
640 
641 	ASSERT(ill->ill_isv6);
642 	if (!caller_holds_lock)
643 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
644 
645 	/* Get head of v6 hash table */
646 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
647 	nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
648 	if (nce == NULL)
649 		nce = nce_lookup_mapping(ill, addr);
650 	if (!caller_holds_lock)
651 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
652 	return (nce);
653 }
654 /*
655  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
656  * If one is found, the refcnt on the nce will be incremented.
657  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
658  * so we skip the nce_lookup_mapping call.
659  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
660  */
661 nce_t *
662 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
663 {
664 	nce_t	*nce;
665 	in6_addr_t addr6;
666 	ip_stack_t *ipst = ill->ill_ipst;
667 
668 	if (!caller_holds_lock)
669 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
670 
671 	/* Get head of v4 hash table */
672 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
673 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
674 	/*
675 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
676 	 * looking up have fastpath headers that are inherently per-ill.
677 	 */
678 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
679 	if (!caller_holds_lock)
680 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
681 	return (nce);
682 }
683 
684 /*
685  * Cache entry lookup.  Try to find an nce matching the parameters passed.
686  * Look only for exact entries (no mappings).  If an nce is found, increment
687  * the hold count on that nce. The caller passes in the start of the
688  * appropriate hash table, and must be holding the appropriate global
689  * lock (ndp_g_lock).
690  */
691 static nce_t *
692 nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
693     nce_t *nce)
694 {
695 	ndp_g_t		*ndp;
696 	ip_stack_t	*ipst = ill->ill_ipst;
697 
698 	if (ill->ill_isv6)
699 		ndp = ipst->ips_ndp6;
700 	else
701 		ndp = ipst->ips_ndp4;
702 
703 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
704 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
705 		return (NULL);
706 	for (; nce != NULL; nce = nce->nce_next) {
707 		if (nce->nce_ill == ill ||
708 		    match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) {
709 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
710 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
711 			    &ipv6_all_ones)) {
712 				mutex_enter(&nce->nce_lock);
713 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
714 					NCE_REFHOLD_LOCKED(nce);
715 					mutex_exit(&nce->nce_lock);
716 					break;
717 				}
718 				mutex_exit(&nce->nce_lock);
719 			}
720 		}
721 	}
722 	return (nce);
723 }
724 
725 /*
726  * Cache entry lookup.  Try to find an nce matching the parameters passed.
727  * Look only for mappings.
728  */
729 static nce_t *
730 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
731 {
732 	nce_t	*nce;
733 	ip_stack_t	*ipst = ill->ill_ipst;
734 
735 	ASSERT(ill != NULL && ill->ill_isv6);
736 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
737 	if (!IN6_IS_ADDR_MULTICAST(addr))
738 		return (NULL);
739 	nce = ipst->ips_ndp6->nce_mask_entries;
740 	for (; nce != NULL; nce = nce->nce_next)
741 		if (nce->nce_ill == ill &&
742 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
743 			mutex_enter(&nce->nce_lock);
744 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
745 				NCE_REFHOLD_LOCKED(nce);
746 				mutex_exit(&nce->nce_lock);
747 				break;
748 			}
749 			mutex_exit(&nce->nce_lock);
750 		}
751 	return (nce);
752 }
753 
754 /*
755  * Process passed in parameters either from an incoming packet or via
756  * user ioctl.
757  */
758 static void
759 nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
760 {
761 	ill_t	*ill = nce->nce_ill;
762 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
763 	mblk_t	*mp;
764 	boolean_t ll_updated = B_FALSE;
765 	boolean_t ll_changed;
766 	ip_stack_t	*ipst = ill->ill_ipst;
767 
768 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
769 	/*
770 	 * No updates of link layer address or the neighbor state is
771 	 * allowed, when the cache is in NONUD state.  This still
772 	 * allows for responding to reachability solicitation.
773 	 */
774 	mutex_enter(&nce->nce_lock);
775 	if (nce->nce_state == ND_INCOMPLETE) {
776 		if (hw_addr == NULL) {
777 			mutex_exit(&nce->nce_lock);
778 			return;
779 		}
780 		nce_set_ll(nce, hw_addr);
781 		/*
782 		 * Update nce state and send the queued packets
783 		 * back to ip this time ire will be added.
784 		 */
785 		if (flag & ND_NA_FLAG_SOLICITED) {
786 			nce_update(nce, ND_REACHABLE, NULL);
787 		} else {
788 			nce_update(nce, ND_STALE, NULL);
789 		}
790 		mutex_exit(&nce->nce_lock);
791 		nce_fastpath(nce);
792 		mutex_enter(&nce->nce_lock);
793 		mp = nce->nce_qd_mp;
794 		nce->nce_qd_mp = NULL;
795 		mutex_exit(&nce->nce_lock);
796 		while (mp != NULL) {
797 			mblk_t *nxt_mp, *data_mp;
798 
799 			nxt_mp = mp->b_next;
800 			mp->b_next = NULL;
801 
802 			if (mp->b_datap->db_type == M_CTL)
803 				data_mp = mp->b_cont;
804 			else
805 				data_mp = mp;
806 			if (data_mp->b_prev != NULL) {
807 				ill_t   *inbound_ill;
808 				queue_t *fwdq = NULL;
809 				uint_t ifindex;
810 
811 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
812 				inbound_ill = ill_lookup_on_ifindex(ifindex,
813 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
814 				if (inbound_ill == NULL) {
815 					data_mp->b_prev = NULL;
816 					freemsg(mp);
817 					return;
818 				} else {
819 					fwdq = inbound_ill->ill_rq;
820 				}
821 				data_mp->b_prev = NULL;
822 				/*
823 				 * Send a forwarded packet back into ip_rput_v6
824 				 * just as in ire_send_v6().
825 				 * Extract the queue from b_prev (set in
826 				 * ip_rput_data_v6).
827 				 */
828 				if (fwdq != NULL) {
829 					/*
830 					 * Forwarded packets hop count will
831 					 * get decremented in ip_rput_data_v6
832 					 */
833 					if (data_mp != mp)
834 						freeb(mp);
835 					put(fwdq, data_mp);
836 				} else {
837 					/*
838 					 * Send locally originated packets back
839 					 * into ip_wput_v6.
840 					 */
841 					put(ill->ill_wq, mp);
842 				}
843 				ill_refrele(inbound_ill);
844 			} else {
845 				put(ill->ill_wq, mp);
846 			}
847 			mp = nxt_mp;
848 		}
849 		return;
850 	}
851 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
852 	if (!is_adv) {
853 		/* If this is a SOLICITATION request only */
854 		if (ll_changed)
855 			nce_update(nce, ND_STALE, hw_addr);
856 		mutex_exit(&nce->nce_lock);
857 		return;
858 	}
859 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
860 		/* If in any other state than REACHABLE, ignore */
861 		if (nce->nce_state == ND_REACHABLE) {
862 			nce_update(nce, ND_STALE, NULL);
863 		}
864 		mutex_exit(&nce->nce_lock);
865 		return;
866 	} else {
867 		if (ll_changed) {
868 			nce_update(nce, ND_UNCHANGED, hw_addr);
869 			ll_updated = B_TRUE;
870 		}
871 		if (flag & ND_NA_FLAG_SOLICITED) {
872 			nce_update(nce, ND_REACHABLE, NULL);
873 		} else {
874 			if (ll_updated) {
875 				nce_update(nce, ND_STALE, NULL);
876 			}
877 		}
878 		mutex_exit(&nce->nce_lock);
879 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
880 		    NCE_F_ISROUTER)) {
881 			ire_t *ire;
882 
883 			/*
884 			 * Router turned to host.  We need to remove the
885 			 * entry as well as any default route that may be
886 			 * using this as a next hop.  This is required by
887 			 * section 7.2.5 of RFC 2461.
888 			 */
889 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
890 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
891 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
892 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
893 			    MATCH_IRE_DEFAULT, ipst);
894 			if (ire != NULL) {
895 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
896 				ire_delete(ire);
897 				ire_refrele(ire);
898 			}
899 			ndp_delete(nce);
900 		}
901 	}
902 }
903 
/*
 * Walker state structure used by ndp_process() / ndp_process_entry().
 * ndp_process() fills one in on its stack and hands it to the NCE walker
 * as the callback argument; ndp_process_entry() uses np_ill and np_addr to
 * select NCEs and forwards the remaining fields to nce_process().
 */
typedef struct ndp_process_data {
	ill_t		*np_ill; 	/* ill whose illgrp NCEs must share */
	const in6_addr_t *np_addr; 	/* IPv6 address NCEs must match */
	uchar_t		*np_hw_addr; 	/* hw_addr passed to nce_process() */
	uint32_t	np_flag;	/* flag passed to nce_process() */
	boolean_t	np_is_adv;	/* is_adv passed to nce_process() */
} ndp_process_data_t;
914 
915 /*
916  * Walker callback used by ndp_process() for IPMP groups: calls nce_process()
917  * for each NCE with a matching address that's in the same IPMP group.
918  */
919 static void
920 ndp_process_entry(nce_t *nce, void *arg)
921 {
922 	ndp_process_data_t *npp = arg;
923 
924 	if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) &&
925 	    IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) &&
926 	    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
927 		nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv);
928 	}
929 }
930 
931 /*
932  * Wrapper around nce_process() that handles IPMP.  In particular, for IPMP,
933  * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have
934  * more than one NCE for a given IPv6 address to tend to.  In that case, we
935  * need to walk all NCEs and callback nce_process() for each one.  Since this
936  * is expensive, in the non-IPMP case we just directly call nce_process().
937  * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP
938  * interfaces in an IPMP group share the same NCEs -- at which point this
939  * function can be removed entirely.
940  */
941 void
942 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
943 {
944 	ill_t *ill = nce->nce_ill;
945 	struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6;
946 	ndp_process_data_t np;
947 
948 	if (ill->ill_grp == NULL) {
949 		nce_process(nce, hw_addr, flag, is_adv);
950 		return;
951 	}
952 
953 	/* IPMP case: walk all NCEs */
954 	np.np_ill = ill;
955 	np.np_addr = &nce->nce_addr;
956 	np.np_flag = flag;
957 	np.np_is_adv = is_adv;
958 	np.np_hw_addr = hw_addr;
959 
960 	ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES);
961 }
962 
963 /*
964  * Pass arg1 to the pfi supplied, along with each nce in existence.
965  * ndp_walk() places a REFHOLD on the nce and drops the lock when
966  * walking the hash list.
967  */
968 void
969 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
970     boolean_t trace)
971 {
972 	nce_t	*nce;
973 	nce_t	*nce1;
974 	nce_t	**ncep;
975 	nce_t	*free_nce_list = NULL;
976 
977 	mutex_enter(&ndp->ndp_g_lock);
978 	/* Prevent ndp_delete from unlink and free of NCE */
979 	ndp->ndp_g_walker++;
980 	mutex_exit(&ndp->ndp_g_lock);
981 	for (ncep = ndp->nce_hash_tbl;
982 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
983 		for (nce = *ncep; nce != NULL; nce = nce1) {
984 			nce1 = nce->nce_next;
985 			if (ill == NULL || nce->nce_ill == ill) {
986 				if (trace) {
987 					NCE_REFHOLD(nce);
988 					(*pfi)(nce, arg1);
989 					NCE_REFRELE(nce);
990 				} else {
991 					NCE_REFHOLD_NOTR(nce);
992 					(*pfi)(nce, arg1);
993 					NCE_REFRELE_NOTR(nce);
994 				}
995 			}
996 		}
997 	}
998 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
999 		nce1 = nce->nce_next;
1000 		if (ill == NULL || nce->nce_ill == ill) {
1001 			if (trace) {
1002 				NCE_REFHOLD(nce);
1003 				(*pfi)(nce, arg1);
1004 				NCE_REFRELE(nce);
1005 			} else {
1006 				NCE_REFHOLD_NOTR(nce);
1007 				(*pfi)(nce, arg1);
1008 				NCE_REFRELE_NOTR(nce);
1009 			}
1010 		}
1011 	}
1012 	mutex_enter(&ndp->ndp_g_lock);
1013 	ndp->ndp_g_walker--;
1014 	/*
1015 	 * While NCE's are removed from global list they are placed
1016 	 * in a private list, to be passed to nce_ire_delete_list().
1017 	 * The reason is, there may be ires pointing to this nce
1018 	 * which needs to cleaned up.
1019 	 */
1020 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
1021 		/* Time to delete condemned entries */
1022 		for (ncep = ndp->nce_hash_tbl;
1023 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1024 			nce = *ncep;
1025 			if (nce != NULL) {
1026 				nce_remove(ndp, nce, &free_nce_list);
1027 			}
1028 		}
1029 		nce = ndp->nce_mask_entries;
1030 		if (nce != NULL) {
1031 			nce_remove(ndp, nce, &free_nce_list);
1032 		}
1033 		ndp->ndp_g_walker_cleanup = B_FALSE;
1034 	}
1035 
1036 	mutex_exit(&ndp->ndp_g_lock);
1037 
1038 	if (free_nce_list != NULL) {
1039 		nce_ire_delete_list(free_nce_list);
1040 	}
1041 }
1042 
1043 /*
1044  * Walk everything.
1045  * Note that ill can be NULL hence can't derive the ipst from it.
1046  */
1047 void
1048 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1049 {
1050 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1051 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1052 }
1053 
1054 /*
1055  * Process resolve requests.  Handles both mapped entries
1056  * as well as cases that needs to be send out on the wire.
1057  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1058  * or one is created, we defer making ire point to nce until the
1059  * ire is actually added at which point the nce_refcnt on the nce is
1060  * incremented.  This is done primarily to have symmetry between ire_add()
1061  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1062  */
1063 int
1064 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1065 {
1066 	nce_t		*nce, *hw_nce = NULL;
1067 	int		err;
1068 	ill_t		*ipmp_ill;
1069 	uint16_t	nce_flags;
1070 	uint32_t	ms;
1071 	mblk_t		*mp_nce = NULL;
1072 	ip_stack_t	*ipst = ill->ill_ipst;
1073 	uchar_t		*hwaddr = NULL;
1074 
1075 	ASSERT(ill->ill_isv6);
1076 
1077 	if (IN6_IS_ADDR_MULTICAST(dst))
1078 		return (nce_set_multicast(ill, dst));
1079 
1080 	nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
1081 
1082 	/*
1083 	 * If `ill' is under IPMP, then first check to see if there's an NCE
1084 	 * for `dst' on the IPMP meta-interface (e.g., because an application
1085 	 * explicitly did an SIOCLIFSETND to tie a hardware address to `dst').
1086 	 * If so, we use that hardware address when creating the NCE below.
1087 	 * Note that we don't yet have a mechanism to remove these NCEs if the
1088 	 * NCE for `dst' on the IPMP meta-interface is subsequently removed --
1089 	 * but rather than build such a beast, we should fix NCEs so that they
1090 	 * can be properly shared across an IPMP group.
1091 	 */
1092 	if (IS_UNDER_IPMP(ill)) {
1093 		if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
1094 			hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE);
1095 			if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) {
1096 				hwaddr = hw_nce->nce_res_mp->b_rptr +
1097 				    NCE_LL_ADDR_OFFSET(ipmp_ill);
1098 				nce_flags |= hw_nce->nce_flags;
1099 			}
1100 			ill_refrele(ipmp_ill);
1101 		}
1102 	}
1103 
1104 	err = ndp_lookup_then_add_v6(ill,
1105 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1106 	    hwaddr,
1107 	    dst,
1108 	    &ipv6_all_ones,
1109 	    &ipv6_all_zeros,
1110 	    0,
1111 	    nce_flags,
1112 	    hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE,
1113 	    &nce);
1114 
1115 	if (hw_nce != NULL)
1116 		NCE_REFRELE(hw_nce);
1117 
1118 	switch (err) {
1119 	case 0:
1120 		/*
1121 		 * New cache entry was created. Make sure that the state
1122 		 * is not ND_INCOMPLETE. It can be in some other state
1123 		 * even before we send out the solicitation as we could
1124 		 * get un-solicited advertisements.
1125 		 *
1126 		 * If this is an XRESOLV interface, simply return 0,
1127 		 * since we don't want to solicit just yet.
1128 		 */
1129 		if (ill->ill_flags & ILLF_XRESOLV) {
1130 			NCE_REFRELE(nce);
1131 			return (0);
1132 		}
1133 
1134 		mutex_enter(&nce->nce_lock);
1135 		if (nce->nce_state != ND_INCOMPLETE) {
1136 			mutex_exit(&nce->nce_lock);
1137 			NCE_REFRELE(nce);
1138 			return (0);
1139 		}
1140 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1141 		if (mp_nce == NULL) {
1142 			/* The caller will free mp */
1143 			mutex_exit(&nce->nce_lock);
1144 			ndp_delete(nce);
1145 			NCE_REFRELE(nce);
1146 			return (ENOMEM);
1147 		}
1148 		if ((ms = nce_solicit(nce, mp_nce)) == 0) {
1149 			/* The caller will free mp */
1150 			if (mp_nce != mp)
1151 				freeb(mp_nce);
1152 			mutex_exit(&nce->nce_lock);
1153 			ndp_delete(nce);
1154 			NCE_REFRELE(nce);
1155 			return (EBUSY);
1156 		}
1157 		mutex_exit(&nce->nce_lock);
1158 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1159 		NCE_REFRELE(nce);
1160 		return (EINPROGRESS);
1161 	case EEXIST:
1162 		/* Resolution in progress just queue the packet */
1163 		mutex_enter(&nce->nce_lock);
1164 		if (nce->nce_state == ND_INCOMPLETE) {
1165 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1166 			if (mp_nce == NULL) {
1167 				err = ENOMEM;
1168 			} else {
1169 				nce_queue_mp(nce, mp_nce);
1170 				err = EINPROGRESS;
1171 			}
1172 		} else {
1173 			/*
1174 			 * Any other state implies we have
1175 			 * a nce but IRE needs to be added ...
1176 			 * ire_add_v6() will take care of the
1177 			 * the case when the nce becomes CONDEMNED
1178 			 * before the ire is added to the table.
1179 			 */
1180 			err = 0;
1181 		}
1182 		mutex_exit(&nce->nce_lock);
1183 		NCE_REFRELE(nce);
1184 		break;
1185 	default:
1186 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1187 		break;
1188 	}
1189 	return (err);
1190 }
1191 
1192 /*
1193  * When there is no resolver, the link layer template is passed in
1194  * the IRE.
1195  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1196  * or one is created, we defer making ire point to nce until the
1197  * ire is actually added at which point the nce_refcnt on the nce is
1198  * incremented.  This is done primarily to have symmetry between ire_add()
1199  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1200  */
1201 int
1202 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1203 {
1204 	nce_t		*nce;
1205 	int		err = 0;
1206 
1207 	ASSERT(ill != NULL);
1208 	ASSERT(ill->ill_isv6);
1209 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1210 		err = nce_set_multicast(ill, dst);
1211 		return (err);
1212 	}
1213 
1214 	err = ndp_lookup_then_add_v6(ill,
1215 	    B_FALSE,	/* NCE fastpath is per ill; don't match across group */
1216 	    NULL,	/* hardware address */
1217 	    dst,
1218 	    &ipv6_all_ones,
1219 	    &ipv6_all_zeros,
1220 	    0,
1221 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1222 	    ND_REACHABLE,
1223 	    &nce);
1224 
1225 	switch (err) {
1226 	case 0:
1227 		/*
1228 		 * Cache entry with a proper resolver cookie was
1229 		 * created.
1230 		 */
1231 		NCE_REFRELE(nce);
1232 		break;
1233 	case EEXIST:
1234 		err = 0;
1235 		NCE_REFRELE(nce);
1236 		break;
1237 	default:
1238 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1239 		break;
1240 	}
1241 	return (err);
1242 }
1243 
1244 /*
1245  * For each interface an entry is added for the unspecified multicast group.
1246  * Here that mapping is used to form the multicast cache entry for a particular
1247  * multicast destination.
1248  */
1249 static int
1250 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1251 {
1252 	nce_t		*mnce;	/* Multicast mapping entry */
1253 	nce_t		*nce;
1254 	uchar_t		*hw_addr = NULL;
1255 	int		err = 0;
1256 	ip_stack_t	*ipst = ill->ill_ipst;
1257 
1258 	ASSERT(ill != NULL);
1259 	ASSERT(ill->ill_isv6);
1260 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1261 
1262 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1263 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1264 	nce = nce_lookup_addr(ill, B_FALSE, dst, nce);
1265 	if (nce != NULL) {
1266 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1267 		NCE_REFRELE(nce);
1268 		return (0);
1269 	}
1270 	/* No entry, now lookup for a mapping this should never fail */
1271 	mnce = nce_lookup_mapping(ill, dst);
1272 	if (mnce == NULL) {
1273 		/* Something broken for the interface. */
1274 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1275 		return (ESRCH);
1276 	}
1277 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1278 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1279 		/*
1280 		 * For IRE_IF_RESOLVER a hardware mapping can be
1281 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1282 		 * in the ill is copied in ndp_add_v6().
1283 		 */
1284 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1285 		if (hw_addr == NULL) {
1286 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1287 			NCE_REFRELE(mnce);
1288 			return (ENOMEM);
1289 		}
1290 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1291 	}
1292 	NCE_REFRELE(mnce);
1293 	/*
1294 	 * IRE_IF_NORESOLVER type simply copies the resolution
1295 	 * cookie passed in.  So no hw_addr is needed.
1296 	 */
1297 	err = ndp_add_v6(ill,
1298 	    hw_addr,
1299 	    dst,
1300 	    &ipv6_all_ones,
1301 	    &ipv6_all_zeros,
1302 	    0,
1303 	    NCE_F_NONUD,
1304 	    ND_REACHABLE,
1305 	    &nce);
1306 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1307 	if (hw_addr != NULL)
1308 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1309 	if (err != 0) {
1310 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1311 		return (err);
1312 	}
1313 	NCE_REFRELE(nce);
1314 	return (0);
1315 }
1316 
1317 /*
1318  * Return the link layer address, and any flags of a nce.
1319  */
1320 int
1321 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1322 {
1323 	nce_t		*nce;
1324 	in6_addr_t	*addr;
1325 	sin6_t		*sin6;
1326 	dl_unitdata_req_t	*dl;
1327 
1328 	ASSERT(ill != NULL && ill->ill_isv6);
1329 	sin6 = (sin6_t *)&lnr->lnr_addr;
1330 	addr =  &sin6->sin6_addr;
1331 
1332 	/*
1333 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1334 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1335 	 * addresses for the data addresses on an IPMP interface even though
1336 	 * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill.
1337 	 */
1338 	nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE);
1339 	if (nce == NULL)
1340 		return (ESRCH);
1341 	/* If in INCOMPLETE state, no link layer address is available yet */
1342 	if (nce->nce_state == ND_INCOMPLETE)
1343 		goto done;
1344 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1345 	if (ill->ill_flags & ILLF_XRESOLV)
1346 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1347 	else
1348 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1349 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1350 	    sizeof (lnr->lnr_hdw_addr));
1351 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1352 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1353 	if (nce->nce_flags & NCE_F_ISROUTER)
1354 		lnr->lnr_flags = NDF_ISROUTER_ON;
1355 	if (nce->nce_flags & NCE_F_ANYCAST)
1356 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1357 done:
1358 	NCE_REFRELE(nce);
1359 	return (0);
1360 }
1361 
1362 /*
1363  * Send Enable/Disable multicast reqs to driver.
1364  */
1365 int
1366 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1367     uint32_t hw_addr_offset, mblk_t *mp)
1368 {
1369 	nce_t		*nce;
1370 	uchar_t		*hw_addr;
1371 	ip_stack_t	*ipst = ill->ill_ipst;
1372 
1373 	ASSERT(ill != NULL && ill->ill_isv6);
1374 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1375 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1376 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1377 		freemsg(mp);
1378 		return (EINVAL);
1379 	}
1380 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1381 	nce = nce_lookup_mapping(ill, addr);
1382 	if (nce == NULL) {
1383 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1384 		freemsg(mp);
1385 		return (ESRCH);
1386 	}
1387 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1388 	/*
1389 	 * Update dl_addr_length and dl_addr_offset for primitives that
1390 	 * have physical addresses as opposed to full saps
1391 	 */
1392 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1393 	case DL_ENABMULTI_REQ:
1394 		/* Track the state if this is the first enabmulti */
1395 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1396 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1397 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1398 		break;
1399 	case DL_DISABMULTI_REQ:
1400 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1401 		break;
1402 	default:
1403 		NCE_REFRELE(nce);
1404 		ip1dbg(("ndp_mcastreq: default\n"));
1405 		return (EINVAL);
1406 	}
1407 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1408 	NCE_REFRELE(nce);
1409 	ill_dlpi_send(ill, mp);
1410 	return (0);
1411 }
1412 
1413 /*
1414  * Send a neighbor solicitation.
1415  * Returns number of milliseconds after which we should either rexmit or abort.
1416  * Return of zero means we should abort.
1417  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1418  *
1419  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1420  * the packet.
1421  * NOTE: This routine does not consume mp.
1422  */
1423 uint32_t
1424 nce_solicit(nce_t *nce, mblk_t *mp)
1425 {
1426 	ip6_t		*ip6h;
1427 	in6_addr_t	sender;
1428 	boolean_t	dropped;
1429 
1430 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1431 
1432 	if (nce->nce_rcnt == 0)
1433 		return (0);
1434 
1435 	if (mp == NULL) {
1436 		ASSERT(nce->nce_qd_mp != NULL);
1437 		mp = nce->nce_qd_mp;
1438 	} else {
1439 		nce_queue_mp(nce, mp);
1440 	}
1441 
1442 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1443 	if (mp->b_datap->db_type == M_CTL)
1444 		mp = mp->b_cont;
1445 
1446 	ip6h = (ip6_t *)mp->b_rptr;
1447 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1448 		/*
1449 		 * This message should have been pulled up already in
1450 		 * ip_wput_v6. We can't do pullups here because the message
1451 		 * could be from the nce_qd_mp which could have b_next/b_prev
1452 		 * non-NULL.
1453 		 */
1454 		ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
1455 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1456 	}
1457 
1458 	/*
1459 	 * Need to copy the sender address into a local since `mp' can
1460 	 * go away once we drop nce_lock.
1461 	 */
1462 	sender = ip6h->ip6_src;
1463 	nce->nce_rcnt--;
1464 	mutex_exit(&nce->nce_lock);
1465 	dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0);
1466 	mutex_enter(&nce->nce_lock);
1467 	if (dropped)
1468 		nce->nce_rcnt++;
1469 	return (nce->nce_ill->ill_reachable_retrans_time);
1470 }
1471 
1472 /*
1473  * Attempt to recover an address on an interface that's been marked as a
1474  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1475  * no easy way to just probe the address and have the right thing happen if
1476  * it's no longer in use.  Instead, we just bring it up normally and allow the
1477  * regular interface start-up logic to probe for a remaining duplicate and take
1478  * us back down if necessary.
1479  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1480  * ip_ndp_excl.
1481  */
1482 /* ARGSUSED */
1483 static void
1484 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1485 {
1486 	ill_t	*ill = rq->q_ptr;
1487 	ipif_t	*ipif;
1488 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1489 
1490 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1491 		/*
1492 		 * We do not support recovery of proxy ARP'd interfaces,
1493 		 * because the system lacks a complete proxy ARP mechanism.
1494 		 */
1495 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1496 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1497 			continue;
1498 		}
1499 
1500 		/*
1501 		 * If we have already recovered or if the interface is going
1502 		 * away, then ignore.
1503 		 */
1504 		mutex_enter(&ill->ill_lock);
1505 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1506 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1507 			mutex_exit(&ill->ill_lock);
1508 			continue;
1509 		}
1510 
1511 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1512 		ill->ill_ipif_dup_count--;
1513 		mutex_exit(&ill->ill_lock);
1514 		ipif->ipif_was_dup = B_TRUE;
1515 
1516 		VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1517 		(void) ipif_up_done_v6(ipif);
1518 	}
1519 	freeb(mp);
1520 }
1521 
1522 /*
1523  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1524  * As long as someone else holds the address, the interface will stay down.
1525  * When that conflict goes away, the interface is brought back up.  This is
1526  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1527  * server will recover from a failure.
1528  *
1529  * For DHCP and temporary addresses, recovery is not done in the kernel.
1530  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1531  *
1532  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1533  */
1534 static void
1535 ipif6_dup_recovery(void *arg)
1536 {
1537 	ipif_t *ipif = arg;
1538 
1539 	ipif->ipif_recovery_id = 0;
1540 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1541 		return;
1542 
1543 	/*
1544 	 * No lock, because this is just an optimization.
1545 	 */
1546 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1547 		return;
1548 
1549 	/* If the link is down, we'll retry this later */
1550 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1551 		return;
1552 
1553 	ndp_do_recovery(ipif);
1554 }
1555 
1556 /*
1557  * Perform interface recovery by forcing the duplicate interfaces up and
1558  * allowing the system to determine which ones should stay up.
1559  *
1560  * Called both by recovery timer expiry and link-up notification.
1561  */
1562 void
1563 ndp_do_recovery(ipif_t *ipif)
1564 {
1565 	ill_t *ill = ipif->ipif_ill;
1566 	mblk_t *mp;
1567 	ip_stack_t *ipst = ill->ill_ipst;
1568 
1569 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1570 	if (mp == NULL) {
1571 		mutex_enter(&ill->ill_lock);
1572 		if (ipif->ipif_recovery_id == 0 &&
1573 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1574 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1575 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1576 		}
1577 		mutex_exit(&ill->ill_lock);
1578 	} else {
1579 		/*
1580 		 * A recovery timer may still be running if we got here from
1581 		 * ill_restart_dad(); cancel that timer.
1582 		 */
1583 		if (ipif->ipif_recovery_id != 0)
1584 			(void) untimeout(ipif->ipif_recovery_id);
1585 		ipif->ipif_recovery_id = 0;
1586 
1587 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1588 		    sizeof (ipif->ipif_v6lcl_addr));
1589 		ill_refhold(ill);
1590 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1591 		    B_FALSE);
1592 	}
1593 }
1594 
1595 /*
1596  * Find the MAC and IP addresses in an NA/NS message.
1597  */
1598 static void
1599 ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp,
1600     uchar_t **haddr, uint_t *haddrlenp)
1601 {
1602 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
1603 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1604 	nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
1605 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1606 	uchar_t *addr;
1607 	int alen = 0;
1608 
1609 	if (dl_mp == NULL) {
1610 		nd_opt_hdr_t *opt = NULL;
1611 		int len;
1612 
1613 		/*
1614 		 * If it's from the fast-path, then it can't be a probe
1615 		 * message, and thus must include a linkaddr option.
1616 		 * Extract that here.
1617 		 */
1618 		switch (icmp6->icmp6_type) {
1619 		case ND_NEIGHBOR_SOLICIT:
1620 			len = mp->b_wptr - (uchar_t *)ns;
1621 			if ((len -= sizeof (*ns)) > 0) {
1622 				opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1),
1623 				    len, ND_OPT_SOURCE_LINKADDR);
1624 			}
1625 			break;
1626 		case ND_NEIGHBOR_ADVERT:
1627 			len = mp->b_wptr - (uchar_t *)na;
1628 			if ((len -= sizeof (*na)) > 0) {
1629 				opt = ndp_get_option((nd_opt_hdr_t *)(na + 1),
1630 				    len, ND_OPT_TARGET_LINKADDR);
1631 			}
1632 			break;
1633 		}
1634 
1635 		if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >=
1636 		    ill->ill_nd_lla_len) {
1637 			addr = (uchar_t *)(opt + 1);
1638 			alen = ill->ill_nd_lla_len;
1639 		}
1640 
1641 		/*
1642 		 * We cheat a bit here for the sake of printing usable log
1643 		 * messages in the rare case where the reply we got was unicast
1644 		 * without a source linkaddr option, and the interface is in
1645 		 * fastpath mode.  (Sigh.)
1646 		 */
1647 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1648 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1649 			struct ether_header *pether;
1650 
1651 			pether = (struct ether_header *)((char *)ip6h -
1652 			    sizeof (*pether));
1653 			addr = pether->ether_shost.ether_addr_octet;
1654 			alen = ETHERADDRL;
1655 		}
1656 	} else {
1657 		dl_unitdata_ind_t *dlu;
1658 
1659 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1660 		alen = dlu->dl_src_addr_length;
1661 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1662 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1663 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1664 			if (ill->ill_sap_length < 0) {
1665 				alen += ill->ill_sap_length;
1666 			} else {
1667 				addr += ill->ill_sap_length;
1668 				alen -= ill->ill_sap_length;
1669 			}
1670 		}
1671 	}
1672 
1673 	if (alen > 0) {
1674 		*haddr = addr;
1675 		*haddrlenp = alen;
1676 	} else {
1677 		*haddr = NULL;
1678 		*haddrlenp = 0;
1679 	}
1680 
1681 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1682 	*targp = ns->nd_ns_target;
1683 }
1684 
1685 /*
1686  * This is for exclusive changes due to NDP duplicate address detection
1687  * failure.
1688  */
1689 /* ARGSUSED */
1690 static void
1691 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1692 {
1693 	ill_t	*ill = rq->q_ptr;
1694 	ipif_t	*ipif;
1695 	mblk_t	*dl_mp = NULL;
1696 	uchar_t	*haddr;
1697 	uint_t	haddrlen;
1698 	ip_stack_t *ipst = ill->ill_ipst;
1699 	in6_addr_t targ;
1700 
1701 	if (DB_TYPE(mp) != M_DATA) {
1702 		dl_mp = mp;
1703 		mp = mp->b_cont;
1704 	}
1705 
1706 	ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1707 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1708 		/*
1709 		 * Ignore conflicts generated by misbehaving switches that
1710 		 * just reflect our own messages back to us.  For IPMP, we may
1711 		 * see reflections across any ill in the illgrp.
1712 		 */
1713 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1714 		    IS_UNDER_IPMP(ill) &&
1715 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL)
1716 			goto ignore_conflict;
1717 	}
1718 
1719 	/*
1720 	 * Look up the appropriate ipif.
1721 	 */
1722 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL,
1723 	    NULL, ipst);
1724 	if (ipif == NULL)
1725 		goto ignore_conflict;
1726 
1727 	/* Reload the ill to match the ipif */
1728 	ill = ipif->ipif_ill;
1729 
1730 	/* If it's already duplicate or ineligible, then don't do anything. */
1731 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1732 		ipif_refrele(ipif);
1733 		goto ignore_conflict;
1734 	}
1735 
1736 	/*
1737 	 * If this is a failure during duplicate recovery, then don't
1738 	 * complain.  It may take a long time to recover.
1739 	 */
1740 	if (!ipif->ipif_was_dup) {
1741 		char ibuf[LIFNAMSIZ];
1742 		char hbuf[MAC_STR_LEN];
1743 		char sbuf[INET6_ADDRSTRLEN];
1744 
1745 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1746 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1747 		    " disabled", ibuf,
1748 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1749 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1750 	}
1751 	mutex_enter(&ill->ill_lock);
1752 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1753 	ipif->ipif_flags |= IPIF_DUPLICATE;
1754 	ill->ill_ipif_dup_count++;
1755 	mutex_exit(&ill->ill_lock);
1756 	(void) ipif_down(ipif, NULL, NULL);
1757 	ipif_down_tail(ipif);
1758 	mutex_enter(&ill->ill_lock);
1759 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1760 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1761 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1762 	    ipst->ips_ip_dup_recovery > 0) {
1763 		ASSERT(ipif->ipif_recovery_id == 0);
1764 		ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1765 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1766 	}
1767 	mutex_exit(&ill->ill_lock);
1768 	ipif_refrele(ipif);
1769 ignore_conflict:
1770 	if (dl_mp != NULL)
1771 		freeb(dl_mp);
1772 	freemsg(mp);
1773 }
1774 
1775 /*
1776  * Handle failure by tearing down the ipifs with the specified address.  Note
1777  * that tearing down the ipif also means deleting the nce through ipif_down, so
1778  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1779  * we start a timer on the ipif.
1780  */
1781 static void
1782 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1783 {
1784 	if ((mp = copymsg(mp)) != NULL) {
1785 		if (dl_mp == NULL)
1786 			dl_mp = mp;
1787 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1788 			dl_mp->b_cont = mp;
1789 		if (dl_mp == NULL) {
1790 			freemsg(mp);
1791 		} else {
1792 			ill_refhold(ill);
1793 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1794 			    B_FALSE);
1795 		}
1796 	}
1797 }
1798 
1799 /*
1800  * Handle a discovered conflict: some other system is advertising that it owns
1801  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1802  * interface.
1803  */
1804 static void
1805 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1806 {
1807 	ipif_t *ipif;
1808 	uint32_t now;
1809 	uint_t maxdefense;
1810 	uint_t defs;
1811 	ip_stack_t *ipst = ill->ill_ipst;
1812 
1813 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1814 	    NULL, NULL, ipst);
1815 	if (ipif == NULL)
1816 		return;
1817 
1818 	/*
1819 	 * First, figure out if this address is disposable.
1820 	 */
1821 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1822 		maxdefense = ipst->ips_ip_max_temp_defend;
1823 	else
1824 		maxdefense = ipst->ips_ip_max_defend;
1825 
1826 	/*
1827 	 * Now figure out how many times we've defended ourselves.  Ignore
1828 	 * defenses that happened long in the past.
1829 	 */
1830 	now = gethrestime_sec();
1831 	mutex_enter(&nce->nce_lock);
1832 	if ((defs = nce->nce_defense_count) > 0 &&
1833 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1834 		nce->nce_defense_count = defs = 0;
1835 	}
1836 	nce->nce_defense_count++;
1837 	nce->nce_defense_time = now;
1838 	mutex_exit(&nce->nce_lock);
1839 	ipif_refrele(ipif);
1840 
1841 	/*
1842 	 * If we've defended ourselves too many times already, then give up and
1843 	 * tear down the interface(s) using this address.  Otherwise, defend by
1844 	 * sending out an unsolicited Neighbor Advertisement.
1845 	 */
1846 	if (defs >= maxdefense) {
1847 		ip_ndp_failure(ill, mp, dl_mp);
1848 	} else {
1849 		char hbuf[MAC_STR_LEN];
1850 		char sbuf[INET6_ADDRSTRLEN];
1851 		uchar_t *haddr;
1852 		uint_t haddrlen;
1853 		in6_addr_t targ;
1854 
1855 		ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
1856 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1857 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)),
1858 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1859 		    ill->ill_name);
1860 
1861 		(void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0);
1862 	}
1863 }
1864 
1865 static void
1866 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1867 {
1868 	nd_neighbor_solicit_t *ns;
1869 	uint32_t	hlen = ill->ill_nd_lla_len;
1870 	uchar_t		*haddr = NULL;
1871 	icmp6_t		*icmp_nd;
1872 	ip6_t		*ip6h;
1873 	nce_t		*our_nce = NULL;
1874 	in6_addr_t	target;
1875 	in6_addr_t	src;
1876 	int		len;
1877 	int		flag = 0;
1878 	nd_opt_hdr_t	*opt = NULL;
1879 	boolean_t	bad_solicit = B_FALSE;
1880 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1881 
1882 	ip6h = (ip6_t *)mp->b_rptr;
1883 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1884 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1885 	src = ip6h->ip6_src;
1886 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1887 	target = ns->nd_ns_target;
1888 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1889 		if (ip_debug > 2) {
1890 			/* ip1dbg */
1891 			pr_addr_dbg("ndp_input_solicit: Target is"
1892 			    " multicast! %s\n", AF_INET6, &target);
1893 		}
1894 		bad_solicit = B_TRUE;
1895 		goto done;
1896 	}
1897 	if (len > sizeof (nd_neighbor_solicit_t)) {
1898 		/* Options present */
1899 		opt = (nd_opt_hdr_t *)&ns[1];
1900 		len -= sizeof (nd_neighbor_solicit_t);
1901 		if (!ndp_verify_optlen(opt, len)) {
1902 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1903 			bad_solicit = B_TRUE;
1904 			goto done;
1905 		}
1906 
1907 	}
1908 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1909 		/* Check to see if this is a valid DAD solicitation */
1910 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1911 			if (ip_debug > 2) {
1912 				/* ip1dbg */
1913 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1914 				    "Destination is not solicited node "
1915 				    "multicast %s\n", AF_INET6,
1916 				    &ip6h->ip6_dst);
1917 			}
1918 			bad_solicit = B_TRUE;
1919 			goto done;
1920 		}
1921 	}
1922 
1923 	/*
1924 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1925 	 * received this packet if it's multicast) is not the ill tied to
1926 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1927 	 * to ensure we find the associated NCE.
1928 	 */
1929 	our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE);
1930 	/*
1931 	 * If this is a valid Solicitation, a permanent
1932 	 * entry should exist in the cache
1933 	 */
1934 	if (our_nce == NULL ||
1935 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1936 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1937 		    "ifname=%s ", ill->ill_name));
1938 		if (ip_debug > 2) {
1939 			/* ip1dbg */
1940 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1941 		}
1942 		bad_solicit = B_TRUE;
1943 		goto done;
1944 	}
1945 
1946 	/* At this point we should have a verified NS per spec */
1947 	if (opt != NULL) {
1948 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1949 		if (opt != NULL) {
1950 			haddr = (uchar_t *)&opt[1];
1951 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1952 			    hlen == 0) {
1953 				ip1dbg(("ndp_input_solicit: bad SLLA\n"));
1954 				bad_solicit = B_TRUE;
1955 				goto done;
1956 			}
1957 		}
1958 	}
1959 
1960 	/* If sending directly to peer, set the unicast flag */
1961 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1962 		flag |= NDP_UNICAST;
1963 
1964 	/*
1965 	 * Create/update the entry for the soliciting node.
1966 	 * or respond to outstanding queries, don't if
1967 	 * the source is unspecified address.
1968 	 */
1969 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1970 		int	err;
1971 		nce_t	*nnce;
1972 
1973 		ASSERT(ill->ill_isv6);
1974 		/*
1975 		 * Regular solicitations *must* include the Source Link-Layer
1976 		 * Address option.  Ignore messages that do not.
1977 		 */
1978 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1979 			ip1dbg(("ndp_input_solicit: source link-layer address "
1980 			    "option missing with a specified source.\n"));
1981 			bad_solicit = B_TRUE;
1982 			goto done;
1983 		}
1984 
1985 		/*
1986 		 * This is a regular solicitation.  If we're still in the
1987 		 * process of verifying the address, then don't respond at all
1988 		 * and don't keep track of the sender.
1989 		 */
1990 		if (our_nce->nce_state == ND_PROBE)
1991 			goto done;
1992 
1993 		/*
1994 		 * If the solicitation doesn't have sender hardware address
1995 		 * (legal for unicast solicitation), then process without
1996 		 * installing the return NCE.  Either we already know it, or
1997 		 * we'll be forced to look it up when (and if) we reply to the
1998 		 * packet.
1999 		 */
2000 		if (haddr == NULL)
2001 			goto no_source;
2002 
2003 		err = ndp_lookup_then_add_v6(ill,
2004 		    B_FALSE,
2005 		    haddr,
2006 		    &src,	/* Soliciting nodes address */
2007 		    &ipv6_all_ones,
2008 		    &ipv6_all_zeros,
2009 		    0,
2010 		    0,
2011 		    ND_STALE,
2012 		    &nnce);
2013 		switch (err) {
2014 		case 0:
2015 			/* done with this entry */
2016 			NCE_REFRELE(nnce);
2017 			break;
2018 		case EEXIST:
2019 			/*
2020 			 * B_FALSE indicates this is not an an advertisement.
2021 			 * B_FALSE indicates this is not an advertisement.
2022 			ndp_process(nnce, haddr, 0, B_FALSE);
2023 			NCE_REFRELE(nnce);
2024 			break;
2025 		default:
2026 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2027 			    err));
2028 			goto done;
2029 		}
2030 no_source:
2031 		flag |= NDP_SOLICITED;
2032 	} else {
2033 		/*
2034 		 * No source link layer address option should be present in a
2035 		 * valid DAD request.
2036 		 */
2037 		if (haddr != NULL) {
2038 			ip1dbg(("ndp_input_solicit: source link-layer address "
2039 			    "option present with an unspecified source.\n"));
2040 			bad_solicit = B_TRUE;
2041 			goto done;
2042 		}
2043 		if (our_nce->nce_state == ND_PROBE) {
2044 			/*
2045 			 * Internally looped-back probes won't have DLPI
2046 			 * attached to them.  External ones (which are sent by
2047 			 * multicast) always will.  Just ignore our own
2048 			 * transmissions.
2049 			 */
2050 			if (dl_mp != NULL) {
2051 				/*
2052 				 * If someone else is probing our address, then
2053 				 * we've crossed wires.  Declare failure.
2054 				 */
2055 				ip_ndp_failure(ill, mp, dl_mp);
2056 			}
2057 			goto done;
2058 		}
2059 		/*
2060 		 * This is a DAD probe.  Multicast the advertisement to the
2061 		 * all-nodes address.
2062 		 */
2063 		src = ipv6_all_hosts_mcast;
2064 	}
2065 	/* Response to a solicitation */
2066 	(void) nce_xmit_advert(our_nce, B_TRUE, &src, flag);
2067 done:
2068 	if (bad_solicit)
2069 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2070 	if (our_nce != NULL)
2071 		NCE_REFRELE(our_nce);
2072 }
2073 
2074 void
2075 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2076 {
2077 	nd_neighbor_advert_t *na;
2078 	uint32_t	hlen = ill->ill_nd_lla_len;
2079 	uchar_t		*haddr = NULL;
2080 	icmp6_t		*icmp_nd;
2081 	ip6_t		*ip6h;
2082 	nce_t		*dst_nce = NULL;
2083 	in6_addr_t	target;
2084 	nd_opt_hdr_t	*opt = NULL;
2085 	int		len;
2086 	ip_stack_t	*ipst = ill->ill_ipst;
2087 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2088 
2089 	ip6h = (ip6_t *)mp->b_rptr;
2090 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2091 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2092 	na = (nd_neighbor_advert_t *)icmp_nd;
2093 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2094 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2095 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2096 		    "solicited flag is not zero\n"));
2097 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2098 		return;
2099 	}
2100 	target = na->nd_na_target;
2101 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2102 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2103 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2104 		return;
2105 	}
2106 	if (len > sizeof (nd_neighbor_advert_t)) {
2107 		opt = (nd_opt_hdr_t *)&na[1];
2108 		if (!ndp_verify_optlen(opt,
2109 		    len - sizeof (nd_neighbor_advert_t))) {
2110 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2111 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2112 			return;
2113 		}
2114 		/* At this point we have a verified NA per spec */
2115 		len -= sizeof (nd_neighbor_advert_t);
2116 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2117 		if (opt != NULL) {
2118 			haddr = (uchar_t *)&opt[1];
2119 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2120 			    hlen == 0) {
2121 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2122 				BUMP_MIB(mib,
2123 				    ipv6IfIcmpInBadNeighborAdvertisements);
2124 				return;
2125 			}
2126 		}
2127 	}
2128 
2129 	/*
2130 	 * NOTE: we match across the illgrp since we need to do DAD for all of
2131 	 * our local addresses, and those are spread across all the active
2132 	 * ills in the group.
2133 	 */
2134 	if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL)
2135 		return;
2136 
2137 	if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2138 		/*
2139 		 * Someone just advertised one of our local addresses.	First,
2140 		 * check it it was us -- if so, we can safely ignore it.
2141 		 */
2142 		if (haddr != NULL) {
2143 			if (!nce_cmp_ll_addr(dst_nce, haddr, hlen))
2144 				goto out;   /* from us -- no conflict */
2145 
2146 			/*
2147 			 * If we're in an IPMP group, check if this is an echo
2148 			 * from another ill in the group.  Use the double-
2149 			 * checked locking pattern to avoid grabbing
2150 			 * ill_g_lock in the non-IPMP case.
2151 			 */
2152 			if (IS_UNDER_IPMP(ill)) {
2153 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2154 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2155 				    ill->ill_grp, haddr, hlen) != NULL) {
2156 					rw_exit(&ipst->ips_ill_g_lock);
2157 					goto out;
2158 				}
2159 				rw_exit(&ipst->ips_ill_g_lock);
2160 			}
2161 		}
2162 
2163 		/*
2164 		 * This appears to be a real conflict.  If we're trying to
2165 		 * configure this NCE (ND_PROBE), then shut it down.
2166 		 * Otherwise, handle the discovered conflict.
2167 		 *
2168 		 * Note that dl_mp might be NULL if we're getting a unicast
2169 		 * reply.  This isn't typically done (multicast is the norm in
2170 		 * response to a probe), but we can handle the dl_mp == NULL
2171 		 * case as well.
2172 		 */
2173 		if (dst_nce->nce_state == ND_PROBE)
2174 			ip_ndp_failure(ill, mp, dl_mp);
2175 		else
2176 			ip_ndp_conflict(ill, mp, dl_mp, dst_nce);
2177 	} else {
2178 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2179 			dst_nce->nce_flags |= NCE_F_ISROUTER;
2180 
2181 		/* B_TRUE indicates this an advertisement */
2182 		ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE);
2183 	}
2184 out:
2185 	NCE_REFRELE(dst_nce);
2186 }
2187 
2188 /*
2189  * Process NDP neighbor solicitation/advertisement messages.
2190  * The checksum has already checked o.k before reaching here.
2191  */
2192 void
2193 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2194 {
2195 	icmp6_t		*icmp_nd;
2196 	ip6_t		*ip6h;
2197 	int		len;
2198 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2199 
2200 
2201 	if (!pullupmsg(mp, -1)) {
2202 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2203 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2204 		goto done;
2205 	}
2206 	ip6h = (ip6_t *)mp->b_rptr;
2207 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2208 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2209 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2210 		goto done;
2211 	}
2212 	/*
2213 	 * NDP does not accept any extension headers between the
2214 	 * IP header and the ICMP header since e.g. a routing
2215 	 * header could be dangerous.
2216 	 * This assumes that any AH or ESP headers are removed
2217 	 * by ip prior to passing the packet to ndp_input.
2218 	 */
2219 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2220 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2221 		    ip6h->ip6_nxt));
2222 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2223 		goto done;
2224 	}
2225 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2226 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2227 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2228 	if (icmp_nd->icmp6_code != 0) {
2229 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2230 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2231 		goto done;
2232 	}
2233 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2234 	/*
2235 	 * Make sure packet length is large enough for either
2236 	 * a NS or a NA icmp packet.
2237 	 */
2238 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2239 		ip1dbg(("ndp_input: packet too short\n"));
2240 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2241 		goto done;
2242 	}
2243 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2244 		ndp_input_solicit(ill, mp, dl_mp);
2245 	} else {
2246 		ndp_input_advert(ill, mp, dl_mp);
2247 	}
2248 done:
2249 	freemsg(mp);
2250 }
2251 
2252 /*
2253  * Utility routine to send an advertisement.  Assumes that the NCE cannot
2254  * go away (e.g., because it's refheld).
2255  */
2256 static boolean_t
2257 nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target,
2258     uint_t flags)
2259 {
2260 	ASSERT((flags & NDP_PROBE) == 0);
2261 
2262 	if (nce->nce_flags & NCE_F_ISROUTER)
2263 		flags |= NDP_ISROUTER;
2264 	if (!(nce->nce_flags & NCE_F_ANYCAST))
2265 		flags |= NDP_ORIDE;
2266 
2267 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla,
2268 	    &nce->nce_addr, target, flags));
2269 }
2270 
2271 /*
2272  * Utility routine to send a solicitation.  Assumes that the NCE cannot
2273  * go away (e.g., because it's refheld).
2274  */
2275 static boolean_t
2276 nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
2277     uint_t flags)
2278 {
2279 	if (flags & NDP_PROBE)
2280 		sender = &ipv6_all_zeros;
2281 
2282 	return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla,
2283 	    sender, &nce->nce_addr, flags));
2284 }
2285 
2286 /*
2287  * nce_xmit is called to form and transmit a ND solicitation or
2288  * advertisement ICMP packet.
2289  *
2290  * If the source address is unspecified and this isn't a probe (used for
2291  * duplicate address detection), an appropriate source address and link layer
2292  * address will be chosen here.  The link layer address option is included if
2293  * the source is specified (i.e., all non-probe packets), and omitted (per the
2294  * specification) otherwise.
2295  *
2296  * It returns B_FALSE only if it does a successful put() to the
2297  * corresponding ill's ill_wq otherwise returns B_TRUE.
2298  */
2299 static boolean_t
2300 nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
2301     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2302 {
2303 	ill_t		*hwaddr_ill;
2304 	uint32_t	len;
2305 	icmp6_t 	*icmp6;
2306 	mblk_t		*mp;
2307 	ip6_t		*ip6h;
2308 	nd_opt_hdr_t	*opt;
2309 	uint_t		plen, maxplen;
2310 	ip6i_t		*ip6i;
2311 	ipif_t		*src_ipif = NULL;
2312 	uint8_t		*hw_addr;
2313 	zoneid_t	zoneid = GLOBAL_ZONEID;
2314 	char		buf[INET6_ADDRSTRLEN];
2315 
2316 	ASSERT(!IS_IPMP(ill));
2317 
2318 	/*
2319 	 * Check that the sender is actually a usable address on `ill', and if
2320 	 * so, track that as the src_ipif.  If not, for solicitations, set the
2321 	 * sender to :: so that a new one will be picked below; for adverts,
2322 	 * drop the packet since we expect nce_xmit_advert() to always provide
2323 	 * a valid sender.
2324 	 */
2325 	if (!IN6_IS_ADDR_UNSPECIFIED(sender)) {
2326 		if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL ||
2327 		    !src_ipif->ipif_addr_ready) {
2328 			if (src_ipif != NULL) {
2329 				ipif_refrele(src_ipif);
2330 				src_ipif = NULL;
2331 			}
2332 			if (type == ND_NEIGHBOR_ADVERT) {
2333 				ip1dbg(("nce_xmit: No source ipif for src %s\n",
2334 				    inet_ntop(AF_INET6, sender, buf,
2335 				    sizeof (buf))));
2336 				return (B_TRUE);
2337 			}
2338 			sender = &ipv6_all_zeros;
2339 		}
2340 	}
2341 
2342 	/*
2343 	 * If we still have an unspecified source (sender) address and this
2344 	 * isn't a probe, select a source address from `ill'.
2345 	 */
2346 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2347 		ASSERT(type != ND_NEIGHBOR_ADVERT);
2348 		/*
2349 		 * Pick a source address for this solicitation, but restrict
2350 		 * the selection to addresses assigned to the output
2351 		 * interface.  We do this because the destination will create
2352 		 * a neighbor cache entry for the source address of this
2353 		 * packet, so the source address needs to be a valid neighbor.
2354 		 */
2355 		src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
2356 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2357 		if (src_ipif == NULL) {
2358 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2359 			    inet_ntop(AF_INET6, target, buf, sizeof (buf))));
2360 			return (B_TRUE);
2361 		}
2362 		sender = &src_ipif->ipif_v6src_addr;
2363 	}
2364 
2365 	/*
2366 	 * We're either sending a probe or we have a source address.
2367 	 */
2368 	ASSERT((flag & NDP_PROBE) || src_ipif != NULL);
2369 
2370 	maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8);
2371 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2372 	    maxplen;
2373 	mp = allocb(len,  BPRI_LO);
2374 	if (mp == NULL) {
2375 		if (src_ipif != NULL)
2376 			ipif_refrele(src_ipif);
2377 		return (B_TRUE);
2378 	}
2379 	bzero((char *)mp->b_rptr, len);
2380 	mp->b_wptr = mp->b_rptr + len;
2381 
2382 	ip6i = (ip6i_t *)mp->b_rptr;
2383 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2384 	ip6i->ip6i_nxt = IPPROTO_RAW;
2385 	ip6i->ip6i_flags = IP6I_HOPLIMIT;
2386 	if (flag & NDP_PROBE)
2387 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2388 
2389 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2390 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2391 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2392 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2393 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2394 	ip6h->ip6_src = *sender;
2395 	ip6h->ip6_dst = *target;
2396 	icmp6 = (icmp6_t *)&ip6h[1];
2397 
2398 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2399 	    sizeof (nd_neighbor_advert_t));
2400 
2401 	if (type == ND_NEIGHBOR_SOLICIT) {
2402 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2403 
2404 		if (!(flag & NDP_PROBE))
2405 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2406 		ns->nd_ns_target = *target;
2407 		if (!(flag & NDP_UNICAST)) {
2408 			/* Form multicast address of the target */
2409 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2410 			ip6h->ip6_dst.s6_addr32[3] |=
2411 			    ns->nd_ns_target.s6_addr32[3];
2412 		}
2413 	} else {
2414 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2415 
2416 		ASSERT(!(flag & NDP_PROBE));
2417 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2418 		na->nd_na_target = *sender;
2419 		if (flag & NDP_ISROUTER)
2420 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2421 		if (flag & NDP_SOLICITED)
2422 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2423 		if (flag & NDP_ORIDE)
2424 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2425 	}
2426 
2427 	hw_addr = NULL;
2428 	if (!(flag & NDP_PROBE)) {
2429 		/*
2430 		 * Use our source address to find the hardware address to put
2431 		 * in the packet, so that the hardware address and IP address
2432 		 * will match up -- even if that hardware address doesn't
2433 		 * match the ill we actually transmit the packet through.
2434 		 */
2435 		if (IS_IPMP(src_ipif->ipif_ill)) {
2436 			hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif);
2437 			if (hwaddr_ill == NULL) {
2438 				ip1dbg(("nce_xmit: no bound ill!\n"));
2439 				ipif_refrele(src_ipif);
2440 				freemsg(mp);
2441 				return (B_TRUE);
2442 			}
2443 		} else {
2444 			hwaddr_ill = src_ipif->ipif_ill;
2445 			ill_refhold(hwaddr_ill);	/* for symmetry */
2446 		}
2447 
2448 		plen = roundup(sizeof (nd_opt_hdr_t) +
2449 		    hwaddr_ill->ill_nd_lla_len, 8);
2450 
2451 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2452 		    hwaddr_ill->ill_phys_addr;
2453 		if (hw_addr != NULL) {
2454 			/* Fill in link layer address and option len */
2455 			opt->nd_opt_len = (uint8_t)(plen / 8);
2456 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2457 		}
2458 
2459 		ill_refrele(hwaddr_ill);
2460 	}
2461 
2462 	if (hw_addr == NULL)
2463 		plen = 0;
2464 
2465 	/* Fix up the length of the packet now that plen is known */
2466 	len -= (maxplen - plen);
2467 	mp->b_wptr = mp->b_rptr + len;
2468 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2469 
2470 	icmp6->icmp6_type = type;
2471 	icmp6->icmp6_code = 0;
2472 	/*
2473 	 * Prepare for checksum by putting icmp length in the icmp
2474 	 * checksum field. The checksum is calculated in ip_wput_v6.
2475 	 */
2476 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2477 
2478 	/*
2479 	 * Before we toss the src_ipif, look up the zoneid to pass to
2480 	 * ip_output_v6().  This is to ensure unicast ND_NEIGHBOR_ADVERT
2481 	 * packets to be routed correctly by IP (we cannot guarantee that the
2482 	 * global zone has an interface route to the destination).
2483 	 */
2484 	if (src_ipif != NULL) {
2485 		if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES)
2486 			zoneid = GLOBAL_ZONEID;
2487 		ipif_refrele(src_ipif);
2488 	}
2489 
2490 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2491 	return (B_FALSE);
2492 }
2493 
2494 /*
2495  * Make a link layer address (does not include the SAP) from an nce.
2496  * To form the link layer address, use the last four bytes of ipv6
2497  * address passed in and the fixed offset stored in nce.
2498  */
2499 static void
2500 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2501 {
2502 	uchar_t *mask, *to;
2503 	ill_t	*ill = nce->nce_ill;
2504 	int 	len;
2505 
2506 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2507 		return;
2508 	ASSERT(nce->nce_res_mp != NULL);
2509 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2510 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2511 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2512 	ASSERT(addr != NULL);
2513 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2514 	    addrpos, ill->ill_nd_lla_len);
2515 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2516 	    IPV6_ADDR_LEN);
2517 	mask = (uchar_t *)&nce->nce_extract_mask;
2518 	mask += (IPV6_ADDR_LEN - len);
2519 	addr += (IPV6_ADDR_LEN - len);
2520 	to = addrpos + nce->nce_ll_extract_start;
2521 	while (len-- > 0)
2522 		*to++ |= *mask++ & *addr++;
2523 }
2524 
2525 mblk_t *
2526 nce_udreq_alloc(ill_t *ill)
2527 {
2528 	mblk_t	*template_mp = NULL;
2529 	dl_unitdata_req_t *dlur;
2530 	int	sap_length;
2531 
2532 	ASSERT(ill->ill_isv6);
2533 
2534 	sap_length = ill->ill_sap_length;
2535 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2536 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2537 	if (template_mp == NULL)
2538 		return (NULL);
2539 
2540 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2541 	dlur->dl_priority.dl_min = 0;
2542 	dlur->dl_priority.dl_max = 0;
2543 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2544 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2545 
2546 	/* Copy in the SAP value. */
2547 	NCE_LL_SAP_COPY(ill, template_mp);
2548 
2549 	return (template_mp);
2550 }
2551 
2552 /*
2553  * NDP retransmit timer.
2554  * This timer goes off when:
2555  * a. It is time to retransmit NS for resolver.
2556  * b. It is time to send reachability probes.
2557  */
2558 void
2559 ndp_timer(void *arg)
2560 {
2561 	nce_t		*nce = arg;
2562 	ill_t		*ill = nce->nce_ill;
2563 	uint32_t	ms;
2564 	char		addrbuf[INET6_ADDRSTRLEN];
2565 	boolean_t	dropped = B_FALSE;
2566 	ip_stack_t	*ipst = ill->ill_ipst;
2567 
2568 	/*
2569 	 * The timer has to be cancelled by ndp_delete before doing the final
2570 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2571 	 * until it clears the timeout_id. Before clearing the timeout_id
2572 	 * bump up the refcnt so that we can continue to use the nce
2573 	 */
2574 	ASSERT(nce != NULL);
2575 
2576 	mutex_enter(&nce->nce_lock);
2577 	NCE_REFHOLD_LOCKED(nce);
2578 	nce->nce_timeout_id = 0;
2579 
2580 	/*
2581 	 * Check the reachability state first.
2582 	 */
2583 	switch (nce->nce_state) {
2584 	case ND_DELAY:
2585 		nce->nce_state = ND_PROBE;
2586 		mutex_exit(&nce->nce_lock);
2587 		(void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros,
2588 		    NDP_UNICAST);
2589 		if (ip_debug > 3) {
2590 			/* ip2dbg */
2591 			pr_addr_dbg("ndp_timer: state for %s changed "
2592 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2593 		}
2594 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2595 		NCE_REFRELE(nce);
2596 		return;
2597 	case ND_PROBE:
2598 		/* must be retransmit timer */
2599 		nce->nce_pcnt--;
2600 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2601 		    nce->nce_pcnt >= -1);
2602 		if (nce->nce_pcnt > 0) {
2603 			/*
2604 			 * As per RFC2461, the nce gets deleted after
2605 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2606 			 * Note that the first unicast solicitation is sent
2607 			 * during the DELAY state.
2608 			 */
2609 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2610 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2611 			    addrbuf, sizeof (addrbuf))));
2612 			mutex_exit(&nce->nce_lock);
2613 			dropped = nce_xmit_solicit(nce, B_FALSE,
2614 			    &ipv6_all_zeros,
2615 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2616 			    NDP_UNICAST);
2617 			if (dropped) {
2618 				mutex_enter(&nce->nce_lock);
2619 				nce->nce_pcnt++;
2620 				mutex_exit(&nce->nce_lock);
2621 			}
2622 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2623 		} else if (nce->nce_pcnt < 0) {
2624 			/* No hope, delete the nce */
2625 			nce->nce_state = ND_UNREACHABLE;
2626 			mutex_exit(&nce->nce_lock);
2627 			if (ip_debug > 2) {
2628 				/* ip1dbg */
2629 				pr_addr_dbg("ndp_timer: Delete IRE for"
2630 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2631 			}
2632 			ndp_delete(nce);
2633 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2634 			/* Wait RetransTimer, before deleting the entry */
2635 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2636 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2637 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2638 			mutex_exit(&nce->nce_lock);
2639 			/* Wait one interval before killing */
2640 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2641 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2642 			ipif_t *ipif;
2643 
2644 			/*
2645 			 * We're done probing, and we can now declare this
2646 			 * address to be usable.  Let IP know that it's ok to
2647 			 * use.
2648 			 */
2649 			nce->nce_state = ND_REACHABLE;
2650 			mutex_exit(&nce->nce_lock);
2651 			ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr,
2652 			    nce->nce_ill);
2653 			if (ipif != NULL) {
2654 				if (ipif->ipif_was_dup) {
2655 					char ibuf[LIFNAMSIZ + 10];
2656 					char sbuf[INET6_ADDRSTRLEN];
2657 
2658 					ipif->ipif_was_dup = B_FALSE;
2659 					(void) inet_ntop(AF_INET6,
2660 					    &ipif->ipif_v6lcl_addr,
2661 					    sbuf, sizeof (sbuf));
2662 					ipif_get_name(ipif, ibuf,
2663 					    sizeof (ibuf));
2664 					cmn_err(CE_NOTE, "recovered address "
2665 					    "%s on %s", sbuf, ibuf);
2666 				}
2667 				if ((ipif->ipif_flags & IPIF_UP) &&
2668 				    !ipif->ipif_addr_ready)
2669 					ipif_up_notify(ipif);
2670 				ipif->ipif_addr_ready = 1;
2671 				ipif_refrele(ipif);
2672 			}
2673 			/* Begin defending our new address */
2674 			nce->nce_unsolicit_count = 0;
2675 			dropped = nce_xmit_advert(nce, B_FALSE,
2676 			    &ipv6_all_hosts_mcast, 0);
2677 			if (dropped) {
2678 				nce->nce_unsolicit_count = 1;
2679 				NDP_RESTART_TIMER(nce,
2680 				    ipst->ips_ip_ndp_unsolicit_interval);
2681 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2682 				NDP_RESTART_TIMER(nce,
2683 				    ipst->ips_ip_ndp_defense_interval);
2684 			}
2685 		} else {
2686 			/*
2687 			 * This is an address we're probing to be our own, but
2688 			 * the ill is down.  Wait until it comes back before
2689 			 * doing anything, but switch to reachable state so
2690 			 * that the restart will work.
2691 			 */
2692 			nce->nce_state = ND_REACHABLE;
2693 			mutex_exit(&nce->nce_lock);
2694 		}
2695 		NCE_REFRELE(nce);
2696 		return;
2697 	case ND_INCOMPLETE: {
2698 		ip6_t	*ip6h;
2699 		ip6i_t	*ip6i;
2700 		mblk_t	*mp, *datamp, *nextmp, **prevmpp;
2701 
2702 		/*
2703 		 * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp
2704 		 * for any IPMP probe packets, and toss 'em.  IPMP probe
2705 		 * packets will always be at the head of nce_qd_mp and always
2706 		 * have an ip6i_t header, so we can stop at the first queued
2707 		 * ND packet without an ip6i_t.
2708 		 */
2709 		prevmpp = &nce->nce_qd_mp;
2710 		for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) {
2711 			nextmp = mp->b_next;
2712 			datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp;
2713 			ip6h = (ip6_t *)datamp->b_rptr;
2714 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2715 				break;
2716 
2717 			ip6i = (ip6i_t *)ip6h;
2718 			if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) {
2719 				inet_freemsg(mp);
2720 				*prevmpp = nextmp;
2721 			} else {
2722 				prevmpp = &mp->b_next;
2723 			}
2724 		}
2725 
2726 		/*
2727 		 * Must be resolver's retransmit timer.
2728 		 */
2729 		if (nce->nce_qd_mp != NULL) {
2730 			if ((ms = nce_solicit(nce, NULL)) == 0) {
2731 				if (nce->nce_state != ND_REACHABLE) {
2732 					mutex_exit(&nce->nce_lock);
2733 					nce_resolv_failed(nce);
2734 					ndp_delete(nce);
2735 				} else {
2736 					mutex_exit(&nce->nce_lock);
2737 				}
2738 			} else {
2739 				mutex_exit(&nce->nce_lock);
2740 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2741 			}
2742 			NCE_REFRELE(nce);
2743 			return;
2744 		}
2745 		mutex_exit(&nce->nce_lock);
2746 		NCE_REFRELE(nce);
2747 		break;
2748 	}
2749 	case ND_REACHABLE:
2750 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2751 		    nce->nce_unsolicit_count != 0) ||
2752 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2753 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2754 			if (nce->nce_unsolicit_count > 0)
2755 				nce->nce_unsolicit_count--;
2756 			mutex_exit(&nce->nce_lock);
2757 			dropped = nce_xmit_advert(nce, B_FALSE,
2758 			    &ipv6_all_hosts_mcast, 0);
2759 			if (dropped) {
2760 				mutex_enter(&nce->nce_lock);
2761 				nce->nce_unsolicit_count++;
2762 				mutex_exit(&nce->nce_lock);
2763 			}
2764 			if (nce->nce_unsolicit_count != 0) {
2765 				NDP_RESTART_TIMER(nce,
2766 				    ipst->ips_ip_ndp_unsolicit_interval);
2767 			} else {
2768 				NDP_RESTART_TIMER(nce,
2769 				    ipst->ips_ip_ndp_defense_interval);
2770 			}
2771 		} else {
2772 			mutex_exit(&nce->nce_lock);
2773 		}
2774 		NCE_REFRELE(nce);
2775 		break;
2776 	default:
2777 		mutex_exit(&nce->nce_lock);
2778 		NCE_REFRELE(nce);
2779 		break;
2780 	}
2781 }
2782 
2783 /*
2784  * Set a link layer address from the ll_addr passed in.
2785  * Copy SAP from ill.
2786  */
2787 static void
2788 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2789 {
2790 	ill_t	*ill = nce->nce_ill;
2791 	uchar_t	*woffset;
2792 
2793 	ASSERT(ll_addr != NULL);
2794 	/* Always called before fast_path_probe */
2795 	ASSERT(nce->nce_fp_mp == NULL);
2796 	if (ill->ill_sap_length != 0) {
2797 		/*
2798 		 * Copy the SAP type specified in the
2799 		 * request into the xmit template.
2800 		 */
2801 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2802 	}
2803 	if (ill->ill_phys_addr_length > 0) {
2804 		/*
2805 		 * The bcopy() below used to be called for the physical address
2806 		 * length rather than the link layer address length. For
2807 		 * ethernet and many other media, the phys_addr and lla are
2808 		 * identical.
2809 		 * However, with xresolv interfaces being introduced, the
2810 		 * phys_addr and lla are no longer the same, and the physical
2811 		 * address may not have any useful meaning, so we use the lla
2812 		 * for IPv6 address resolution and destination addressing.
2813 		 *
2814 		 * For PPP or other interfaces with a zero length
2815 		 * physical address, don't do anything here.
2816 		 * The bcopy() with a zero phys_addr length was previously
2817 		 * a no-op for interfaces with a zero-length physical address.
2818 		 * Using the lla for them would change the way they operate.
2819 		 * Doing nothing in such cases preserves expected behavior.
2820 		 */
2821 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2822 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2823 	}
2824 }
2825 
2826 static boolean_t
2827 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2828 {
2829 	ill_t	*ill = nce->nce_ill;
2830 	uchar_t	*ll_offset;
2831 
2832 	ASSERT(nce->nce_res_mp != NULL);
2833 	if (ll_addr == NULL)
2834 		return (B_FALSE);
2835 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2836 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2837 		return (B_TRUE);
2838 	return (B_FALSE);
2839 }
2840 
2841 /*
2842  * Updates the link layer address or the reachability state of
2843  * a cache entry.  Reset probe counter if needed.
2844  */
2845 static void
2846 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2847 {
2848 	ill_t	*ill = nce->nce_ill;
2849 	boolean_t need_stop_timer = B_FALSE;
2850 	boolean_t need_fastpath_update = B_FALSE;
2851 
2852 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2853 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2854 	/*
2855 	 * If this interface does not do NUD, there is no point
2856 	 * in allowing an update to the cache entry.  Although
2857 	 * we will respond to NS.
2858 	 * The only time we accept an update for a resolver when
2859 	 * NUD is turned off is when it has just been created.
2860 	 * Non-Resolvers will always be created as REACHABLE.
2861 	 */
2862 	if (new_state != ND_UNCHANGED) {
2863 		if ((nce->nce_flags & NCE_F_NONUD) &&
2864 		    (nce->nce_state != ND_INCOMPLETE))
2865 			return;
2866 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2867 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2868 		need_stop_timer = B_TRUE;
2869 		if (new_state == ND_REACHABLE)
2870 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2871 		else {
2872 			/* We force NUD in this case */
2873 			nce->nce_last = 0;
2874 		}
2875 		nce->nce_state = new_state;
2876 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2877 	}
2878 	/*
2879 	 * In case of fast path we need to free the the fastpath
2880 	 * M_DATA and do another probe.  Otherwise we can just
2881 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2882 	 * whatever packets that happens to be transmitting at the time.
2883 	 */
2884 	if (new_ll_addr != NULL) {
2885 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2886 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2887 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2888 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2889 		if (nce->nce_fp_mp != NULL) {
2890 			freemsg(nce->nce_fp_mp);
2891 			nce->nce_fp_mp = NULL;
2892 		}
2893 		need_fastpath_update = B_TRUE;
2894 	}
2895 	mutex_exit(&nce->nce_lock);
2896 	if (need_stop_timer) {
2897 		(void) untimeout(nce->nce_timeout_id);
2898 		nce->nce_timeout_id = 0;
2899 	}
2900 	if (need_fastpath_update)
2901 		nce_fastpath(nce);
2902 	mutex_enter(&nce->nce_lock);
2903 }
2904 
2905 void
2906 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2907 {
2908 	uint_t	count = 0;
2909 	mblk_t  **mpp, *tmp;
2910 
2911 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2912 
2913 	for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2914 		if (++count > nce->nce_ill->ill_max_buf) {
2915 			tmp = nce->nce_qd_mp->b_next;
2916 			nce->nce_qd_mp->b_next = NULL;
2917 			nce->nce_qd_mp->b_prev = NULL;
2918 			freemsg(nce->nce_qd_mp);
2919 			nce->nce_qd_mp = tmp;
2920 		}
2921 	}
2922 
2923 	if (head_insert) {
2924 		mp->b_next = nce->nce_qd_mp;
2925 		nce->nce_qd_mp = mp;
2926 	} else {
2927 		*mpp = mp;
2928 	}
2929 }
2930 
2931 static void
2932 nce_queue_mp(nce_t *nce, mblk_t *mp)
2933 {
2934 	boolean_t head_insert = B_FALSE;
2935 	ip6_t	*ip6h;
2936 	ip6i_t  *ip6i;
2937 	mblk_t	*data_mp;
2938 
2939 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2940 
2941 	if (mp->b_datap->db_type == M_CTL)
2942 		data_mp = mp->b_cont;
2943 	else
2944 		data_mp = mp;
2945 	ip6h = (ip6_t *)data_mp->b_rptr;
2946 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2947 		/*
2948 		 * This message should have been pulled up already in
2949 		 * ip_wput_v6. We can't do pullups here because the message
2950 		 * could be from the nce_qd_mp which could have b_next/b_prev
2951 		 * non-NULL.
2952 		 */
2953 		ip6i = (ip6i_t *)ip6h;
2954 		ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
2955 
2956 		/*
2957 		 * If this packet is marked IP6I_IPMP_PROBE, then we need to:
2958 		 *
2959 		 *   1. Insert it at the head of the nce_qd_mp list.  Consider
2960 		 *	the normal (non-probe) load-speading case where the
2961 		 *	source address of the ND packet is not tied to nce_ill.
2962 		 *	If the ill bound to the source address cannot receive,
2963 		 *	the response to the ND packet will not be received.
2964 		 *	However, if ND packets for nce_ill's probes are queued
2965 		 *	behind that ND packet, those probes will also fail to
2966 		 *	be sent, and thus in.mpathd will erroneously conclude
2967 		 *	that nce_ill has also failed.
2968 		 *
2969 		 *   2. Drop the probe packet in ndp_timer() if the ND did
2970 		 *	not succeed on the first attempt.  This ensures that
2971 		 *	ND problems do not manifest as probe RTT spikes.
2972 		 */
2973 		if (ip6i->ip6i_flags & IP6I_IPMP_PROBE)
2974 			head_insert = B_TRUE;
2975 	}
2976 	nce_queue_mp_common(nce, mp, head_insert);
2977 }
2978 
2979 /*
2980  * Called when address resolution failed due to a timeout.
2981  * Send an ICMP unreachable in response to all queued packets.
2982  */
2983 void
2984 nce_resolv_failed(nce_t *nce)
2985 {
2986 	mblk_t	*mp, *nxt_mp, *first_mp;
2987 	char	buf[INET6_ADDRSTRLEN];
2988 	ip6_t *ip6h;
2989 	zoneid_t zoneid = GLOBAL_ZONEID;
2990 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
2991 
2992 	ip1dbg(("nce_resolv_failed: dst %s\n",
2993 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
2994 	mutex_enter(&nce->nce_lock);
2995 	mp = nce->nce_qd_mp;
2996 	nce->nce_qd_mp = NULL;
2997 	mutex_exit(&nce->nce_lock);
2998 	while (mp != NULL) {
2999 		nxt_mp = mp->b_next;
3000 		mp->b_next = NULL;
3001 		mp->b_prev = NULL;
3002 
3003 		first_mp = mp;
3004 		if (mp->b_datap->db_type == M_CTL) {
3005 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3006 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3007 			zoneid = io->ipsec_out_zoneid;
3008 			ASSERT(zoneid != ALL_ZONES);
3009 			mp = mp->b_cont;
3010 			mp->b_next = NULL;
3011 			mp->b_prev = NULL;
3012 		}
3013 
3014 		ip6h = (ip6_t *)mp->b_rptr;
3015 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3016 			ip6i_t *ip6i;
3017 			/*
3018 			 * This message should have been pulled up already
3019 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3020 			 * the header is pulled up.
3021 			 */
3022 			ip6i = (ip6i_t *)ip6h;
3023 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3024 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3025 			mp->b_rptr += sizeof (ip6i_t);
3026 		}
3027 		/*
3028 		 * Ignore failure since icmp_unreachable_v6 will silently
3029 		 * drop packets with an unspecified source address.
3030 		 */
3031 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
3032 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3033 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
3034 		mp = nxt_mp;
3035 	}
3036 }
3037 
3038 /*
3039  * Called by SIOCSNDP* ioctl to add/change an nce entry
3040  * and the corresponding attributes.
3041  * Disallow states other than ND_REACHABLE or ND_STALE.
3042  */
3043 int
3044 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3045 {
3046 	sin6_t		*sin6;
3047 	in6_addr_t	*addr;
3048 	nce_t		*nce;
3049 	int		err;
3050 	uint16_t	new_flags = 0;
3051 	uint16_t	old_flags = 0;
3052 	int		inflags = lnr->lnr_flags;
3053 	ip_stack_t	*ipst = ill->ill_ipst;
3054 
3055 	ASSERT(ill->ill_isv6);
3056 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3057 	    (lnr->lnr_state_create != ND_STALE))
3058 		return (EINVAL);
3059 
3060 	if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN)
3061 		return (EINVAL);
3062 
3063 	sin6 = (sin6_t *)&lnr->lnr_addr;
3064 	addr = &sin6->sin6_addr;
3065 
3066 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3067 	/* We know it can not be mapping so just look in the hash table */
3068 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
3069 	/* See comment in ndp_query() regarding IS_IPMP(ill) usage */
3070 	nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce);
3071 	if (nce != NULL)
3072 		new_flags = nce->nce_flags;
3073 
3074 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3075 	case NDF_ISROUTER_ON:
3076 		new_flags |= NCE_F_ISROUTER;
3077 		break;
3078 	case NDF_ISROUTER_OFF:
3079 		new_flags &= ~NCE_F_ISROUTER;
3080 		break;
3081 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3082 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3083 		if (nce != NULL)
3084 			NCE_REFRELE(nce);
3085 		return (EINVAL);
3086 	}
3087 
3088 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3089 	case NDF_ANYCAST_ON:
3090 		new_flags |= NCE_F_ANYCAST;
3091 		break;
3092 	case NDF_ANYCAST_OFF:
3093 		new_flags &= ~NCE_F_ANYCAST;
3094 		break;
3095 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3096 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3097 		if (nce != NULL)
3098 			NCE_REFRELE(nce);
3099 		return (EINVAL);
3100 	}
3101 
3102 	if (nce == NULL) {
3103 		err = ndp_add_v6(ill,
3104 		    (uchar_t *)lnr->lnr_hdw_addr,
3105 		    addr,
3106 		    &ipv6_all_ones,
3107 		    &ipv6_all_zeros,
3108 		    0,
3109 		    new_flags,
3110 		    lnr->lnr_state_create,
3111 		    &nce);
3112 		if (err != 0) {
3113 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3114 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3115 			return (err);
3116 		}
3117 	}
3118 	old_flags = nce->nce_flags;
3119 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3120 		/*
3121 		 * Router turned to host, delete all ires.
3122 		 * XXX Just delete the entry, but we need to add too.
3123 		 */
3124 		nce->nce_flags &= ~NCE_F_ISROUTER;
3125 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3126 		ndp_delete(nce);
3127 		NCE_REFRELE(nce);
3128 		return (0);
3129 	}
3130 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3131 
3132 	mutex_enter(&nce->nce_lock);
3133 	nce->nce_flags = new_flags;
3134 	mutex_exit(&nce->nce_lock);
3135 	/*
3136 	 * Note that we ignore the state at this point, which
3137 	 * should be either STALE or REACHABLE.  Instead we let
3138 	 * the link layer address passed in to determine the state
3139 	 * much like incoming packets.
3140 	 */
3141 	nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3142 	NCE_REFRELE(nce);
3143 	return (0);
3144 }
3145 
3146 /*
3147  * If the device driver supports it, we make nce_fp_mp to have
3148  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3149  * The caller ensures there is hold on nce for this function.
3150  * Note that since ill_fastpath_probe() copies the mblk there is
3151  * no need for the hold beyond this function.
3152  */
3153 void
3154 nce_fastpath(nce_t *nce)
3155 {
3156 	ill_t	*ill = nce->nce_ill;
3157 	int res;
3158 
3159 	ASSERT(ill != NULL);
3160 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3161 
3162 	if (nce->nce_fp_mp != NULL) {
3163 		/* Already contains fastpath info */
3164 		return;
3165 	}
3166 	if (nce->nce_res_mp != NULL) {
3167 		nce_fastpath_list_add(nce);
3168 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3169 		/*
3170 		 * EAGAIN is an indication of a transient error
3171 		 * i.e. allocation failure etc. leave the nce in the list it
3172 		 * will be updated when another probe happens for another ire
3173 		 * if not it will be taken out of the list when the ire is
3174 		 * deleted.
3175 		 */
3176 
3177 		if (res != 0 && res != EAGAIN)
3178 			nce_fastpath_list_delete(nce);
3179 	}
3180 }
3181 
3182 /*
3183  * Drain the list of nce's waiting for fastpath response.
3184  */
3185 void
3186 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3187     void *arg)
3188 {
3189 
3190 	nce_t *next_nce;
3191 	nce_t *current_nce;
3192 	nce_t *first_nce;
3193 	nce_t *prev_nce = NULL;
3194 
3195 	mutex_enter(&ill->ill_lock);
3196 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3197 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3198 		next_nce = current_nce->nce_fastpath;
3199 		/*
3200 		 * Take it off the list if we're flushing, or if the callback
3201 		 * routine tells us to do so.  Otherwise, leave the nce in the
3202 		 * fastpath list to handle any pending response from the lower
3203 		 * layer.  We can't drain the list when the callback routine
3204 		 * comparison failed, because the response is asynchronous in
3205 		 * nature, and may not arrive in the same order as the list
3206 		 * insertion.
3207 		 */
3208 		if (func == NULL || func(current_nce, arg)) {
3209 			current_nce->nce_fastpath = NULL;
3210 			if (current_nce == first_nce)
3211 				ill->ill_fastpath_list = first_nce = next_nce;
3212 			else
3213 				prev_nce->nce_fastpath = next_nce;
3214 		} else {
3215 			/* previous element that is still in the list */
3216 			prev_nce = current_nce;
3217 		}
3218 		current_nce = next_nce;
3219 	}
3220 	mutex_exit(&ill->ill_lock);
3221 }
3222 
3223 /*
3224  * Add nce to the nce fastpath list.
3225  */
3226 void
3227 nce_fastpath_list_add(nce_t *nce)
3228 {
3229 	ill_t *ill;
3230 
3231 	ill = nce->nce_ill;
3232 
3233 	mutex_enter(&ill->ill_lock);
3234 	mutex_enter(&nce->nce_lock);
3235 
3236 	/*
3237 	 * if nce has not been deleted and
3238 	 * is not already in the list add it.
3239 	 */
3240 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3241 	    (nce->nce_fastpath == NULL)) {
3242 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3243 		ill->ill_fastpath_list = nce;
3244 	}
3245 
3246 	mutex_exit(&nce->nce_lock);
3247 	mutex_exit(&ill->ill_lock);
3248 }
3249 
3250 /*
3251  * remove nce from the nce fastpath list.
3252  */
3253 void
3254 nce_fastpath_list_delete(nce_t *nce)
3255 {
3256 	nce_t *nce_ptr;
3257 
3258 	ill_t *ill;
3259 
3260 	ill = nce->nce_ill;
3261 	ASSERT(ill != NULL);
3262 
3263 	mutex_enter(&ill->ill_lock);
3264 	if (nce->nce_fastpath == NULL)
3265 		goto done;
3266 
3267 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3268 
3269 	if (ill->ill_fastpath_list == nce) {
3270 		ill->ill_fastpath_list = nce->nce_fastpath;
3271 	} else {
3272 		nce_ptr = ill->ill_fastpath_list;
3273 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3274 			if (nce_ptr->nce_fastpath == nce) {
3275 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3276 				break;
3277 			}
3278 			nce_ptr = nce_ptr->nce_fastpath;
3279 		}
3280 	}
3281 
3282 	nce->nce_fastpath = NULL;
3283 done:
3284 	mutex_exit(&ill->ill_lock);
3285 }
3286 
3287 /*
3288  * Update all NCE's that are not in fastpath mode and
3289  * have an nce_fp_mp that matches mp. mp->b_cont contains
3290  * the fastpath header.
3291  *
3292  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3293  */
3294 boolean_t
3295 ndp_fastpath_update(nce_t *nce, void *arg)
3296 {
3297 	mblk_t 	*mp, *fp_mp;
3298 	uchar_t	*mp_rptr, *ud_mp_rptr;
3299 	mblk_t	*ud_mp = nce->nce_res_mp;
3300 	ptrdiff_t	cmplen;
3301 
3302 	if (nce->nce_flags & NCE_F_MAPPING)
3303 		return (B_TRUE);
3304 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3305 		return (B_TRUE);
3306 
3307 	ip2dbg(("ndp_fastpath_update: trying\n"));
3308 	mp = (mblk_t *)arg;
3309 	mp_rptr = mp->b_rptr;
3310 	cmplen = mp->b_wptr - mp_rptr;
3311 	ASSERT(cmplen >= 0);
3312 	ud_mp_rptr = ud_mp->b_rptr;
3313 	/*
3314 	 * The nce is locked here to prevent any other threads
3315 	 * from accessing and changing nce_res_mp when the IPv6 address
3316 	 * becomes resolved to an lla while we're in the middle
3317 	 * of looking at and comparing the hardware address (lla).
3318 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3319 	 * from examining nce_res_mp atthe same time.
3320 	 */
3321 	mutex_enter(&nce->nce_lock);
3322 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3323 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3324 		mutex_exit(&nce->nce_lock);
3325 		/*
3326 		 * Don't take the ire off the fastpath list yet,
3327 		 * since the response may come later.
3328 		 */
3329 		return (B_FALSE);
3330 	}
3331 	/* Matched - install mp as the fastpath mp */
3332 	ip1dbg(("ndp_fastpath_update: match\n"));
3333 	fp_mp = dupb(mp->b_cont);
3334 	if (fp_mp != NULL) {
3335 		nce->nce_fp_mp = fp_mp;
3336 	}
3337 	mutex_exit(&nce->nce_lock);
3338 	return (B_TRUE);
3339 }
3340 
3341 /*
3342  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3343  * driver.  Note that it assumes IP is exclusive...
3344  */
3345 /* ARGSUSED */
3346 void
3347 ndp_fastpath_flush(nce_t *nce, char *arg)
3348 {
3349 	if (nce->nce_flags & NCE_F_MAPPING)
3350 		return;
3351 	/* No fastpath info? */
3352 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3353 		return;
3354 
3355 	if (nce->nce_ipversion == IPV4_VERSION &&
3356 	    nce->nce_flags & NCE_F_BCAST) {
3357 		/*
3358 		 * IPv4 BROADCAST entries:
3359 		 * We can't delete the nce since it is difficult to
3360 		 * recreate these without going through the
3361 		 * ipif down/up dance.
3362 		 *
3363 		 * All access to nce->nce_fp_mp in the case of these
3364 		 * is protected by nce_lock.
3365 		 */
3366 		mutex_enter(&nce->nce_lock);
3367 		if (nce->nce_fp_mp != NULL) {
3368 			freeb(nce->nce_fp_mp);
3369 			nce->nce_fp_mp = NULL;
3370 			mutex_exit(&nce->nce_lock);
3371 			nce_fastpath(nce);
3372 		} else {
3373 			mutex_exit(&nce->nce_lock);
3374 		}
3375 	} else {
3376 		/* Just delete the NCE... */
3377 		ndp_delete(nce);
3378 	}
3379 }
3380 
3381 /*
3382  * Return a pointer to a given option in the packet.
3383  * Assumes that option part of the packet have already been validated.
3384  */
3385 nd_opt_hdr_t *
3386 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3387 {
3388 	while (optlen > 0) {
3389 		if (opt->nd_opt_type == opt_type)
3390 			return (opt);
3391 		optlen -= 8 * opt->nd_opt_len;
3392 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3393 	}
3394 	return (NULL);
3395 }
3396 
3397 /*
3398  * Verify all option lengths present are > 0, also check to see
3399  * if the option lengths and packet length are consistent.
3400  */
3401 boolean_t
3402 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3403 {
3404 	ASSERT(opt != NULL);
3405 	while (optlen > 0) {
3406 		if (opt->nd_opt_len == 0)
3407 			return (B_FALSE);
3408 		optlen -= 8 * opt->nd_opt_len;
3409 		if (optlen < 0)
3410 			return (B_FALSE);
3411 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3412 	}
3413 	return (B_TRUE);
3414 }
3415 
3416 /*
3417  * ndp_walk function.
3418  * Free a fraction of the NCE cache entries.
3419  * A fraction of zero means to not free any in that category.
3420  */
3421 void
3422 ndp_cache_reclaim(nce_t *nce, char *arg)
3423 {
3424 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3425 	uint_t	rand;
3426 
3427 	if (nce->nce_flags & NCE_F_PERMANENT)
3428 		return;
3429 
3430 	rand = (uint_t)lbolt +
3431 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3432 	if (ncr->ncr_host != 0 &&
3433 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3434 		ndp_delete(nce);
3435 		return;
3436 	}
3437 }
3438 
3439 /*
3440  * ndp_walk function.
3441  * Count the number of NCEs that can be deleted.
3442  * These would be hosts but not routers.
3443  */
3444 void
3445 ndp_cache_count(nce_t *nce, char *arg)
3446 {
3447 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3448 
3449 	if (nce->nce_flags & NCE_F_PERMANENT)
3450 		return;
3451 
3452 	ncc->ncc_total++;
3453 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3454 		ncc->ncc_host++;
3455 }
3456 
3457 #ifdef DEBUG
3458 void
3459 nce_trace_ref(nce_t *nce)
3460 {
3461 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3462 
3463 	if (nce->nce_trace_disable)
3464 		return;
3465 
3466 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3467 		nce->nce_trace_disable = B_TRUE;
3468 		nce_trace_cleanup(nce);
3469 	}
3470 }
3471 
3472 void
3473 nce_untrace_ref(nce_t *nce)
3474 {
3475 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3476 
3477 	if (!nce->nce_trace_disable)
3478 		th_trace_unref(nce);
3479 }
3480 
3481 static void
3482 nce_trace_cleanup(const nce_t *nce)
3483 {
3484 	th_trace_cleanup(nce, nce->nce_trace_disable);
3485 }
3486 #endif
3487 
3488 /*
3489  * Called when address resolution fails due to a timeout.
3490  * Send an ICMP unreachable in response to all queued packets.
3491  */
3492 void
3493 arp_resolv_failed(nce_t *nce)
3494 {
3495 	mblk_t	*mp, *nxt_mp, *first_mp;
3496 	char	buf[INET6_ADDRSTRLEN];
3497 	zoneid_t zoneid = GLOBAL_ZONEID;
3498 	struct in_addr ipv4addr;
3499 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3500 
3501 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3502 	ip3dbg(("arp_resolv_failed: dst %s\n",
3503 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3504 	mutex_enter(&nce->nce_lock);
3505 	mp = nce->nce_qd_mp;
3506 	nce->nce_qd_mp = NULL;
3507 	mutex_exit(&nce->nce_lock);
3508 
3509 	while (mp != NULL) {
3510 		nxt_mp = mp->b_next;
3511 		mp->b_next = NULL;
3512 		mp->b_prev = NULL;
3513 
3514 		first_mp = mp;
3515 		/*
3516 		 * Send icmp unreachable messages
3517 		 * to the hosts.
3518 		 */
3519 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3520 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3521 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3522 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3523 		mp = nxt_mp;
3524 	}
3525 }
3526 
3527 int
3528 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3529     nce_t **newnce, nce_t *src_nce)
3530 {
3531 	int	err;
3532 	nce_t	*nce;
3533 	in6_addr_t addr6;
3534 	ip_stack_t *ipst = ill->ill_ipst;
3535 
3536 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3537 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3538 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3539 	/*
3540 	 * NOTE: IPv4 never matches across the illgrp since the NCE's we're
3541 	 * looking up have fastpath headers that are inherently per-ill.
3542 	 */
3543 	nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
3544 	if (nce == NULL) {
3545 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3546 	} else {
3547 		*newnce = nce;
3548 		err = EEXIST;
3549 	}
3550 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3551 	return (err);
3552 }
3553 
3554 /*
3555  * NDP Cache Entry creation routine for IPv4.
3556  * Mapped entries are handled in arp.
3557  * This routine must always be called with ndp4->ndp_g_lock held.
3558  * Prior to return, nce_refcnt is incremented.
3559  */
3560 static int
3561 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3562     nce_t **newnce, nce_t *src_nce)
3563 {
3564 	static	nce_t		nce_nil;
3565 	nce_t		*nce;
3566 	mblk_t		*mp;
3567 	mblk_t		*template = NULL;
3568 	nce_t		**ncep;
3569 	ip_stack_t	*ipst = ill->ill_ipst;
3570 	uint16_t	state = ND_INITIAL;
3571 	int		err;
3572 
3573 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3574 	ASSERT(!ill->ill_isv6);
3575 	ASSERT((flags & NCE_F_MAPPING) == 0);
3576 
3577 	if (ill->ill_resolver_mp == NULL)
3578 		return (EINVAL);
3579 	/*
3580 	 * Allocate the mblk to hold the nce.
3581 	 */
3582 	mp = allocb(sizeof (nce_t), BPRI_MED);
3583 	if (mp == NULL)
3584 		return (ENOMEM);
3585 
3586 	nce = (nce_t *)mp->b_rptr;
3587 	mp->b_wptr = (uchar_t *)&nce[1];
3588 	*nce = nce_nil;
3589 	nce->nce_ill = ill;
3590 	nce->nce_ipversion = IPV4_VERSION;
3591 	nce->nce_flags = flags;
3592 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3593 	nce->nce_rcnt = ill->ill_xmit_count;
3594 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3595 	nce->nce_mask = ipv6_all_ones;
3596 	nce->nce_extract_mask = ipv6_all_zeros;
3597 	nce->nce_ll_extract_start = 0;
3598 	nce->nce_qd_mp = NULL;
3599 	nce->nce_mp = mp;
3600 	/* This one is for nce getting created */
3601 	nce->nce_refcnt = 1;
3602 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3603 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3604 
3605 	nce->nce_trace_disable = B_FALSE;
3606 
3607 	if (src_nce != NULL) {
3608 		/*
3609 		 * src_nce has been provided by the caller. The only
3610 		 * caller who provides a non-null, non-broadcast
3611 		 * src_nce is from ip_newroute() which must pass in
3612 		 * a ND_REACHABLE src_nce (this condition is verified
3613 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3614 		 */
3615 		mutex_enter(&src_nce->nce_lock);
3616 		state = src_nce->nce_state;
3617 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3618 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3619 			/*
3620 			 * src_nce has been deleted, or
3621 			 * ip_arp_news is in the middle of
3622 			 * flushing entries in the the nce.
3623 			 * Fail the add, since we don't know
3624 			 * if it is safe to copy the contents of
3625 			 * src_nce
3626 			 */
3627 			DTRACE_PROBE2(nce__bad__src__nce,
3628 			    nce_t *, src_nce, ill_t *, ill);
3629 			mutex_exit(&src_nce->nce_lock);
3630 			err = EINVAL;
3631 			goto err_ret;
3632 		}
3633 		template = copyb(src_nce->nce_res_mp);
3634 		mutex_exit(&src_nce->nce_lock);
3635 		if (template == NULL) {
3636 			err = ENOMEM;
3637 			goto err_ret;
3638 		}
3639 	} else if (flags & NCE_F_BCAST) {
3640 		/*
3641 		 * broadcast nce.
3642 		 */
3643 		template = copyb(ill->ill_bcast_mp);
3644 		if (template == NULL) {
3645 			err = ENOMEM;
3646 			goto err_ret;
3647 		}
3648 		state = ND_REACHABLE;
3649 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3650 		/*
3651 		 * NORESOLVER entries are always created in the REACHABLE
3652 		 * state. We create a nce_res_mp with the IP nexthop address
3653 		 * in the destination address in the DLPI hdr if the
3654 		 * physical length is exactly 4 bytes.
3655 		 *
3656 		 * XXX not clear which drivers set ill_phys_addr_length to
3657 		 * IP_ADDR_LEN.
3658 		 */
3659 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3660 			template = ill_dlur_gen((uchar_t *)addr,
3661 			    ill->ill_phys_addr_length,
3662 			    ill->ill_sap, ill->ill_sap_length);
3663 		} else {
3664 			template = copyb(ill->ill_resolver_mp);
3665 		}
3666 		if (template == NULL) {
3667 			err = ENOMEM;
3668 			goto err_ret;
3669 		}
3670 		state = ND_REACHABLE;
3671 	}
3672 	nce->nce_fp_mp = NULL;
3673 	nce->nce_res_mp = template;
3674 	nce->nce_state = state;
3675 	if (state == ND_REACHABLE) {
3676 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3677 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3678 	} else {
3679 		nce->nce_last = 0;
3680 		if (state == ND_INITIAL)
3681 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3682 	}
3683 
3684 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3685 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3686 	/*
3687 	 * Atomically ensure that the ill is not CONDEMNED, before
3688 	 * adding the NCE.
3689 	 */
3690 	mutex_enter(&ill->ill_lock);
3691 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3692 		mutex_exit(&ill->ill_lock);
3693 		err = EINVAL;
3694 		goto err_ret;
3695 	}
3696 	if ((nce->nce_next = *ncep) != NULL)
3697 		nce->nce_next->nce_ptpn = &nce->nce_next;
3698 	*ncep = nce;
3699 	nce->nce_ptpn = ncep;
3700 	*newnce = nce;
3701 	/* This one is for nce being used by an active thread */
3702 	NCE_REFHOLD(*newnce);
3703 
3704 	/* Bump up the number of nce's referencing this ill */
3705 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
3706 	    (char *), "nce", (void *), nce);
3707 	ill->ill_nce_cnt++;
3708 	mutex_exit(&ill->ill_lock);
3709 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3710 	return (0);
3711 err_ret:
3712 	freeb(mp);
3713 	freemsg(template);
3714 	return (err);
3715 }
3716 
3717 /*
3718  * ndp_walk routine to delete all entries that have a given destination or
3719  * gateway address and cached link layer (MAC) address.  This is used when ARP
3720  * informs us that a network-to-link-layer mapping may have changed.
3721  */
3722 void
3723 nce_delete_hw_changed(nce_t *nce, void *arg)
3724 {
3725 	nce_hw_map_t *hwm = arg;
3726 	mblk_t *mp;
3727 	dl_unitdata_req_t *dlu;
3728 	uchar_t *macaddr;
3729 	ill_t *ill;
3730 	int saplen;
3731 	ipaddr_t nce_addr;
3732 
3733 	if (nce->nce_state != ND_REACHABLE)
3734 		return;
3735 
3736 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3737 	if (nce_addr != hwm->hwm_addr)
3738 		return;
3739 
3740 	mutex_enter(&nce->nce_lock);
3741 	if ((mp = nce->nce_res_mp) == NULL) {
3742 		mutex_exit(&nce->nce_lock);
3743 		return;
3744 	}
3745 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3746 	macaddr = (uchar_t *)(dlu + 1);
3747 	ill = nce->nce_ill;
3748 	if ((saplen = ill->ill_sap_length) > 0)
3749 		macaddr += saplen;
3750 	else
3751 		saplen = -saplen;
3752 
3753 	/*
3754 	 * If the hardware address is unchanged, then leave this one alone.
3755 	 * Note that saplen == abs(saplen) now.
3756 	 */
3757 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3758 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3759 		mutex_exit(&nce->nce_lock);
3760 		return;
3761 	}
3762 	mutex_exit(&nce->nce_lock);
3763 
3764 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3765 	ndp_delete(nce);
3766 }
3767 
3768 /*
3769  * This function verifies whether a given IPv4 address is potentially known to
3770  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3771  * so that it can continue to look for hardware changes on that address.
3772  */
3773 boolean_t
3774 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3775 {
3776 	nce_t		*nce;
3777 	struct in_addr	nceaddr;
3778 	ip_stack_t	*ipst = ns->netstack_ip;
3779 
3780 	if (addr == INADDR_ANY)
3781 		return (B_FALSE);
3782 
3783 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3784 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3785 	for (; nce != NULL; nce = nce->nce_next) {
3786 		/* Note that only v4 mapped entries are in the table. */
3787 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3788 		if (addr == nceaddr.s_addr &&
3789 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3790 			/* Single flag check; no lock needed */
3791 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3792 				break;
3793 		}
3794 	}
3795 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3796 	return (nce != NULL);
3797 }
3798 
3799 /*
3800  * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly
3801  * with IPMP.  Specifically, since neighbor discovery is always done on
3802  * underlying interfaces (even for addresses owned by an IPMP interface), we
3803  * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface
3804  * associated with `ill' (if it exists).
3805  */
3806 static ipif_t *
3807 ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill)
3808 {
3809 	ipif_t *ipif;
3810 	ip_stack_t *ipst = ill->ill_ipst;
3811 
3812 	ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3813 	if (ipif == NULL && IS_UNDER_IPMP(ill)) {
3814 		if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
3815 			ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
3816 			ill_refrele(ill);
3817 		}
3818 	}
3819 	return (ipif);
3820 }
3821