xref: /titanic_52/usr/src/uts/common/inet/ip/ip_ndp.c (revision 22f5594a529d50114d839d4ddecc2c499731a3d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ipclassifier.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_rts.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_ndp.h>
66 #include <inet/ipsec_impl.h>
67 #include <inet/ipsec_info.h>
68 #include <inet/sctp_ip.h>
69 
70 /*
71  * Function names with nce_ prefix are static while function
72  * names with ndp_ prefix are used by rest of the IP.
73  *
74  * Lock ordering:
75  *
76  *	ndp_g_lock -> ill_lock -> nce_lock
77  *
78  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
79  * nce_next.  Nce_lock protects the contents of the NCE (particularly
80  * nce_refcnt).
81  */
82 
83 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
84     uint32_t ll_addr_len);
85 static	void	nce_ire_delete(nce_t *nce);
86 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
87 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
88 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
89 static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
90 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
91     uchar_t *addr);
92 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
93 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
94 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
95 static	void	nce_update(nce_t *nce, uint16_t new_state,
96     uchar_t *new_ll_addr);
97 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
98 static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
99     ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
100     const in6_addr_t *target, int flag);
101 static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
102     nce_t **, nce_t *);
103 
104 #ifdef DEBUG
105 static void	nce_trace_cleanup(const nce_t *);
106 #endif
107 
/*
 * Yield the address of the hash bucket (list head slot) that an address
 * maps to in the per-stack IPv4 (ips_ndp4) or IPv6 (ips_ndp6) nce table.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[				\
	    IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

#define	NCE_HASH_PTR_V6(ipst, addr)					\
	(&((ipst)->ips_ndp6->nce_hash_tbl[				\
	    NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
114 
115 /*
116  * Compute default flags to use for an advertisement of this nce's address.
117  */
118 static int
119 nce_advert_flags(const nce_t *nce)
120 {
121 	int flag = 0;
122 
123 	if (nce->nce_flags & NCE_F_ISROUTER)
124 		flag |= NDP_ISROUTER;
125 	if (!(nce->nce_flags & NCE_F_ANYCAST))
126 		flag |= NDP_ORIDE;
127 
128 	return (flag);
129 }
130 
/*
 * Fixed (non-tunable) probe interval, picked from the link's capabilities:
 * 150 when ill_note_link is set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER; presumably milliseconds -- confirm against that macro.
 */
#define	ILL_PROBE_INTERVAL(ill)	(!(ill)->ill_note_link ? 1500 : 150)
133 
134 /*
135  * NDP Cache Entry creation routine.
136  * Mapped entries will never do NUD .
137  * This routine must always be called with ndp6->ndp_g_lock held.
138  * Prior to return, nce_refcnt is incremented.
139  */
140 int
141 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
142     const in6_addr_t *mask, const in6_addr_t *extract_mask,
143     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
144     nce_t **newnce)
145 {
146 	static	nce_t		nce_nil;
147 	nce_t		*nce;
148 	mblk_t		*mp;
149 	mblk_t		*template;
150 	nce_t		**ncep;
151 	int		err;
152 	boolean_t	dropped = B_FALSE;
153 	ip_stack_t	*ipst = ill->ill_ipst;
154 
155 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
156 	ASSERT(ill != NULL && ill->ill_isv6);
157 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
158 		ip0dbg(("ndp_add_v6: no addr\n"));
159 		return (EINVAL);
160 	}
161 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
162 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
163 		return (EINVAL);
164 	}
165 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
166 	    (flags & NCE_F_MAPPING)) {
167 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
168 		return (EINVAL);
169 	}
170 	/*
171 	 * Allocate the mblk to hold the nce.
172 	 *
173 	 * XXX This can come out of a separate cache - nce_cache.
174 	 * We don't need the mp anymore as there are no more
175 	 * "qwriter"s
176 	 */
177 	mp = allocb(sizeof (nce_t), BPRI_MED);
178 	if (mp == NULL)
179 		return (ENOMEM);
180 
181 	nce = (nce_t *)mp->b_rptr;
182 	mp->b_wptr = (uchar_t *)&nce[1];
183 	*nce = nce_nil;
184 
185 	/*
186 	 * This one holds link layer address
187 	 */
188 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
189 		template = nce_udreq_alloc(ill);
190 	} else {
191 		if (ill->ill_resolver_mp == NULL) {
192 			freeb(mp);
193 			return (EINVAL);
194 		}
195 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
196 		template = copyb(ill->ill_resolver_mp);
197 	}
198 	if (template == NULL) {
199 		freeb(mp);
200 		return (ENOMEM);
201 	}
202 	nce->nce_ill = ill;
203 	nce->nce_ipversion = IPV6_VERSION;
204 	nce->nce_flags = flags;
205 	nce->nce_state = state;
206 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
207 	nce->nce_rcnt = ill->ill_xmit_count;
208 	nce->nce_addr = *addr;
209 	nce->nce_mask = *mask;
210 	nce->nce_extract_mask = *extract_mask;
211 	nce->nce_ll_extract_start = hw_extract_start;
212 	nce->nce_fp_mp = NULL;
213 	nce->nce_res_mp = template;
214 	if (state == ND_REACHABLE)
215 		nce->nce_last = TICK_TO_MSEC(lbolt64);
216 	else
217 		nce->nce_last = 0;
218 	nce->nce_qd_mp = NULL;
219 	nce->nce_mp = mp;
220 	if (hw_addr != NULL)
221 		nce_set_ll(nce, hw_addr);
222 	/* This one is for nce getting created */
223 	nce->nce_refcnt = 1;
224 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
225 	if (nce->nce_flags & NCE_F_MAPPING) {
226 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
227 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
228 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
229 		ncep = &ipst->ips_ndp6->nce_mask_entries;
230 	} else {
231 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
232 	}
233 
234 	nce->nce_trace_disable = B_FALSE;
235 
236 	/*
237 	 * Atomically ensure that the ill is not CONDEMNED, before
238 	 * adding the NCE.
239 	 */
240 	mutex_enter(&ill->ill_lock);
241 	if (ill->ill_state_flags & ILL_CONDEMNED) {
242 		mutex_exit(&ill->ill_lock);
243 		freeb(mp);
244 		freeb(template);
245 		return (EINVAL);
246 	}
247 	if ((nce->nce_next = *ncep) != NULL)
248 		nce->nce_next->nce_ptpn = &nce->nce_next;
249 	*ncep = nce;
250 	nce->nce_ptpn = ncep;
251 	*newnce = nce;
252 	/* This one is for nce being used by an active thread */
253 	NCE_REFHOLD(*newnce);
254 
255 	/* Bump up the number of nce's referencing this ill */
256 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
257 	    (char *), "nce", (void *), nce);
258 	ill->ill_nce_cnt++;
259 	mutex_exit(&ill->ill_lock);
260 
261 	err = 0;
262 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
263 		mutex_enter(&nce->nce_lock);
264 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
265 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
266 		mutex_exit(&nce->nce_lock);
267 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
268 		    &ipv6_all_zeros, addr, NDP_PROBE);
269 		if (dropped) {
270 			mutex_enter(&nce->nce_lock);
271 			nce->nce_pcnt++;
272 			mutex_exit(&nce->nce_lock);
273 		}
274 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
275 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
276 		err = EINPROGRESS;
277 	} else if (flags & NCE_F_UNSOL_ADV) {
278 		/*
279 		 * We account for the transmit below by assigning one
280 		 * less than the ndd variable. Subsequent decrements
281 		 * are done in ndp_timer.
282 		 */
283 		mutex_enter(&nce->nce_lock);
284 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
285 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
286 		mutex_exit(&nce->nce_lock);
287 		dropped = nce_xmit(ill,
288 		    ND_NEIGHBOR_ADVERT,
289 		    ill,	/* ill to be used for extracting ill_nd_lla */
290 		    B_TRUE,	/* use ill_nd_lla */
291 		    addr,	/* Source and target of the advertisement pkt */
292 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
293 		    nce_advert_flags(nce));
294 		mutex_enter(&nce->nce_lock);
295 		if (dropped)
296 			nce->nce_unsolicit_count++;
297 		if (nce->nce_unsolicit_count != 0) {
298 			nce->nce_timeout_id = timeout(ndp_timer, nce,
299 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
300 		}
301 		mutex_exit(&nce->nce_lock);
302 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
303 	}
304 	/*
305 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
306 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
307 	 * We call nce_fastpath from nce_update if the link layer address of
308 	 * the peer changes from nce_update
309 	 */
310 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
311 		nce_fastpath(nce);
312 	return (err);
313 }
314 
315 int
316 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
317     const in6_addr_t *mask, const in6_addr_t *extract_mask,
318     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
319     nce_t **newnce)
320 {
321 	int	err = 0;
322 	nce_t	*nce;
323 	ip_stack_t	*ipst = ill->ill_ipst;
324 
325 	ASSERT(ill->ill_isv6);
326 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
327 
328 	/* Get head of v6 hash table */
329 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
330 	nce = nce_lookup_addr(ill, addr, nce);
331 	if (nce == NULL) {
332 		err = ndp_add_v6(ill,
333 		    hw_addr,
334 		    addr,
335 		    mask,
336 		    extract_mask,
337 		    hw_extract_start,
338 		    flags,
339 		    state,
340 		    newnce);
341 	} else {
342 		*newnce = nce;
343 		err = EEXIST;
344 	}
345 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
346 	return (err);
347 }
348 
349 /*
350  * Remove all the CONDEMNED nces from the appropriate hash table.
351  * We create a private list of NCEs, these may have ires pointing
352  * to them, so the list will be passed through to clean up dependent
353  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
354  */
355 static void
356 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
357 {
358 	nce_t *nce1;
359 	nce_t **ptpn;
360 
361 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
362 	ASSERT(ndp->ndp_g_walker == 0);
363 	for (; nce; nce = nce1) {
364 		nce1 = nce->nce_next;
365 		mutex_enter(&nce->nce_lock);
366 		if (nce->nce_flags & NCE_F_CONDEMNED) {
367 			ptpn = nce->nce_ptpn;
368 			nce1 = nce->nce_next;
369 			if (nce1 != NULL)
370 				nce1->nce_ptpn = ptpn;
371 			*ptpn = nce1;
372 			nce->nce_ptpn = NULL;
373 			nce->nce_next = NULL;
374 			nce->nce_next = *free_nce_list;
375 			*free_nce_list = nce;
376 		}
377 		mutex_exit(&nce->nce_lock);
378 	}
379 }
380 
381 /*
382  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
383  *    will return this NCE. Also no new IREs will be created that
384  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
385  *    be started (See NDP_RESTART_TIMER).
386  * 2. Cancel any currently running timeouts.
387  * 3. If there is an ndp walker, return. The walker will do the cleanup.
388  *    This ensures that walkers see a consistent list of NCEs while walking.
389  * 4. Otherwise remove the NCE from the list of NCEs
390  * 5. Delete all IREs pointing to this NCE.
391  */
392 void
393 ndp_delete(nce_t *nce)
394 {
395 	nce_t	**ptpn;
396 	nce_t	*nce1;
397 	int	ipversion = nce->nce_ipversion;
398 	ndp_g_t *ndp;
399 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
400 
401 	if (ipversion == IPV4_VERSION)
402 		ndp = ipst->ips_ndp4;
403 	else
404 		ndp = ipst->ips_ndp6;
405 
406 	/* Serialize deletes */
407 	mutex_enter(&nce->nce_lock);
408 	if (nce->nce_flags & NCE_F_CONDEMNED) {
409 		/* Some other thread is doing the delete */
410 		mutex_exit(&nce->nce_lock);
411 		return;
412 	}
413 	/*
414 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
415 	 * refcnt has to be >= 2
416 	 */
417 	ASSERT(nce->nce_refcnt >= 2);
418 	nce->nce_flags |= NCE_F_CONDEMNED;
419 	mutex_exit(&nce->nce_lock);
420 
421 	nce_fastpath_list_delete(nce);
422 
423 	/*
424 	 * Cancel any running timer. Timeout can't be restarted
425 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
426 	 * Passing invalid timeout id is fine.
427 	 */
428 	if (nce->nce_timeout_id != 0) {
429 		(void) untimeout(nce->nce_timeout_id);
430 		nce->nce_timeout_id = 0;
431 	}
432 
433 	mutex_enter(&ndp->ndp_g_lock);
434 	if (nce->nce_ptpn == NULL) {
435 		/*
436 		 * The last ndp walker has already removed this nce from
437 		 * the list after we marked the nce CONDEMNED and before
438 		 * we grabbed the global lock.
439 		 */
440 		mutex_exit(&ndp->ndp_g_lock);
441 		return;
442 	}
443 	if (ndp->ndp_g_walker > 0) {
444 		/*
445 		 * Can't unlink. The walker will clean up
446 		 */
447 		ndp->ndp_g_walker_cleanup = B_TRUE;
448 		mutex_exit(&ndp->ndp_g_lock);
449 		return;
450 	}
451 
452 	/*
453 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
454 	 * the timer since it is marked CONDEMNED.
455 	 */
456 	ptpn = nce->nce_ptpn;
457 	nce1 = nce->nce_next;
458 	if (nce1 != NULL)
459 		nce1->nce_ptpn = ptpn;
460 	*ptpn = nce1;
461 	nce->nce_ptpn = NULL;
462 	nce->nce_next = NULL;
463 	mutex_exit(&ndp->ndp_g_lock);
464 
465 	nce_ire_delete(nce);
466 }
467 
468 void
469 ndp_inactive(nce_t *nce)
470 {
471 	mblk_t		**mpp;
472 	ill_t		*ill;
473 
474 	ASSERT(nce->nce_refcnt == 0);
475 	ASSERT(MUTEX_HELD(&nce->nce_lock));
476 	ASSERT(nce->nce_fastpath == NULL);
477 
478 	/* Free all nce allocated messages */
479 	mpp = &nce->nce_first_mp_to_free;
480 	do {
481 		while (*mpp != NULL) {
482 			mblk_t  *mp;
483 
484 			mp = *mpp;
485 			*mpp = mp->b_next;
486 
487 			inet_freemsg(mp);
488 		}
489 	} while (mpp++ != &nce->nce_last_mp_to_free);
490 
491 #ifdef DEBUG
492 	nce_trace_cleanup(nce);
493 #endif
494 
495 	ill = nce->nce_ill;
496 	mutex_enter(&ill->ill_lock);
497 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
498 	    (char *), "nce", (void *), nce);
499 	ill->ill_nce_cnt--;
500 	/*
501 	 * If the number of nce's associated with this ill have dropped
502 	 * to zero, check whether we need to restart any operation that
503 	 * is waiting for this to happen.
504 	 */
505 	if (ILL_DOWN_OK(ill)) {
506 		/* ipif_ill_refrele_tail drops the ill_lock */
507 		ipif_ill_refrele_tail(ill);
508 	} else {
509 		mutex_exit(&ill->ill_lock);
510 	}
511 	mutex_destroy(&nce->nce_lock);
512 	if (nce->nce_mp != NULL)
513 		inet_freemsg(nce->nce_mp);
514 }
515 
516 /*
517  * ndp_walk routine.  Delete the nce if it is associated with the ill
518  * that is going away.  Always called as a writer.
519  */
520 void
521 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
522 {
523 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
524 		ndp_delete(nce);
525 	}
526 }
527 
528 /*
529  * Walk a list of to be inactive NCEs and blow away all the ires.
530  */
531 static void
532 nce_ire_delete_list(nce_t *nce)
533 {
534 	nce_t *nce_next;
535 
536 	ASSERT(nce != NULL);
537 	while (nce != NULL) {
538 		nce_next = nce->nce_next;
539 		nce->nce_next = NULL;
540 
541 		/*
542 		 * It is possible for the last ndp walker (this thread)
543 		 * to come here after ndp_delete has marked the nce CONDEMNED
544 		 * and before it has removed the nce from the fastpath list
545 		 * or called untimeout. So we need to do it here. It is safe
546 		 * for both ndp_delete and this thread to do it twice or
547 		 * even simultaneously since each of the threads has a
548 		 * reference on the nce.
549 		 */
550 		nce_fastpath_list_delete(nce);
551 		/*
552 		 * Cancel any running timer. Timeout can't be restarted
553 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
554 		 * Passing invalid timeout id is fine.
555 		 */
556 		if (nce->nce_timeout_id != 0) {
557 			(void) untimeout(nce->nce_timeout_id);
558 			nce->nce_timeout_id = 0;
559 		}
560 		/*
561 		 * We might hit this func thus in the v4 case:
562 		 * ipif_down->ipif_ndp_down->ndp_walk
563 		 */
564 
565 		if (nce->nce_ipversion == IPV4_VERSION) {
566 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
567 			    IRE_CACHE, nce_ire_delete1,
568 			    (char *)nce, nce->nce_ill);
569 		} else {
570 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
571 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
572 			    IRE_CACHE, nce_ire_delete1,
573 			    (char *)nce, nce->nce_ill);
574 		}
575 		NCE_REFRELE_NOTR(nce);
576 		nce = nce_next;
577 	}
578 }
579 
580 /*
581  * Delete an ire when the nce goes away.
582  */
583 /* ARGSUSED */
584 static void
585 nce_ire_delete(nce_t *nce)
586 {
587 	if (nce->nce_ipversion == IPV6_VERSION) {
588 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
589 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
590 		NCE_REFRELE_NOTR(nce);
591 	} else {
592 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
593 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
594 		NCE_REFRELE_NOTR(nce);
595 	}
596 }
597 
598 /*
599  * ire_walk routine used to delete every IRE that shares this nce
600  */
601 static void
602 nce_ire_delete1(ire_t *ire, char *nce_arg)
603 {
604 	nce_t	*nce = (nce_t *)nce_arg;
605 
606 	ASSERT(ire->ire_type == IRE_CACHE);
607 
608 	if (ire->ire_nce == nce) {
609 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
610 		ire_delete(ire);
611 	}
612 }
613 
614 /*
615  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
616  */
617 boolean_t
618 ndp_restart_dad(nce_t *nce)
619 {
620 	boolean_t started;
621 	boolean_t dropped;
622 
623 	if (nce == NULL)
624 		return (B_FALSE);
625 	mutex_enter(&nce->nce_lock);
626 	if (nce->nce_state == ND_PROBE) {
627 		mutex_exit(&nce->nce_lock);
628 		started = B_TRUE;
629 	} else if (nce->nce_state == ND_REACHABLE) {
630 		nce->nce_state = ND_PROBE;
631 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
632 		mutex_exit(&nce->nce_lock);
633 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
634 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
635 		if (dropped) {
636 			mutex_enter(&nce->nce_lock);
637 			nce->nce_pcnt++;
638 			mutex_exit(&nce->nce_lock);
639 		}
640 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
641 		started = B_TRUE;
642 	} else {
643 		mutex_exit(&nce->nce_lock);
644 		started = B_FALSE;
645 	}
646 	return (started);
647 }
648 
649 /*
650  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
651  * If one is found, the refcnt on the nce will be incremented.
652  */
653 nce_t *
654 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
655 {
656 	nce_t	*nce;
657 	ip_stack_t	*ipst;
658 
659 	ASSERT(ill != NULL);
660 	ipst = ill->ill_ipst;
661 
662 	ASSERT(ill != NULL && ill->ill_isv6);
663 	if (!caller_holds_lock) {
664 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
665 	}
666 
667 	/* Get head of v6 hash table */
668 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
669 	nce = nce_lookup_addr(ill, addr, nce);
670 	if (nce == NULL)
671 		nce = nce_lookup_mapping(ill, addr);
672 	if (!caller_holds_lock)
673 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
674 	return (nce);
675 }
676 /*
677  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
678  * If one is found, the refcnt on the nce will be incremented.
679  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
680  * so we skip the nce_lookup_mapping call.
681  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
682  */
683 nce_t *
684 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
685 {
686 	nce_t	*nce;
687 	in6_addr_t addr6;
688 	ip_stack_t *ipst = ill->ill_ipst;
689 
690 	if (!caller_holds_lock) {
691 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
692 	}
693 
694 	/* Get head of v4 hash table */
695 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
696 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
697 	nce = nce_lookup_addr(ill, &addr6, nce);
698 	if (!caller_holds_lock)
699 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
700 	return (nce);
701 }
702 
703 /*
704  * Cache entry lookup.  Try to find an nce matching the parameters passed.
705  * Look only for exact entries (no mappings).  If an nce is found, increment
706  * the hold count on that nce. The caller passes in the start of the
707  * appropriate hash table, and must be holding the appropriate global
708  * lock (ndp_g_lock).
709  */
710 static nce_t *
711 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
712 {
713 	ndp_g_t		*ndp;
714 	ip_stack_t	*ipst = ill->ill_ipst;
715 
716 	if (ill->ill_isv6)
717 		ndp = ipst->ips_ndp6;
718 	else
719 		ndp = ipst->ips_ndp4;
720 
721 	ASSERT(ill != NULL);
722 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
723 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
724 		return (NULL);
725 	for (; nce != NULL; nce = nce->nce_next) {
726 		if (nce->nce_ill == ill) {
727 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
728 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
729 			    &ipv6_all_ones)) {
730 				mutex_enter(&nce->nce_lock);
731 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
732 					NCE_REFHOLD_LOCKED(nce);
733 					mutex_exit(&nce->nce_lock);
734 					break;
735 				}
736 				mutex_exit(&nce->nce_lock);
737 			}
738 		}
739 	}
740 	return (nce);
741 }
742 
743 /*
744  * Cache entry lookup.  Try to find an nce matching the parameters passed.
745  * Look only for mappings.
746  */
747 static nce_t *
748 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
749 {
750 	nce_t	*nce;
751 	ip_stack_t	*ipst = ill->ill_ipst;
752 
753 	ASSERT(ill != NULL && ill->ill_isv6);
754 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
755 	if (!IN6_IS_ADDR_MULTICAST(addr))
756 		return (NULL);
757 	nce = ipst->ips_ndp6->nce_mask_entries;
758 	for (; nce != NULL; nce = nce->nce_next)
759 		if (nce->nce_ill == ill &&
760 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
761 			mutex_enter(&nce->nce_lock);
762 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
763 				NCE_REFHOLD_LOCKED(nce);
764 				mutex_exit(&nce->nce_lock);
765 				break;
766 			}
767 			mutex_exit(&nce->nce_lock);
768 		}
769 	return (nce);
770 }
771 
772 /*
773  * Process passed in parameters either from an incoming packet or via
774  * user ioctl.
775  */
776 void
777 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
778 {
779 	ill_t	*ill = nce->nce_ill;
780 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
781 	mblk_t	*mp;
782 	boolean_t ll_updated = B_FALSE;
783 	boolean_t ll_changed;
784 	ip_stack_t	*ipst = ill->ill_ipst;
785 
786 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
787 	/*
788 	 * No updates of link layer address or the neighbor state is
789 	 * allowed, when the cache is in NONUD state.  This still
790 	 * allows for responding to reachability solicitation.
791 	 */
792 	mutex_enter(&nce->nce_lock);
793 	if (nce->nce_state == ND_INCOMPLETE) {
794 		if (hw_addr == NULL) {
795 			mutex_exit(&nce->nce_lock);
796 			return;
797 		}
798 		nce_set_ll(nce, hw_addr);
799 		/*
800 		 * Update nce state and send the queued packets
801 		 * back to ip this time ire will be added.
802 		 */
803 		if (flag & ND_NA_FLAG_SOLICITED) {
804 			nce_update(nce, ND_REACHABLE, NULL);
805 		} else {
806 			nce_update(nce, ND_STALE, NULL);
807 		}
808 		mutex_exit(&nce->nce_lock);
809 		nce_fastpath(nce);
810 		mutex_enter(&nce->nce_lock);
811 		mp = nce->nce_qd_mp;
812 		nce->nce_qd_mp = NULL;
813 		mutex_exit(&nce->nce_lock);
814 		while (mp != NULL) {
815 			mblk_t *nxt_mp, *data_mp;
816 
817 			nxt_mp = mp->b_next;
818 			mp->b_next = NULL;
819 
820 			if (mp->b_datap->db_type == M_CTL)
821 				data_mp = mp->b_cont;
822 			else
823 				data_mp = mp;
824 			if (data_mp->b_prev != NULL) {
825 				ill_t   *inbound_ill;
826 				queue_t *fwdq = NULL;
827 				uint_t ifindex;
828 
829 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
830 				inbound_ill = ill_lookup_on_ifindex(ifindex,
831 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
832 				if (inbound_ill == NULL) {
833 					data_mp->b_prev = NULL;
834 					freemsg(mp);
835 					return;
836 				} else {
837 					fwdq = inbound_ill->ill_rq;
838 				}
839 				data_mp->b_prev = NULL;
840 				/*
841 				 * Send a forwarded packet back into ip_rput_v6
842 				 * just as in ire_send_v6().
843 				 * Extract the queue from b_prev (set in
844 				 * ip_rput_data_v6).
845 				 */
846 				if (fwdq != NULL) {
847 					/*
848 					 * Forwarded packets hop count will
849 					 * get decremented in ip_rput_data_v6
850 					 */
851 					if (data_mp != mp)
852 						freeb(mp);
853 					put(fwdq, data_mp);
854 				} else {
855 					/*
856 					 * Send locally originated packets back
857 					 * into * ip_wput_v6.
858 					 */
859 					put(ill->ill_wq, mp);
860 				}
861 				ill_refrele(inbound_ill);
862 			} else {
863 				put(ill->ill_wq, mp);
864 			}
865 			mp = nxt_mp;
866 		}
867 		return;
868 	}
869 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
870 	if (!is_adv) {
871 		/* If this is a SOLICITATION request only */
872 		if (ll_changed)
873 			nce_update(nce, ND_STALE, hw_addr);
874 		mutex_exit(&nce->nce_lock);
875 		return;
876 	}
877 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
878 		/* If in any other state than REACHABLE, ignore */
879 		if (nce->nce_state == ND_REACHABLE) {
880 			nce_update(nce, ND_STALE, NULL);
881 		}
882 		mutex_exit(&nce->nce_lock);
883 		return;
884 	} else {
885 		if (ll_changed) {
886 			nce_update(nce, ND_UNCHANGED, hw_addr);
887 			ll_updated = B_TRUE;
888 		}
889 		if (flag & ND_NA_FLAG_SOLICITED) {
890 			nce_update(nce, ND_REACHABLE, NULL);
891 		} else {
892 			if (ll_updated) {
893 				nce_update(nce, ND_STALE, NULL);
894 			}
895 		}
896 		mutex_exit(&nce->nce_lock);
897 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
898 		    NCE_F_ISROUTER)) {
899 			ire_t *ire;
900 
901 			/*
902 			 * Router turned to host.  We need to remove the
903 			 * entry as well as any default route that may be
904 			 * using this as a next hop.  This is required by
905 			 * section 7.2.5 of RFC 2461.
906 			 */
907 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
908 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
909 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
910 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
911 			    MATCH_IRE_DEFAULT, ipst);
912 			if (ire != NULL) {
913 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
914 				ire_delete(ire);
915 				ire_refrele(ire);
916 			}
917 			ndp_delete(nce);
918 		}
919 	}
920 }
921 
922 /*
923  * Pass arg1 to the pfi supplied, along with each nce in existence.
924  * ndp_walk() places a REFHOLD on the nce and drops the lock when
925  * walking the hash list.
926  */
927 void
928 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
929     boolean_t trace)
930 {
931 
932 	nce_t	*nce;
933 	nce_t	*nce1;
934 	nce_t	**ncep;
935 	nce_t	*free_nce_list = NULL;
936 
937 	mutex_enter(&ndp->ndp_g_lock);
938 	/* Prevent ndp_delete from unlink and free of NCE */
939 	ndp->ndp_g_walker++;
940 	mutex_exit(&ndp->ndp_g_lock);
941 	for (ncep = ndp->nce_hash_tbl;
942 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
943 		for (nce = *ncep; nce != NULL; nce = nce1) {
944 			nce1 = nce->nce_next;
945 			if (ill == NULL || nce->nce_ill == ill) {
946 				if (trace) {
947 					NCE_REFHOLD(nce);
948 					(*pfi)(nce, arg1);
949 					NCE_REFRELE(nce);
950 				} else {
951 					NCE_REFHOLD_NOTR(nce);
952 					(*pfi)(nce, arg1);
953 					NCE_REFRELE_NOTR(nce);
954 				}
955 			}
956 		}
957 	}
958 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
959 		nce1 = nce->nce_next;
960 		if (ill == NULL || nce->nce_ill == ill) {
961 			if (trace) {
962 				NCE_REFHOLD(nce);
963 				(*pfi)(nce, arg1);
964 				NCE_REFRELE(nce);
965 			} else {
966 				NCE_REFHOLD_NOTR(nce);
967 				(*pfi)(nce, arg1);
968 				NCE_REFRELE_NOTR(nce);
969 			}
970 		}
971 	}
972 	mutex_enter(&ndp->ndp_g_lock);
973 	ndp->ndp_g_walker--;
974 	/*
975 	 * While NCE's are removed from global list they are placed
976 	 * in a private list, to be passed to nce_ire_delete_list().
977 	 * The reason is, there may be ires pointing to this nce
978 	 * which needs to cleaned up.
979 	 */
980 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
981 		/* Time to delete condemned entries */
982 		for (ncep = ndp->nce_hash_tbl;
983 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
984 			nce = *ncep;
985 			if (nce != NULL) {
986 				nce_remove(ndp, nce, &free_nce_list);
987 			}
988 		}
989 		nce = ndp->nce_mask_entries;
990 		if (nce != NULL) {
991 			nce_remove(ndp, nce, &free_nce_list);
992 		}
993 		ndp->ndp_g_walker_cleanup = B_FALSE;
994 	}
995 
996 	mutex_exit(&ndp->ndp_g_lock);
997 
998 	if (free_nce_list != NULL) {
999 		nce_ire_delete_list(free_nce_list);
1000 	}
1001 }
1002 
1003 /*
1004  * Walk everything.
1005  * Note that ill can be NULL hence can't derive the ipst from it.
1006  */
1007 void
1008 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1009 {
1010 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1011 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1012 }
1013 
1014 /*
1015  * Process resolve requests.  Handles both mapped entries
1016  * as well as cases that needs to be send out on the wire.
1017  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1018  * or one is created, we defer making ire point to nce until the
1019  * ire is actually added at which point the nce_refcnt on the nce is
1020  * incremented.  This is done primarily to have symmetry between ire_add()
1021  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1022  */
1023 int
1024 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1025 {
1026 	nce_t		*nce;
1027 	int		err = 0;
1028 	uint32_t	ms;
1029 	mblk_t		*mp_nce = NULL;
1030 	ip_stack_t	*ipst = ill->ill_ipst;
1031 
1032 	ASSERT(ill->ill_isv6);
1033 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1034 		err = nce_set_multicast(ill, dst);
1035 		return (err);
1036 	}
1037 	err = ndp_lookup_then_add_v6(ill,
1038 	    NULL,	/* No hardware address */
1039 	    dst,
1040 	    &ipv6_all_ones,
1041 	    &ipv6_all_zeros,
1042 	    0,
1043 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1044 	    ND_INCOMPLETE,
1045 	    &nce);
1046 
1047 	switch (err) {
1048 	case 0:
1049 		/*
1050 		 * New cache entry was created. Make sure that the state
1051 		 * is not ND_INCOMPLETE. It can be in some other state
1052 		 * even before we send out the solicitation as we could
1053 		 * get un-solicited advertisements.
1054 		 *
1055 		 * If this is an XRESOLV interface, simply return 0,
1056 		 * since we don't want to solicit just yet.
1057 		 */
1058 		if (ill->ill_flags & ILLF_XRESOLV) {
1059 			NCE_REFRELE(nce);
1060 			return (0);
1061 		}
1062 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1063 		mutex_enter(&nce->nce_lock);
1064 		if (nce->nce_state != ND_INCOMPLETE) {
1065 			mutex_exit(&nce->nce_lock);
1066 			rw_exit(&ipst->ips_ill_g_lock);
1067 			NCE_REFRELE(nce);
1068 			return (0);
1069 		}
1070 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1071 		if (mp_nce == NULL) {
1072 			/* The caller will free mp */
1073 			mutex_exit(&nce->nce_lock);
1074 			rw_exit(&ipst->ips_ill_g_lock);
1075 			ndp_delete(nce);
1076 			NCE_REFRELE(nce);
1077 			return (ENOMEM);
1078 		}
1079 		ms = nce_solicit(nce, mp_nce);
1080 		rw_exit(&ipst->ips_ill_g_lock);
1081 		if (ms == 0) {
1082 			/* The caller will free mp */
1083 			if (mp_nce != mp)
1084 				freeb(mp_nce);
1085 			mutex_exit(&nce->nce_lock);
1086 			ndp_delete(nce);
1087 			NCE_REFRELE(nce);
1088 			return (EBUSY);
1089 		}
1090 		mutex_exit(&nce->nce_lock);
1091 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1092 		NCE_REFRELE(nce);
1093 		return (EINPROGRESS);
1094 	case EEXIST:
1095 		/* Resolution in progress just queue the packet */
1096 		mutex_enter(&nce->nce_lock);
1097 		if (nce->nce_state == ND_INCOMPLETE) {
1098 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1099 			if (mp_nce == NULL) {
1100 				err = ENOMEM;
1101 			} else {
1102 				nce_queue_mp(nce, mp_nce);
1103 				err = EINPROGRESS;
1104 			}
1105 		} else {
1106 			/*
1107 			 * Any other state implies we have
1108 			 * a nce but IRE needs to be added ...
1109 			 * ire_add_v6() will take care of the
1110 			 * the case when the nce becomes CONDEMNED
1111 			 * before the ire is added to the table.
1112 			 */
1113 			err = 0;
1114 		}
1115 		mutex_exit(&nce->nce_lock);
1116 		NCE_REFRELE(nce);
1117 		break;
1118 	default:
1119 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1120 		break;
1121 	}
1122 	return (err);
1123 }
1124 
1125 /*
1126  * When there is no resolver, the link layer template is passed in
1127  * the IRE.
1128  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1129  * or one is created, we defer making ire point to nce until the
1130  * ire is actually added at which point the nce_refcnt on the nce is
1131  * incremented.  This is done primarily to have symmetry between ire_add()
1132  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1133  */
1134 int
1135 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1136 {
1137 	nce_t		*nce;
1138 	int		err = 0;
1139 
1140 	ASSERT(ill != NULL);
1141 	ASSERT(ill->ill_isv6);
1142 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1143 		err = nce_set_multicast(ill, dst);
1144 		return (err);
1145 	}
1146 
1147 	err = ndp_lookup_then_add_v6(ill,
1148 	    NULL,	/* hardware address */
1149 	    dst,
1150 	    &ipv6_all_ones,
1151 	    &ipv6_all_zeros,
1152 	    0,
1153 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1154 	    ND_REACHABLE,
1155 	    &nce);
1156 
1157 	switch (err) {
1158 	case 0:
1159 		/*
1160 		 * Cache entry with a proper resolver cookie was
1161 		 * created.
1162 		 */
1163 		NCE_REFRELE(nce);
1164 		break;
1165 	case EEXIST:
1166 		err = 0;
1167 		NCE_REFRELE(nce);
1168 		break;
1169 	default:
1170 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1171 		break;
1172 	}
1173 	return (err);
1174 }
1175 
1176 /*
1177  * For each interface an entry is added for the unspecified multicast group.
1178  * Here that mapping is used to form the multicast cache entry for a particular
1179  * multicast destination.
1180  */
1181 static int
1182 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1183 {
1184 	nce_t		*mnce;	/* Multicast mapping entry */
1185 	nce_t		*nce;
1186 	uchar_t		*hw_addr = NULL;
1187 	int		err = 0;
1188 	ip_stack_t	*ipst = ill->ill_ipst;
1189 
1190 	ASSERT(ill != NULL);
1191 	ASSERT(ill->ill_isv6);
1192 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1193 
1194 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1195 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1196 	nce = nce_lookup_addr(ill, dst, nce);
1197 	if (nce != NULL) {
1198 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1199 		NCE_REFRELE(nce);
1200 		return (0);
1201 	}
1202 	/* No entry, now lookup for a mapping this should never fail */
1203 	mnce = nce_lookup_mapping(ill, dst);
1204 	if (mnce == NULL) {
1205 		/* Something broken for the interface. */
1206 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1207 		return (ESRCH);
1208 	}
1209 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1210 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1211 		/*
1212 		 * For IRE_IF_RESOLVER a hardware mapping can be
1213 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1214 		 * in the ill is copied in ndp_add_v6().
1215 		 */
1216 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1217 		if (hw_addr == NULL) {
1218 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1219 			NCE_REFRELE(mnce);
1220 			return (ENOMEM);
1221 		}
1222 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1223 	}
1224 	NCE_REFRELE(mnce);
1225 	/*
1226 	 * IRE_IF_NORESOLVER type simply copies the resolution
1227 	 * cookie passed in.  So no hw_addr is needed.
1228 	 */
1229 	err = ndp_add_v6(ill,
1230 	    hw_addr,
1231 	    dst,
1232 	    &ipv6_all_ones,
1233 	    &ipv6_all_zeros,
1234 	    0,
1235 	    NCE_F_NONUD,
1236 	    ND_REACHABLE,
1237 	    &nce);
1238 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1239 	if (hw_addr != NULL)
1240 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1241 	if (err != 0) {
1242 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1243 		return (err);
1244 	}
1245 	NCE_REFRELE(nce);
1246 	return (0);
1247 }
1248 
1249 /*
1250  * Return the link layer address, and any flags of a nce.
1251  */
1252 int
1253 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1254 {
1255 	nce_t		*nce;
1256 	in6_addr_t	*addr;
1257 	sin6_t		*sin6;
1258 	dl_unitdata_req_t	*dl;
1259 
1260 	ASSERT(ill != NULL && ill->ill_isv6);
1261 	sin6 = (sin6_t *)&lnr->lnr_addr;
1262 	addr =  &sin6->sin6_addr;
1263 
1264 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1265 	if (nce == NULL)
1266 		return (ESRCH);
1267 	/* If in INCOMPLETE state, no link layer address is available yet */
1268 	if (nce->nce_state == ND_INCOMPLETE)
1269 		goto done;
1270 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1271 	if (ill->ill_flags & ILLF_XRESOLV)
1272 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1273 	else
1274 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1275 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1276 	    sizeof (lnr->lnr_hdw_addr));
1277 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1278 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1279 	if (nce->nce_flags & NCE_F_ISROUTER)
1280 		lnr->lnr_flags = NDF_ISROUTER_ON;
1281 	if (nce->nce_flags & NCE_F_ANYCAST)
1282 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1283 done:
1284 	NCE_REFRELE(nce);
1285 	return (0);
1286 }
1287 
1288 /*
1289  * Send Enable/Disable multicast reqs to driver.
1290  */
1291 int
1292 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1293     uint32_t hw_addr_offset, mblk_t *mp)
1294 {
1295 	nce_t		*nce;
1296 	uchar_t		*hw_addr;
1297 	ip_stack_t	*ipst = ill->ill_ipst;
1298 
1299 	ASSERT(ill != NULL && ill->ill_isv6);
1300 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1301 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1302 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1303 		freemsg(mp);
1304 		return (EINVAL);
1305 	}
1306 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1307 	nce = nce_lookup_mapping(ill, addr);
1308 	if (nce == NULL) {
1309 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1310 		freemsg(mp);
1311 		return (ESRCH);
1312 	}
1313 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1314 	/*
1315 	 * Update dl_addr_length and dl_addr_offset for primitives that
1316 	 * have physical addresses as opposed to full saps
1317 	 */
1318 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1319 	case DL_ENABMULTI_REQ:
1320 		/* Track the state if this is the first enabmulti */
1321 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1322 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1323 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1324 		break;
1325 	case DL_DISABMULTI_REQ:
1326 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1327 		break;
1328 	default:
1329 		NCE_REFRELE(nce);
1330 		ip1dbg(("ndp_mcastreq: default\n"));
1331 		return (EINVAL);
1332 	}
1333 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1334 	NCE_REFRELE(nce);
1335 	ill_dlpi_send(ill, mp);
1336 	return (0);
1337 }
1338 
1339 /*
1340  * Send a neighbor solicitation.
1341  * Returns number of milliseconds after which we should either rexmit or abort.
1342  * Return of zero means we should abort.
1343  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1344  *
1345  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1346  * the packet.
1347  * NOTE: This routine does not consume mp.
1348  */
1349 uint32_t
1350 nce_solicit(nce_t *nce, mblk_t *mp)
1351 {
1352 	ill_t		*ill;
1353 	ill_t		*src_ill;
1354 	ip6_t		*ip6h;
1355 	in6_addr_t	src;
1356 	in6_addr_t	dst;
1357 	ipif_t		*ipif;
1358 	ip6i_t		*ip6i;
1359 	boolean_t	dropped = B_FALSE;
1360 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
1361 
1362 	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
1363 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1364 	ill = nce->nce_ill;
1365 	ASSERT(ill != NULL);
1366 
1367 	if (nce->nce_rcnt == 0) {
1368 		return (0);
1369 	}
1370 
1371 	if (mp == NULL) {
1372 		ASSERT(nce->nce_qd_mp != NULL);
1373 		mp = nce->nce_qd_mp;
1374 	} else {
1375 		nce_queue_mp(nce, mp);
1376 	}
1377 
1378 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1379 	if (mp->b_datap->db_type == M_CTL)
1380 		mp = mp->b_cont;
1381 
1382 	ip6h = (ip6_t *)mp->b_rptr;
1383 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1384 		/*
1385 		 * This message should have been pulled up already in
1386 		 * ip_wput_v6. We can't do pullups here because the message
1387 		 * could be from the nce_qd_mp which could have b_next/b_prev
1388 		 * non-NULL.
1389 		 */
1390 		ip6i = (ip6i_t *)ip6h;
1391 		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
1392 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
1393 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1394 	}
1395 	src = ip6h->ip6_src;
1396 	/*
1397 	 * If the src of outgoing packet is one of the assigned interface
1398 	 * addresses use it, otherwise we will pick the source address below.
1399 	 */
1400 	src_ill = ill;
1401 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1402 		if (ill->ill_group != NULL)
1403 			src_ill = ill->ill_group->illgrp_ill;
1404 		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
1405 			for (ipif = src_ill->ill_ipif; ipif != NULL;
1406 			    ipif = ipif->ipif_next) {
1407 				if (IN6_ARE_ADDR_EQUAL(&src,
1408 				    &ipif->ipif_v6lcl_addr)) {
1409 					break;
1410 				}
1411 			}
1412 			if (ipif != NULL)
1413 				break;
1414 		}
1415 		/*
1416 		 * If no relevant ipif can be found, then it's not one of our
1417 		 * addresses.  Reset to :: and let nce_xmit.  If an ipif can be
1418 		 * found, but it's not yet done with DAD verification, then
1419 		 * just postpone this transmission until later.
1420 		 */
1421 		if (src_ill == NULL)
1422 			src = ipv6_all_zeros;
1423 		else if (!ipif->ipif_addr_ready)
1424 			return (ill->ill_reachable_retrans_time);
1425 	}
1426 	dst = nce->nce_addr;
1427 	/*
1428 	 * If source address is unspecified, nce_xmit will choose
1429 	 * one for us and initialize the hardware address also
1430 	 * appropriately.
1431 	 */
1432 	if (IN6_IS_ADDR_UNSPECIFIED(&src))
1433 		src_ill = NULL;
1434 	nce->nce_rcnt--;
1435 	mutex_exit(&nce->nce_lock);
1436 	rw_exit(&ipst->ips_ill_g_lock);
1437 	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
1438 	    &dst, 0);
1439 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1440 	mutex_enter(&nce->nce_lock);
1441 	if (dropped)
1442 		nce->nce_rcnt++;
1443 	return (ill->ill_reachable_retrans_time);
1444 }
1445 
1446 /*
1447  * Attempt to recover an address on an interface that's been marked as a
1448  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1449  * no easy way to just probe the address and have the right thing happen if
1450  * it's no longer in use.  Instead, we just bring it up normally and allow the
1451  * regular interface start-up logic to probe for a remaining duplicate and take
1452  * us back down if necessary.
1453  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1454  * ip_ndp_excl.
1455  */
1456 /* ARGSUSED */
1457 static void
1458 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1459 {
1460 	ill_t	*ill = rq->q_ptr;
1461 	ipif_t	*ipif;
1462 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1463 
1464 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1465 		/*
1466 		 * We do not support recovery of proxy ARP'd interfaces,
1467 		 * because the system lacks a complete proxy ARP mechanism.
1468 		 */
1469 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1470 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1471 			continue;
1472 		}
1473 
1474 		/*
1475 		 * If we have already recovered or if the interface is going
1476 		 * away, then ignore.
1477 		 */
1478 		mutex_enter(&ill->ill_lock);
1479 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1480 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1481 			mutex_exit(&ill->ill_lock);
1482 			continue;
1483 		}
1484 
1485 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1486 		ill->ill_ipif_dup_count--;
1487 		mutex_exit(&ill->ill_lock);
1488 		ipif->ipif_was_dup = B_TRUE;
1489 
1490 		if (ipif_ndp_up(ipif) != EINPROGRESS)
1491 			(void) ipif_up_done_v6(ipif);
1492 	}
1493 	freeb(mp);
1494 }
1495 
1496 /*
1497  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1498  * As long as someone else holds the address, the interface will stay down.
1499  * When that conflict goes away, the interface is brought back up.  This is
1500  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1501  * server will recover from a failure.
1502  *
1503  * For DHCP and temporary addresses, recovery is not done in the kernel.
1504  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1505  *
1506  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1507  */
1508 static void
1509 ipif6_dup_recovery(void *arg)
1510 {
1511 	ipif_t *ipif = arg;
1512 
1513 	ipif->ipif_recovery_id = 0;
1514 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1515 		return;
1516 
1517 	/*
1518 	 * No lock, because this is just an optimization.
1519 	 */
1520 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1521 		return;
1522 
1523 	/* If the link is down, we'll retry this later */
1524 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1525 		return;
1526 
1527 	ndp_do_recovery(ipif);
1528 }
1529 
1530 /*
1531  * Perform interface recovery by forcing the duplicate interfaces up and
1532  * allowing the system to determine which ones should stay up.
1533  *
1534  * Called both by recovery timer expiry and link-up notification.
1535  */
1536 void
1537 ndp_do_recovery(ipif_t *ipif)
1538 {
1539 	ill_t *ill = ipif->ipif_ill;
1540 	mblk_t *mp;
1541 	ip_stack_t *ipst = ill->ill_ipst;
1542 
1543 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1544 	if (mp == NULL) {
1545 		mutex_enter(&ill->ill_lock);
1546 		if (ipif->ipif_recovery_id == 0 &&
1547 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1548 		    IPIF_CONDEMNED))) {
1549 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1550 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1551 		}
1552 		mutex_exit(&ill->ill_lock);
1553 	} else {
1554 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1555 		    sizeof (ipif->ipif_v6lcl_addr));
1556 		ill_refhold(ill);
1557 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1558 		    B_FALSE);
1559 	}
1560 }
1561 
1562 /*
1563  * Find the solicitation in the given message, and extract printable details
1564  * (MAC and IP addresses) from it.
1565  */
1566 static nd_neighbor_solicit_t *
1567 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1568     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1569 {
1570 	nd_neighbor_solicit_t *ns;
1571 	ip6_t *ip6h;
1572 	uchar_t *addr;
1573 	int alen;
1574 
1575 	alen = 0;
1576 	ip6h = (ip6_t *)mp->b_rptr;
1577 	if (dl_mp == NULL) {
1578 		nd_opt_hdr_t *opt;
1579 		int nslen;
1580 
1581 		/*
1582 		 * If it's from the fast-path, then it can't be a probe
1583 		 * message, and thus must include the source linkaddr option.
1584 		 * Extract that here.
1585 		 */
1586 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1587 		nslen = mp->b_wptr - (uchar_t *)ns;
1588 		if ((nslen -= sizeof (*ns)) > 0) {
1589 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1590 			    ND_OPT_SOURCE_LINKADDR);
1591 			if (opt != NULL &&
1592 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1593 			    ill->ill_nd_lla_len) {
1594 				addr = (uchar_t *)(opt + 1);
1595 				alen = ill->ill_nd_lla_len;
1596 			}
1597 		}
1598 		/*
1599 		 * We cheat a bit here for the sake of printing usable log
1600 		 * messages in the rare case where the reply we got was unicast
1601 		 * without a source linkaddr option, and the interface is in
1602 		 * fastpath mode.  (Sigh.)
1603 		 */
1604 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1605 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1606 			struct ether_header *pether;
1607 
1608 			pether = (struct ether_header *)((char *)ip6h -
1609 			    sizeof (*pether));
1610 			addr = pether->ether_shost.ether_addr_octet;
1611 			alen = ETHERADDRL;
1612 		}
1613 	} else {
1614 		dl_unitdata_ind_t *dlu;
1615 
1616 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1617 		alen = dlu->dl_src_addr_length;
1618 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1619 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1620 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1621 			if (ill->ill_sap_length < 0) {
1622 				alen += ill->ill_sap_length;
1623 			} else {
1624 				addr += ill->ill_sap_length;
1625 				alen -= ill->ill_sap_length;
1626 			}
1627 		}
1628 	}
1629 	if (alen > 0) {
1630 		*haddr = addr;
1631 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1632 	} else {
1633 		*haddr = NULL;
1634 		(void) strcpy(hbuf, "?");
1635 	}
1636 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1637 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1638 	return (ns);
1639 }
1640 
1641 /*
1642  * This is for exclusive changes due to NDP duplicate address detection
1643  * failure.
1644  */
1645 /* ARGSUSED */
1646 static void
1647 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1648 {
1649 	ill_t	*ill = rq->q_ptr;
1650 	ipif_t	*ipif;
1651 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1652 	char hbuf[MAC_STR_LEN];
1653 	char sbuf[INET6_ADDRSTRLEN];
1654 	nd_neighbor_solicit_t *ns;
1655 	mblk_t *dl_mp = NULL;
1656 	uchar_t *haddr;
1657 	ip_stack_t *ipst = ill->ill_ipst;
1658 
1659 	if (DB_TYPE(mp) != M_DATA) {
1660 		dl_mp = mp;
1661 		mp = mp->b_cont;
1662 	}
1663 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1664 	    sizeof (sbuf), &haddr);
1665 	if (haddr != NULL &&
1666 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1667 		/*
1668 		 * Ignore conflicts generated by misbehaving switches that just
1669 		 * reflect our own messages back to us.
1670 		 */
1671 		goto ignore_conflict;
1672 	}
1673 
1674 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1675 
1676 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1677 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1678 		    &ns->nd_ns_target)) {
1679 			continue;
1680 		}
1681 
1682 		/* If it's already marked, then don't do anything. */
1683 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1684 			continue;
1685 
1686 		/*
1687 		 * If this is a failure during duplicate recovery, then don't
1688 		 * complain.  It may take a long time to recover.
1689 		 */
1690 		if (!ipif->ipif_was_dup) {
1691 			ipif_get_name(ipif, ibuf, sizeof (ibuf));
1692 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1693 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1694 		}
1695 		mutex_enter(&ill->ill_lock);
1696 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1697 		ipif->ipif_flags |= IPIF_DUPLICATE;
1698 		ill->ill_ipif_dup_count++;
1699 		mutex_exit(&ill->ill_lock);
1700 		(void) ipif_down(ipif, NULL, NULL);
1701 		ipif_down_tail(ipif);
1702 		mutex_enter(&ill->ill_lock);
1703 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1704 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1705 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1706 		    IPIF_CONDEMNED)) &&
1707 		    ipst->ips_ip_dup_recovery > 0) {
1708 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1709 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1710 		}
1711 		mutex_exit(&ill->ill_lock);
1712 	}
1713 ignore_conflict:
1714 	if (dl_mp != NULL)
1715 		freeb(dl_mp);
1716 	freemsg(mp);
1717 }
1718 
1719 /*
1720  * Handle failure by tearing down the ipifs with the specified address.  Note
1721  * that tearing down the ipif also means deleting the nce through ipif_down, so
1722  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1723  * we start a timer on the ipif.
1724  */
1725 static void
1726 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1727 {
1728 	if ((mp = copymsg(mp)) != NULL) {
1729 		if (dl_mp == NULL)
1730 			dl_mp = mp;
1731 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1732 			dl_mp->b_cont = mp;
1733 		if (dl_mp == NULL) {
1734 			freemsg(mp);
1735 		} else {
1736 			ill_refhold(ill);
1737 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1738 			    B_FALSE);
1739 		}
1740 	}
1741 	ndp_delete(nce);
1742 }
1743 
1744 /*
1745  * Handle a discovered conflict: some other system is advertising that it owns
1746  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1747  * interface.
1748  */
1749 static void
1750 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1751 {
1752 	ipif_t *ipif;
1753 	uint32_t now;
1754 	uint_t maxdefense;
1755 	uint_t defs;
1756 	ip_stack_t *ipst = ill->ill_ipst;
1757 
1758 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1759 	    NULL, NULL, ipst);
1760 	if (ipif == NULL)
1761 		return;
1762 	/*
1763 	 * First, figure out if this address is disposable.
1764 	 */
1765 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1766 		maxdefense = ipst->ips_ip_max_temp_defend;
1767 	else
1768 		maxdefense = ipst->ips_ip_max_defend;
1769 
1770 	/*
1771 	 * Now figure out how many times we've defended ourselves.  Ignore
1772 	 * defenses that happened long in the past.
1773 	 */
1774 	now = gethrestime_sec();
1775 	mutex_enter(&nce->nce_lock);
1776 	if ((defs = nce->nce_defense_count) > 0 &&
1777 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1778 		nce->nce_defense_count = defs = 0;
1779 	}
1780 	nce->nce_defense_count++;
1781 	nce->nce_defense_time = now;
1782 	mutex_exit(&nce->nce_lock);
1783 	ipif_refrele(ipif);
1784 
1785 	/*
1786 	 * If we've defended ourselves too many times already, then give up and
1787 	 * tear down the interface(s) using this address.  Otherwise, defend by
1788 	 * sending out an unsolicited Neighbor Advertisement.
1789 	 */
1790 	if (defs >= maxdefense) {
1791 		ip_ndp_failure(ill, mp, dl_mp, nce);
1792 	} else {
1793 		char hbuf[MAC_STR_LEN];
1794 		char sbuf[INET6_ADDRSTRLEN];
1795 		uchar_t *haddr;
1796 
1797 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1798 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1799 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1800 		    hbuf, sbuf, ill->ill_name);
1801 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1802 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1803 		    nce_advert_flags(nce));
1804 	}
1805 }
1806 
1807 static void
1808 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1809 {
1810 	nd_neighbor_solicit_t *ns;
1811 	uint32_t	hlen = ill->ill_nd_lla_len;
1812 	uchar_t		*haddr = NULL;
1813 	icmp6_t		*icmp_nd;
1814 	ip6_t		*ip6h;
1815 	nce_t		*our_nce = NULL;
1816 	in6_addr_t	target;
1817 	in6_addr_t	src;
1818 	int		len;
1819 	int		flag = 0;
1820 	nd_opt_hdr_t	*opt = NULL;
1821 	boolean_t	bad_solicit = B_FALSE;
1822 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1823 
1824 	ip6h = (ip6_t *)mp->b_rptr;
1825 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1826 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1827 	src = ip6h->ip6_src;
1828 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1829 	target = ns->nd_ns_target;
1830 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1831 		if (ip_debug > 2) {
1832 			/* ip1dbg */
1833 			pr_addr_dbg("ndp_input_solicit: Target is"
1834 			    " multicast! %s\n", AF_INET6, &target);
1835 		}
1836 		bad_solicit = B_TRUE;
1837 		goto done;
1838 	}
1839 	if (len > sizeof (nd_neighbor_solicit_t)) {
1840 		/* Options present */
1841 		opt = (nd_opt_hdr_t *)&ns[1];
1842 		len -= sizeof (nd_neighbor_solicit_t);
1843 		if (!ndp_verify_optlen(opt, len)) {
1844 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1845 			bad_solicit = B_TRUE;
1846 			goto done;
1847 		}
1848 	}
1849 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1850 		/* Check to see if this is a valid DAD solicitation */
1851 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1852 			if (ip_debug > 2) {
1853 				/* ip1dbg */
1854 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1855 				    "Destination is not solicited node "
1856 				    "multicast %s\n", AF_INET6,
1857 				    &ip6h->ip6_dst);
1858 			}
1859 			bad_solicit = B_TRUE;
1860 			goto done;
1861 		}
1862 	}
1863 
1864 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1865 	/*
1866 	 * If this is a valid Solicitation, a permanent
1867 	 * entry should exist in the cache
1868 	 */
1869 	if (our_nce == NULL ||
1870 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1871 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1872 		    "ifname=%s ", ill->ill_name));
1873 		if (ip_debug > 2) {
1874 			/* ip1dbg */
1875 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1876 		}
1877 		bad_solicit = B_TRUE;
1878 		goto done;
1879 	}
1880 
1881 	/* At this point we should have a verified NS per spec */
1882 	if (opt != NULL) {
1883 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1884 		if (opt != NULL) {
1885 			haddr = (uchar_t *)&opt[1];
1886 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1887 			    hlen == 0) {
1888 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1889 				bad_solicit = B_TRUE;
1890 				goto done;
1891 			}
1892 		}
1893 	}
1894 
1895 	/* If sending directly to peer, set the unicast flag */
1896 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1897 		flag |= NDP_UNICAST;
1898 
1899 	/*
1900 	 * Create/update the entry for the soliciting node.
1901 	 * or respond to outstanding queries, don't if
1902 	 * the source is unspecified address.
1903 	 */
1904 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1905 		int	err;
1906 		nce_t	*nnce;
1907 
1908 		ASSERT(ill->ill_isv6);
1909 		/*
1910 		 * Regular solicitations *must* include the Source Link-Layer
1911 		 * Address option.  Ignore messages that do not.
1912 		 */
1913 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1914 			ip1dbg(("ndp_input_solicit: source link-layer address "
1915 			    "option missing with a specified source.\n"));
1916 			bad_solicit = B_TRUE;
1917 			goto done;
1918 		}
1919 
1920 		/*
1921 		 * This is a regular solicitation.  If we're still in the
1922 		 * process of verifying the address, then don't respond at all
1923 		 * and don't keep track of the sender.
1924 		 */
1925 		if (our_nce->nce_state == ND_PROBE)
1926 			goto done;
1927 
1928 		/*
1929 		 * If the solicitation doesn't have sender hardware address
1930 		 * (legal for unicast solicitation), then process without
1931 		 * installing the return NCE.  Either we already know it, or
1932 		 * we'll be forced to look it up when (and if) we reply to the
1933 		 * packet.
1934 		 */
1935 		if (haddr == NULL)
1936 			goto no_source;
1937 
1938 		err = ndp_lookup_then_add_v6(ill,
1939 		    haddr,
1940 		    &src,	/* Soliciting nodes address */
1941 		    &ipv6_all_ones,
1942 		    &ipv6_all_zeros,
1943 		    0,
1944 		    0,
1945 		    ND_STALE,
1946 		    &nnce);
1947 		switch (err) {
1948 		case 0:
1949 			/* done with this entry */
1950 			NCE_REFRELE(nnce);
1951 			break;
1952 		case EEXIST:
1953 			/*
1954 			 * B_FALSE indicates this is not an
1955 			 * an advertisement.
1956 			 */
1957 			ndp_process(nnce, haddr, 0, B_FALSE);
1958 			NCE_REFRELE(nnce);
1959 			break;
1960 		default:
1961 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1962 			    err));
1963 			goto done;
1964 		}
1965 no_source:
1966 		flag |= NDP_SOLICITED;
1967 	} else {
1968 		/*
1969 		 * No source link layer address option should be present in a
1970 		 * valid DAD request.
1971 		 */
1972 		if (haddr != NULL) {
1973 			ip1dbg(("ndp_input_solicit: source link-layer address "
1974 			    "option present with an unspecified source.\n"));
1975 			bad_solicit = B_TRUE;
1976 			goto done;
1977 		}
1978 		if (our_nce->nce_state == ND_PROBE) {
1979 			/*
1980 			 * Internally looped-back probes won't have DLPI
1981 			 * attached to them.  External ones (which are sent by
1982 			 * multicast) always will.  Just ignore our own
1983 			 * transmissions.
1984 			 */
1985 			if (dl_mp != NULL) {
1986 				/*
1987 				 * If someone else is probing our address, then
1988 				 * we've crossed wires.  Declare failure.
1989 				 */
1990 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
1991 			}
1992 			goto done;
1993 		}
1994 		/*
1995 		 * This is a DAD probe.  Multicast the advertisement to the
1996 		 * all-nodes address.
1997 		 */
1998 		src = ipv6_all_hosts_mcast;
1999 	}
2000 	flag |= nce_advert_flags(our_nce);
2001 	/* Response to a solicitation */
2002 	(void) nce_xmit(ill,
2003 	    ND_NEIGHBOR_ADVERT,
2004 	    ill,	/* ill to be used for extracting ill_nd_lla */
2005 	    B_TRUE,	/* use ill_nd_lla */
2006 	    &target,	/* Source and target of the advertisement pkt */
2007 	    &src,	/* IP Destination (source of original pkt) */
2008 	    flag);
2009 done:
2010 	if (bad_solicit)
2011 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2012 	if (our_nce != NULL)
2013 		NCE_REFRELE(our_nce);
2014 }
2015 
2016 void
2017 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2018 {
2019 	nd_neighbor_advert_t *na;
2020 	uint32_t	hlen = ill->ill_nd_lla_len;
2021 	uchar_t		*haddr = NULL;
2022 	icmp6_t		*icmp_nd;
2023 	ip6_t		*ip6h;
2024 	nce_t		*dst_nce = NULL;
2025 	in6_addr_t	target;
2026 	nd_opt_hdr_t	*opt = NULL;
2027 	int		len;
2028 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2029 	ip_stack_t	*ipst = ill->ill_ipst;
2030 
2031 	ip6h = (ip6_t *)mp->b_rptr;
2032 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2033 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2034 	na = (nd_neighbor_advert_t *)icmp_nd;
2035 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2036 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2037 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2038 		    "solicited flag is not zero\n"));
2039 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2040 		return;
2041 	}
2042 	target = na->nd_na_target;
2043 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2044 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2045 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2046 		return;
2047 	}
2048 	if (len > sizeof (nd_neighbor_advert_t)) {
2049 		opt = (nd_opt_hdr_t *)&na[1];
2050 		if (!ndp_verify_optlen(opt,
2051 		    len - sizeof (nd_neighbor_advert_t))) {
2052 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2053 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2054 			return;
2055 		}
2056 		/* At this point we have a verified NA per spec */
2057 		len -= sizeof (nd_neighbor_advert_t);
2058 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2059 		if (opt != NULL) {
2060 			haddr = (uchar_t *)&opt[1];
2061 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2062 			    hlen == 0) {
2063 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2064 				BUMP_MIB(mib,
2065 				    ipv6IfIcmpInBadNeighborAdvertisements);
2066 				return;
2067 			}
2068 		}
2069 	}
2070 
2071 	/*
2072 	 * If this interface is part of the group look at all the
2073 	 * ills in the group.
2074 	 */
2075 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2076 	if (ill->ill_group != NULL)
2077 		ill = ill->ill_group->illgrp_ill;
2078 
2079 	for (; ill != NULL; ill = ill->ill_group_next) {
2080 		mutex_enter(&ill->ill_lock);
2081 		if (!ILL_CAN_LOOKUP(ill)) {
2082 			mutex_exit(&ill->ill_lock);
2083 			continue;
2084 		}
2085 		ill_refhold_locked(ill);
2086 		mutex_exit(&ill->ill_lock);
2087 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2088 		/* We have to drop the lock since ndp_process calls put* */
2089 		rw_exit(&ipst->ips_ill_g_lock);
2090 		if (dst_nce != NULL) {
2091 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2092 			    dst_nce->nce_state == ND_PROBE) {
2093 				/*
2094 				 * Someone else sent an advertisement for an
2095 				 * address that we're trying to configure.
2096 				 * Tear it down.  Note that dl_mp might be NULL
2097 				 * if we're getting a unicast reply.  This
2098 				 * isn't typically done (multicast is the norm
2099 				 * in response to a probe), but ip_ndp_failure
2100 				 * will handle the dl_mp == NULL case as well.
2101 				 */
2102 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2103 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2104 				/*
2105 				 * Someone just announced one of our local
2106 				 * addresses.  If it wasn't us, then this is a
2107 				 * conflict.  Defend the address or shut it
2108 				 * down.
2109 				 */
2110 				if (dl_mp != NULL &&
2111 				    (haddr == NULL ||
2112 				    nce_cmp_ll_addr(dst_nce, haddr,
2113 				    ill->ill_nd_lla_len))) {
2114 					ip_ndp_conflict(ill, mp, dl_mp,
2115 					    dst_nce);
2116 				}
2117 			} else {
2118 				if (na->nd_na_flags_reserved &
2119 				    ND_NA_FLAG_ROUTER) {
2120 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2121 				}
2122 				/* B_TRUE indicates this an advertisement */
2123 				ndp_process(dst_nce, haddr,
2124 				    na->nd_na_flags_reserved, B_TRUE);
2125 			}
2126 			NCE_REFRELE(dst_nce);
2127 		}
2128 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2129 		ill_refrele(ill);
2130 	}
2131 	rw_exit(&ipst->ips_ill_g_lock);
2132 }
2133 
2134 /*
2135  * Process NDP neighbor solicitation/advertisement messages.
2136  * The checksum has already checked o.k before reaching here.
2137  */
2138 void
2139 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2140 {
2141 	icmp6_t		*icmp_nd;
2142 	ip6_t		*ip6h;
2143 	int		len;
2144 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2145 
2146 
2147 	if (!pullupmsg(mp, -1)) {
2148 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2149 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2150 		goto done;
2151 	}
2152 	ip6h = (ip6_t *)mp->b_rptr;
2153 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2154 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2155 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2156 		goto done;
2157 	}
2158 	/*
2159 	 * NDP does not accept any extension headers between the
2160 	 * IP header and the ICMP header since e.g. a routing
2161 	 * header could be dangerous.
2162 	 * This assumes that any AH or ESP headers are removed
2163 	 * by ip prior to passing the packet to ndp_input.
2164 	 */
2165 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2166 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2167 		    ip6h->ip6_nxt));
2168 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2169 		goto done;
2170 	}
2171 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2172 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2173 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2174 	if (icmp_nd->icmp6_code != 0) {
2175 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2176 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2177 		goto done;
2178 	}
2179 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2180 	/*
2181 	 * Make sure packet length is large enough for either
2182 	 * a NS or a NA icmp packet.
2183 	 */
2184 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2185 		ip1dbg(("ndp_input: packet too short\n"));
2186 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2187 		goto done;
2188 	}
2189 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2190 		ndp_input_solicit(ill, mp, dl_mp);
2191 	} else {
2192 		ndp_input_advert(ill, mp, dl_mp);
2193 	}
2194 done:
2195 	freemsg(mp);
2196 }
2197 
2198 /*
2199  * nce_xmit is called to form and transmit a ND solicitation or
2200  * advertisement ICMP packet.
2201  *
2202  * If the source address is unspecified and this isn't a probe (used for
2203  * duplicate address detection), an appropriate source address and link layer
2204  * address will be chosen here.  The link layer address option is included if
2205  * the source is specified (i.e., all non-probe packets), and omitted (per the
2206  * specification) otherwise.
2207  *
2208  * It returns B_FALSE only if it does a successful put() to the
2209  * corresponding ill's ill_wq otherwise returns B_TRUE.
2210  */
2211 static boolean_t
2212 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2213     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2214     int flag)
2215 {
2216 	uint32_t	len;
2217 	icmp6_t 	*icmp6;
2218 	mblk_t		*mp;
2219 	ip6_t		*ip6h;
2220 	nd_opt_hdr_t	*opt;
2221 	uint_t		plen;
2222 	ip6i_t		*ip6i;
2223 	ipif_t		*src_ipif = NULL;
2224 	uint8_t		*hw_addr;
2225 	zoneid_t	zoneid = GLOBAL_ZONEID;
2226 
2227 	/*
2228 	 * If we have a unspecified source(sender) address, select a
2229 	 * proper source address for the solicitation here itself so
2230 	 * that we can initialize the h/w address correctly. This is
2231 	 * needed for interface groups as source address can come from
2232 	 * the whole group and the h/w address initialized from ill will
2233 	 * be wrong if the source address comes from a different ill.
2234 	 *
2235 	 * If the sender is specified then we use this address in order
2236 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2237 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2238 	 * by IP (we cannot guarantee that the global zone has an interface
2239 	 * route to the destination).
2240 	 *
2241 	 * Note that the NA never comes here with the unspecified source
2242 	 * address. The following asserts that whenever the source
2243 	 * address is specified, the haddr also should be specified.
2244 	 */
2245 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2246 
2247 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2248 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2249 		/*
2250 		 * Pick a source address for this solicitation, but
2251 		 * restrict the selection to addresses assigned to the
2252 		 * output interface (or interface group).  We do this
2253 		 * because the destination will create a neighbor cache
2254 		 * entry for the source address of this packet, so the
2255 		 * source address had better be a valid neighbor.
2256 		 */
2257 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2258 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2259 		if (src_ipif == NULL) {
2260 			char buf[INET6_ADDRSTRLEN];
2261 
2262 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2263 			    inet_ntop(AF_INET6, (char *)target, buf,
2264 			    sizeof (buf))));
2265 			return (B_TRUE);
2266 		}
2267 		sender = &src_ipif->ipif_v6src_addr;
2268 		hwaddr_ill = src_ipif->ipif_ill;
2269 	} else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2270 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
2271 		/*
2272 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2273 		 * ALL_ZONES if it cannot find a matching ipif for the address
2274 		 * we are trying to use. In this case we err on the side of
2275 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2276 		 */
2277 		if (zoneid == ALL_ZONES)
2278 			zoneid = GLOBAL_ZONEID;
2279 	}
2280 
2281 	/*
2282 	 * Always make sure that the NS/NA packets don't get load
2283 	 * spread. This is needed so that the probe packets sent
2284 	 * by the in.mpathd daemon can really go out on the desired
2285 	 * interface. Probe packets are made to go out on a desired
2286 	 * interface by including a ip6i with ATTACH_IF flag. As these
2287 	 * packets indirectly end up sending/receiving NS/NA packets
2288 	 * (neighbor doing NUD), we have to make sure that NA
2289 	 * also go out on the same interface.
2290 	 */
2291 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2292 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2293 	    plen * 8;
2294 	mp = allocb(len,  BPRI_LO);
2295 	if (mp == NULL) {
2296 		if (src_ipif != NULL)
2297 			ipif_refrele(src_ipif);
2298 		return (B_TRUE);
2299 	}
2300 	bzero((char *)mp->b_rptr, len);
2301 	mp->b_wptr = mp->b_rptr + len;
2302 
2303 	ip6i = (ip6i_t *)mp->b_rptr;
2304 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2305 	ip6i->ip6i_nxt = IPPROTO_RAW;
2306 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2307 	if (flag & NDP_PROBE)
2308 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2309 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2310 
2311 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2312 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2313 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2314 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2315 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2316 	ip6h->ip6_dst = *target;
2317 	icmp6 = (icmp6_t *)&ip6h[1];
2318 
2319 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2320 	    sizeof (nd_neighbor_advert_t));
2321 
2322 	if (operation == ND_NEIGHBOR_SOLICIT) {
2323 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2324 
2325 		if (!(flag & NDP_PROBE))
2326 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2327 		ip6h->ip6_src = *sender;
2328 		ns->nd_ns_target = *target;
2329 		if (!(flag & NDP_UNICAST)) {
2330 			/* Form multicast address of the target */
2331 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2332 			ip6h->ip6_dst.s6_addr32[3] |=
2333 			    ns->nd_ns_target.s6_addr32[3];
2334 		}
2335 	} else {
2336 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2337 
2338 		ASSERT(!(flag & NDP_PROBE));
2339 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2340 		ip6h->ip6_src = *sender;
2341 		na->nd_na_target = *sender;
2342 		if (flag & NDP_ISROUTER)
2343 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2344 		if (flag & NDP_SOLICITED)
2345 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2346 		if (flag & NDP_ORIDE)
2347 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2348 	}
2349 
2350 	hw_addr = NULL;
2351 	if (!(flag & NDP_PROBE)) {
2352 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2353 		    hwaddr_ill->ill_phys_addr;
2354 		if (hw_addr != NULL) {
2355 			/* Fill in link layer address and option len */
2356 			opt->nd_opt_len = (uint8_t)plen;
2357 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2358 		}
2359 	}
2360 	if (hw_addr == NULL) {
2361 		/* If there's no link layer address option, then strip it. */
2362 		len -= plen * 8;
2363 		mp->b_wptr = mp->b_rptr + len;
2364 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2365 	}
2366 
2367 	icmp6->icmp6_type = (uint8_t)operation;
2368 	icmp6->icmp6_code = 0;
2369 	/*
2370 	 * Prepare for checksum by putting icmp length in the icmp
2371 	 * checksum field. The checksum is calculated in ip_wput_v6.
2372 	 */
2373 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2374 
2375 	if (src_ipif != NULL)
2376 		ipif_refrele(src_ipif);
2377 
2378 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2379 	return (B_FALSE);
2380 }
2381 
2382 /*
2383  * Make a link layer address (does not include the SAP) from an nce.
2384  * To form the link layer address, use the last four bytes of ipv6
2385  * address passed in and the fixed offset stored in nce.
2386  */
2387 static void
2388 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2389 {
2390 	uchar_t *mask, *to;
2391 	ill_t	*ill = nce->nce_ill;
2392 	int 	len;
2393 
2394 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2395 		return;
2396 	ASSERT(nce->nce_res_mp != NULL);
2397 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2398 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2399 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2400 	ASSERT(addr != NULL);
2401 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2402 	    addrpos, ill->ill_nd_lla_len);
2403 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2404 	    IPV6_ADDR_LEN);
2405 	mask = (uchar_t *)&nce->nce_extract_mask;
2406 	mask += (IPV6_ADDR_LEN - len);
2407 	addr += (IPV6_ADDR_LEN - len);
2408 	to = addrpos + nce->nce_ll_extract_start;
2409 	while (len-- > 0)
2410 		*to++ |= *mask++ & *addr++;
2411 }
2412 
2413 mblk_t *
2414 nce_udreq_alloc(ill_t *ill)
2415 {
2416 	mblk_t	*template_mp = NULL;
2417 	dl_unitdata_req_t *dlur;
2418 	int	sap_length;
2419 
2420 	ASSERT(ill->ill_isv6);
2421 
2422 	sap_length = ill->ill_sap_length;
2423 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2424 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2425 	if (template_mp == NULL)
2426 		return (NULL);
2427 
2428 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2429 	dlur->dl_priority.dl_min = 0;
2430 	dlur->dl_priority.dl_max = 0;
2431 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2432 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2433 
2434 	/* Copy in the SAP value. */
2435 	NCE_LL_SAP_COPY(ill, template_mp);
2436 
2437 	return (template_mp);
2438 }
2439 
2440 /*
2441  * NDP retransmit timer.
2442  * This timer goes off when:
2443  * a. It is time to retransmit NS for resolver.
2444  * b. It is time to send reachability probes.
2445  */
2446 void
2447 ndp_timer(void *arg)
2448 {
2449 	nce_t		*nce = arg;
2450 	ill_t		*ill = nce->nce_ill;
2451 	uint32_t	ms;
2452 	char		addrbuf[INET6_ADDRSTRLEN];
2453 	mblk_t		*mp;
2454 	boolean_t	dropped = B_FALSE;
2455 	ip_stack_t	*ipst = ill->ill_ipst;
2456 
2457 	/*
2458 	 * The timer has to be cancelled by ndp_delete before doing the final
2459 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2460 	 * until it clears the timeout_id. Before clearing the timeout_id
2461 	 * bump up the refcnt so that we can continue to use the nce
2462 	 */
2463 	ASSERT(nce != NULL);
2464 
2465 	/*
2466 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2467 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2468 	 */
2469 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2470 	mutex_enter(&nce->nce_lock);
2471 	NCE_REFHOLD_LOCKED(nce);
2472 	nce->nce_timeout_id = 0;
2473 
2474 	/*
2475 	 * Check the reachability state first.
2476 	 */
2477 	switch (nce->nce_state) {
2478 	case ND_DELAY:
2479 		rw_exit(&ipst->ips_ill_g_lock);
2480 		nce->nce_state = ND_PROBE;
2481 		mutex_exit(&nce->nce_lock);
2482 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2483 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2484 		if (ip_debug > 3) {
2485 			/* ip2dbg */
2486 			pr_addr_dbg("ndp_timer: state for %s changed "
2487 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2488 		}
2489 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2490 		NCE_REFRELE(nce);
2491 		return;
2492 	case ND_PROBE:
2493 		/* must be retransmit timer */
2494 		rw_exit(&ipst->ips_ill_g_lock);
2495 		nce->nce_pcnt--;
2496 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2497 		    nce->nce_pcnt >= -1);
2498 		if (nce->nce_pcnt > 0) {
2499 			/*
2500 			 * As per RFC2461, the nce gets deleted after
2501 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2502 			 * Note that the first unicast solicitation is sent
2503 			 * during the DELAY state.
2504 			 */
2505 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2506 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2507 			    addrbuf, sizeof (addrbuf))));
2508 			mutex_exit(&nce->nce_lock);
2509 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2510 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2511 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2512 			    NDP_UNICAST);
2513 			if (dropped) {
2514 				mutex_enter(&nce->nce_lock);
2515 				nce->nce_pcnt++;
2516 				mutex_exit(&nce->nce_lock);
2517 			}
2518 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2519 		} else if (nce->nce_pcnt < 0) {
2520 			/* No hope, delete the nce */
2521 			nce->nce_state = ND_UNREACHABLE;
2522 			mutex_exit(&nce->nce_lock);
2523 			if (ip_debug > 2) {
2524 				/* ip1dbg */
2525 				pr_addr_dbg("ndp_timer: Delete IRE for"
2526 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2527 			}
2528 			ndp_delete(nce);
2529 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2530 			/* Wait RetransTimer, before deleting the entry */
2531 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2532 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2533 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2534 			mutex_exit(&nce->nce_lock);
2535 			/* Wait one interval before killing */
2536 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2537 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2538 			ipif_t *ipif;
2539 
2540 			/*
2541 			 * We're done probing, and we can now declare this
2542 			 * address to be usable.  Let IP know that it's ok to
2543 			 * use.
2544 			 */
2545 			nce->nce_state = ND_REACHABLE;
2546 			mutex_exit(&nce->nce_lock);
2547 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2548 			    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
2549 			if (ipif != NULL) {
2550 				if (ipif->ipif_was_dup) {
2551 					char ibuf[LIFNAMSIZ + 10];
2552 					char sbuf[INET6_ADDRSTRLEN];
2553 
2554 					ipif->ipif_was_dup = B_FALSE;
2555 					(void) inet_ntop(AF_INET6,
2556 					    &ipif->ipif_v6lcl_addr,
2557 					    sbuf, sizeof (sbuf));
2558 					ipif_get_name(ipif, ibuf,
2559 					    sizeof (ibuf));
2560 					cmn_err(CE_NOTE, "recovered address "
2561 					    "%s on %s", sbuf, ibuf);
2562 				}
2563 				if ((ipif->ipif_flags & IPIF_UP) &&
2564 				    !ipif->ipif_addr_ready) {
2565 					ip_rts_ifmsg(ipif);
2566 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2567 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2568 				}
2569 				ipif->ipif_addr_ready = 1;
2570 				ipif_refrele(ipif);
2571 			}
2572 			/* Begin defending our new address */
2573 			nce->nce_unsolicit_count = 0;
2574 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2575 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2576 			    nce_advert_flags(nce));
2577 			if (dropped) {
2578 				nce->nce_unsolicit_count = 1;
2579 				NDP_RESTART_TIMER(nce,
2580 				    ipst->ips_ip_ndp_unsolicit_interval);
2581 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2582 				NDP_RESTART_TIMER(nce,
2583 				    ipst->ips_ip_ndp_defense_interval);
2584 			}
2585 		} else {
2586 			/*
2587 			 * This is an address we're probing to be our own, but
2588 			 * the ill is down.  Wait until it comes back before
2589 			 * doing anything, but switch to reachable state so
2590 			 * that the restart will work.
2591 			 */
2592 			nce->nce_state = ND_REACHABLE;
2593 			mutex_exit(&nce->nce_lock);
2594 		}
2595 		NCE_REFRELE(nce);
2596 		return;
2597 	case ND_INCOMPLETE:
2598 		/*
2599 		 * Must be resolvers retransmit timer.
2600 		 */
2601 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2602 			ip6i_t	*ip6i;
2603 			ip6_t	*ip6h;
2604 			mblk_t *data_mp;
2605 
2606 			/*
2607 			 * Walk the list of packets queued, and see if there
2608 			 * are any multipathing probe packets. Such packets
2609 			 * are always queued at the head. Since this is a
2610 			 * retransmit timer firing, mark such packets as
2611 			 * delayed in ND resolution. This info will be used
2612 			 * in ip_wput_v6(). Multipathing probe packets will
2613 			 * always have an ip6i_t. Once we hit a packet without
2614 			 * it, we can break out of this loop.
2615 			 */
2616 			if (mp->b_datap->db_type == M_CTL)
2617 				data_mp = mp->b_cont;
2618 			else
2619 				data_mp = mp;
2620 
2621 			ip6h = (ip6_t *)data_mp->b_rptr;
2622 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2623 				break;
2624 
2625 			/*
2626 			 * This message should have been pulled up already in
2627 			 * ip_wput_v6. We can't do pullups here because the
2628 			 * b_next/b_prev is non-NULL.
2629 			 */
2630 			ip6i = (ip6i_t *)ip6h;
2631 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2632 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2633 
2634 			/* Mark this packet as delayed due to ND resolution */
2635 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2636 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2637 		}
2638 		if (nce->nce_qd_mp != NULL) {
2639 			ms = nce_solicit(nce, NULL);
2640 			rw_exit(&ipst->ips_ill_g_lock);
2641 			if (ms == 0) {
2642 				if (nce->nce_state != ND_REACHABLE) {
2643 					mutex_exit(&nce->nce_lock);
2644 					nce_resolv_failed(nce);
2645 					ndp_delete(nce);
2646 				} else {
2647 					mutex_exit(&nce->nce_lock);
2648 				}
2649 			} else {
2650 				mutex_exit(&nce->nce_lock);
2651 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2652 			}
2653 			NCE_REFRELE(nce);
2654 			return;
2655 		}
2656 		mutex_exit(&nce->nce_lock);
2657 		rw_exit(&ipst->ips_ill_g_lock);
2658 		NCE_REFRELE(nce);
2659 		break;
2660 	case ND_REACHABLE :
2661 		rw_exit(&ipst->ips_ill_g_lock);
2662 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2663 		    nce->nce_unsolicit_count != 0) ||
2664 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2665 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2666 			if (nce->nce_unsolicit_count > 0)
2667 				nce->nce_unsolicit_count--;
2668 			mutex_exit(&nce->nce_lock);
2669 			dropped = nce_xmit(ill,
2670 			    ND_NEIGHBOR_ADVERT,
2671 			    ill,	/* ill to be used for hw addr */
2672 			    B_FALSE,	/* use ill_phys_addr */
2673 			    &nce->nce_addr,
2674 			    &ipv6_all_hosts_mcast,
2675 			    nce_advert_flags(nce));
2676 			if (dropped) {
2677 				mutex_enter(&nce->nce_lock);
2678 				nce->nce_unsolicit_count++;
2679 				mutex_exit(&nce->nce_lock);
2680 			}
2681 			if (nce->nce_unsolicit_count != 0) {
2682 				NDP_RESTART_TIMER(nce,
2683 				    ipst->ips_ip_ndp_unsolicit_interval);
2684 			} else {
2685 				NDP_RESTART_TIMER(nce,
2686 				    ipst->ips_ip_ndp_defense_interval);
2687 			}
2688 		} else {
2689 			mutex_exit(&nce->nce_lock);
2690 		}
2691 		NCE_REFRELE(nce);
2692 		break;
2693 	default:
2694 		rw_exit(&ipst->ips_ill_g_lock);
2695 		mutex_exit(&nce->nce_lock);
2696 		NCE_REFRELE(nce);
2697 		break;
2698 	}
2699 }
2700 
2701 /*
2702  * Set a link layer address from the ll_addr passed in.
2703  * Copy SAP from ill.
2704  */
2705 static void
2706 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2707 {
2708 	ill_t	*ill = nce->nce_ill;
2709 	uchar_t	*woffset;
2710 
2711 	ASSERT(ll_addr != NULL);
2712 	/* Always called before fast_path_probe */
2713 	ASSERT(nce->nce_fp_mp == NULL);
2714 	if (ill->ill_sap_length != 0) {
2715 		/*
2716 		 * Copy the SAP type specified in the
2717 		 * request into the xmit template.
2718 		 */
2719 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2720 	}
2721 	if (ill->ill_phys_addr_length > 0) {
2722 		/*
2723 		 * The bcopy() below used to be called for the physical address
2724 		 * length rather than the link layer address length. For
2725 		 * ethernet and many other media, the phys_addr and lla are
2726 		 * identical.
2727 		 * However, with xresolv interfaces being introduced, the
2728 		 * phys_addr and lla are no longer the same, and the physical
2729 		 * address may not have any useful meaning, so we use the lla
2730 		 * for IPv6 address resolution and destination addressing.
2731 		 *
2732 		 * For PPP or other interfaces with a zero length
2733 		 * physical address, don't do anything here.
2734 		 * The bcopy() with a zero phys_addr length was previously
2735 		 * a no-op for interfaces with a zero-length physical address.
2736 		 * Using the lla for them would change the way they operate.
2737 		 * Doing nothing in such cases preserves expected behavior.
2738 		 */
2739 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2740 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2741 	}
2742 }
2743 
2744 static boolean_t
2745 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2746 {
2747 	ill_t	*ill = nce->nce_ill;
2748 	uchar_t	*ll_offset;
2749 
2750 	ASSERT(nce->nce_res_mp != NULL);
2751 	if (ll_addr == NULL)
2752 		return (B_FALSE);
2753 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2754 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2755 		return (B_TRUE);
2756 	return (B_FALSE);
2757 }
2758 
2759 /*
2760  * Updates the link layer address or the reachability state of
2761  * a cache entry.  Reset probe counter if needed.
2762  */
2763 static void
2764 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2765 {
2766 	ill_t	*ill = nce->nce_ill;
2767 	boolean_t need_stop_timer = B_FALSE;
2768 	boolean_t need_fastpath_update = B_FALSE;
2769 
2770 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2771 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2772 	/*
2773 	 * If this interface does not do NUD, there is no point
2774 	 * in allowing an update to the cache entry.  Although
2775 	 * we will respond to NS.
2776 	 * The only time we accept an update for a resolver when
2777 	 * NUD is turned off is when it has just been created.
2778 	 * Non-Resolvers will always be created as REACHABLE.
2779 	 */
2780 	if (new_state != ND_UNCHANGED) {
2781 		if ((nce->nce_flags & NCE_F_NONUD) &&
2782 		    (nce->nce_state != ND_INCOMPLETE))
2783 			return;
2784 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2785 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2786 		need_stop_timer = B_TRUE;
2787 		if (new_state == ND_REACHABLE)
2788 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2789 		else {
2790 			/* We force NUD in this case */
2791 			nce->nce_last = 0;
2792 		}
2793 		nce->nce_state = new_state;
2794 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2795 	}
2796 	/*
2797 	 * In case of fast path we need to free the the fastpath
2798 	 * M_DATA and do another probe.  Otherwise we can just
2799 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2800 	 * whatever packets that happens to be transmitting at the time.
2801 	 */
2802 	if (new_ll_addr != NULL) {
2803 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2804 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2805 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2806 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2807 		if (nce->nce_fp_mp != NULL) {
2808 			freemsg(nce->nce_fp_mp);
2809 			nce->nce_fp_mp = NULL;
2810 		}
2811 		need_fastpath_update = B_TRUE;
2812 	}
2813 	mutex_exit(&nce->nce_lock);
2814 	if (need_stop_timer) {
2815 		(void) untimeout(nce->nce_timeout_id);
2816 		nce->nce_timeout_id = 0;
2817 	}
2818 	if (need_fastpath_update)
2819 		nce_fastpath(nce);
2820 	mutex_enter(&nce->nce_lock);
2821 }
2822 
2823 void
2824 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2825 {
2826 	uint_t	count = 0;
2827 	mblk_t  **mpp;
2828 
2829 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2830 
2831 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2832 	    mpp = &(*mpp)->b_next) {
2833 		if (++count >
2834 		    nce->nce_ill->ill_max_buf) {
2835 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2836 
2837 			nce->nce_qd_mp->b_next = NULL;
2838 			nce->nce_qd_mp->b_prev = NULL;
2839 			freemsg(nce->nce_qd_mp);
2840 			nce->nce_qd_mp = tmp;
2841 		}
2842 	}
2843 	/* put this on the list */
2844 	if (head_insert) {
2845 		mp->b_next = nce->nce_qd_mp;
2846 		nce->nce_qd_mp = mp;
2847 	} else {
2848 		*mpp = mp;
2849 	}
2850 }
2851 
2852 static void
2853 nce_queue_mp(nce_t *nce, mblk_t *mp)
2854 {
2855 	boolean_t head_insert = B_FALSE;
2856 	ip6_t	*ip6h;
2857 	ip6i_t	*ip6i;
2858 	mblk_t *data_mp;
2859 
2860 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2861 
2862 	if (mp->b_datap->db_type == M_CTL)
2863 		data_mp = mp->b_cont;
2864 	else
2865 		data_mp = mp;
2866 	ip6h = (ip6_t *)data_mp->b_rptr;
2867 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2868 		/*
2869 		 * This message should have been pulled up already in
2870 		 * ip_wput_v6. We can't do pullups here because the message
2871 		 * could be from the nce_qd_mp which could have b_next/b_prev
2872 		 * non-NULL.
2873 		 */
2874 		ip6i = (ip6i_t *)ip6h;
2875 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2876 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
2877 		/*
2878 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2879 		 * This has 2 aspects mentioned below.
2880 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2881 		 * This ensures that next retransmit of ND solicitation
2882 		 * will use the interface specified by the probe packet,
2883 		 * for both NS and NA. This corresponds to the src address
2884 		 * in the IPv6 packet. If we insert at tail, we will be
2885 		 * depending on the packet at the head for successful
2886 		 * ND resolution. This is not reliable, because the interface
2887 		 * on which the NA arrives could be different from the interface
2888 		 * on which the NS was sent, and if the receiving interface is
2889 		 * failed, it will appear that the sending interface is also
2890 		 * failed, causing in.mpathd to misdiagnose this as link
2891 		 * failure.
2892 		 * 2. Drop the original packet, if the ND resolution did not
2893 		 * succeed in the first attempt. However we will create the
2894 		 * nce and the ire, as soon as the ND resolution succeeds.
2895 		 * We don't gain anything by queueing multiple probe packets
2896 		 * and sending them back-to-back once resolution succeeds.
2897 		 * It is sufficient to send just 1 packet after ND resolution
2898 		 * succeeds. Since mpathd is sending down probe packets at a
2899 		 * constant rate, we don't need to send the queued packet. We
2900 		 * need to queue it only for NDP resolution. The benefit of
2901 		 * dropping the probe packets that were delayed in ND
2902 		 * resolution, is that in.mpathd will not see inflated
2903 		 * RTT. If the ND resolution does not succeed within
2904 		 * in.mpathd's failure detection time, mpathd may detect
2905 		 * a failure, and it does not matter whether the packet
2906 		 * was queued or dropped.
2907 		 */
2908 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2909 			head_insert = B_TRUE;
2910 	}
2911 
2912 	nce_queue_mp_common(nce, mp, head_insert);
2913 }
2914 
2915 /*
2916  * Called when address resolution failed due to a timeout.
2917  * Send an ICMP unreachable in response to all queued packets.
2918  */
2919 void
2920 nce_resolv_failed(nce_t *nce)
2921 {
2922 	mblk_t	*mp, *nxt_mp, *first_mp;
2923 	char	buf[INET6_ADDRSTRLEN];
2924 	ip6_t *ip6h;
2925 	zoneid_t zoneid = GLOBAL_ZONEID;
2926 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
2927 
2928 	ip1dbg(("nce_resolv_failed: dst %s\n",
2929 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
2930 	mutex_enter(&nce->nce_lock);
2931 	mp = nce->nce_qd_mp;
2932 	nce->nce_qd_mp = NULL;
2933 	mutex_exit(&nce->nce_lock);
2934 	while (mp != NULL) {
2935 		nxt_mp = mp->b_next;
2936 		mp->b_next = NULL;
2937 		mp->b_prev = NULL;
2938 
2939 		first_mp = mp;
2940 		if (mp->b_datap->db_type == M_CTL) {
2941 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
2942 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
2943 			zoneid = io->ipsec_out_zoneid;
2944 			ASSERT(zoneid != ALL_ZONES);
2945 			mp = mp->b_cont;
2946 			mp->b_next = NULL;
2947 			mp->b_prev = NULL;
2948 		}
2949 
2950 		ip6h = (ip6_t *)mp->b_rptr;
2951 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
2952 			ip6i_t *ip6i;
2953 			/*
2954 			 * This message should have been pulled up already
2955 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
2956 			 * the header is pulled up.
2957 			 */
2958 			ip6i = (ip6i_t *)ip6h;
2959 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
2960 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2961 			mp->b_rptr += sizeof (ip6i_t);
2962 		}
2963 		/*
2964 		 * Ignore failure since icmp_unreachable_v6 will silently
2965 		 * drop packets with an unspecified source address.
2966 		 */
2967 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
2968 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
2969 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
2970 		mp = nxt_mp;
2971 	}
2972 }
2973 
2974 /*
2975  * Called by SIOCSNDP* ioctl to add/change an nce entry
2976  * and the corresponding attributes.
2977  * Disallow states other than ND_REACHABLE or ND_STALE.
2978  */
2979 int
2980 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2981 {
2982 	sin6_t		*sin6;
2983 	in6_addr_t	*addr;
2984 	nce_t		*nce;
2985 	int		err;
2986 	uint16_t	new_flags = 0;
2987 	uint16_t	old_flags = 0;
2988 	int		inflags = lnr->lnr_flags;
2989 	ip_stack_t	*ipst = ill->ill_ipst;
2990 
2991 	ASSERT(ill->ill_isv6);
2992 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2993 	    (lnr->lnr_state_create != ND_STALE))
2994 		return (EINVAL);
2995 
2996 	sin6 = (sin6_t *)&lnr->lnr_addr;
2997 	addr = &sin6->sin6_addr;
2998 
2999 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3000 	/* We know it can not be mapping so just look in the hash table */
3001 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
3002 	nce = nce_lookup_addr(ill, addr, nce);
3003 	if (nce != NULL)
3004 		new_flags = nce->nce_flags;
3005 
3006 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3007 	case NDF_ISROUTER_ON:
3008 		new_flags |= NCE_F_ISROUTER;
3009 		break;
3010 	case NDF_ISROUTER_OFF:
3011 		new_flags &= ~NCE_F_ISROUTER;
3012 		break;
3013 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3014 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3015 		if (nce != NULL)
3016 			NCE_REFRELE(nce);
3017 		return (EINVAL);
3018 	}
3019 
3020 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3021 	case NDF_ANYCAST_ON:
3022 		new_flags |= NCE_F_ANYCAST;
3023 		break;
3024 	case NDF_ANYCAST_OFF:
3025 		new_flags &= ~NCE_F_ANYCAST;
3026 		break;
3027 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3028 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3029 		if (nce != NULL)
3030 			NCE_REFRELE(nce);
3031 		return (EINVAL);
3032 	}
3033 
3034 	if (nce == NULL) {
3035 		err = ndp_add_v6(ill,
3036 		    (uchar_t *)lnr->lnr_hdw_addr,
3037 		    addr,
3038 		    &ipv6_all_ones,
3039 		    &ipv6_all_zeros,
3040 		    0,
3041 		    new_flags,
3042 		    lnr->lnr_state_create,
3043 		    &nce);
3044 		if (err != 0) {
3045 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3046 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3047 			return (err);
3048 		}
3049 	}
3050 	old_flags = nce->nce_flags;
3051 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3052 		/*
3053 		 * Router turned to host, delete all ires.
3054 		 * XXX Just delete the entry, but we need to add too.
3055 		 */
3056 		nce->nce_flags &= ~NCE_F_ISROUTER;
3057 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3058 		ndp_delete(nce);
3059 		NCE_REFRELE(nce);
3060 		return (0);
3061 	}
3062 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3063 
3064 	mutex_enter(&nce->nce_lock);
3065 	nce->nce_flags = new_flags;
3066 	mutex_exit(&nce->nce_lock);
3067 	/*
3068 	 * Note that we ignore the state at this point, which
3069 	 * should be either STALE or REACHABLE.  Instead we let
3070 	 * the link layer address passed in to determine the state
3071 	 * much like incoming packets.
3072 	 */
3073 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3074 	NCE_REFRELE(nce);
3075 	return (0);
3076 }
3077 
3078 /*
3079  * If the device driver supports it, we make nce_fp_mp to have
3080  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3081  * The caller ensures there is hold on nce for this function.
3082  * Note that since ill_fastpath_probe() copies the mblk there is
3083  * no need for the hold beyond this function.
3084  */
3085 void
3086 nce_fastpath(nce_t *nce)
3087 {
3088 	ill_t	*ill = nce->nce_ill;
3089 	int res;
3090 
3091 	ASSERT(ill != NULL);
3092 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3093 
3094 	if (nce->nce_fp_mp != NULL) {
3095 		/* Already contains fastpath info */
3096 		return;
3097 	}
3098 	if (nce->nce_res_mp != NULL) {
3099 		nce_fastpath_list_add(nce);
3100 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3101 		/*
3102 		 * EAGAIN is an indication of a transient error
3103 		 * i.e. allocation failure etc. leave the nce in the list it
3104 		 * will be updated when another probe happens for another ire
3105 		 * if not it will be taken out of the list when the ire is
3106 		 * deleted.
3107 		 */
3108 
3109 		if (res != 0 && res != EAGAIN)
3110 			nce_fastpath_list_delete(nce);
3111 	}
3112 }
3113 
3114 /*
3115  * Drain the list of nce's waiting for fastpath response.
3116  */
3117 void
3118 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3119     void *arg)
3120 {
3121 
3122 	nce_t *next_nce;
3123 	nce_t *current_nce;
3124 	nce_t *first_nce;
3125 	nce_t *prev_nce = NULL;
3126 
3127 	mutex_enter(&ill->ill_lock);
3128 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3129 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3130 		next_nce = current_nce->nce_fastpath;
3131 		/*
3132 		 * Take it off the list if we're flushing, or if the callback
3133 		 * routine tells us to do so.  Otherwise, leave the nce in the
3134 		 * fastpath list to handle any pending response from the lower
3135 		 * layer.  We can't drain the list when the callback routine
3136 		 * comparison failed, because the response is asynchronous in
3137 		 * nature, and may not arrive in the same order as the list
3138 		 * insertion.
3139 		 */
3140 		if (func == NULL || func(current_nce, arg)) {
3141 			current_nce->nce_fastpath = NULL;
3142 			if (current_nce == first_nce)
3143 				ill->ill_fastpath_list = first_nce = next_nce;
3144 			else
3145 				prev_nce->nce_fastpath = next_nce;
3146 		} else {
3147 			/* previous element that is still in the list */
3148 			prev_nce = current_nce;
3149 		}
3150 		current_nce = next_nce;
3151 	}
3152 	mutex_exit(&ill->ill_lock);
3153 }
3154 
3155 /*
3156  * Add nce to the nce fastpath list.
3157  */
3158 void
3159 nce_fastpath_list_add(nce_t *nce)
3160 {
3161 	ill_t *ill;
3162 
3163 	ill = nce->nce_ill;
3164 
3165 	mutex_enter(&ill->ill_lock);
3166 	mutex_enter(&nce->nce_lock);
3167 
3168 	/*
3169 	 * if nce has not been deleted and
3170 	 * is not already in the list add it.
3171 	 */
3172 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3173 	    (nce->nce_fastpath == NULL)) {
3174 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3175 		ill->ill_fastpath_list = nce;
3176 	}
3177 
3178 	mutex_exit(&nce->nce_lock);
3179 	mutex_exit(&ill->ill_lock);
3180 }
3181 
3182 /*
3183  * remove nce from the nce fastpath list.
3184  */
3185 void
3186 nce_fastpath_list_delete(nce_t *nce)
3187 {
3188 	nce_t *nce_ptr;
3189 
3190 	ill_t *ill;
3191 
3192 	ill = nce->nce_ill;
3193 	ASSERT(ill != NULL);
3194 
3195 	mutex_enter(&ill->ill_lock);
3196 	if (nce->nce_fastpath == NULL)
3197 		goto done;
3198 
3199 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3200 
3201 	if (ill->ill_fastpath_list == nce) {
3202 		ill->ill_fastpath_list = nce->nce_fastpath;
3203 	} else {
3204 		nce_ptr = ill->ill_fastpath_list;
3205 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3206 			if (nce_ptr->nce_fastpath == nce) {
3207 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3208 				break;
3209 			}
3210 			nce_ptr = nce_ptr->nce_fastpath;
3211 		}
3212 	}
3213 
3214 	nce->nce_fastpath = NULL;
3215 done:
3216 	mutex_exit(&ill->ill_lock);
3217 }
3218 
3219 /*
3220  * Update all NCE's that are not in fastpath mode and
3221  * have an nce_fp_mp that matches mp. mp->b_cont contains
3222  * the fastpath header.
3223  *
3224  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3225  */
3226 boolean_t
3227 ndp_fastpath_update(nce_t *nce, void *arg)
3228 {
3229 	mblk_t 	*mp, *fp_mp;
3230 	uchar_t	*mp_rptr, *ud_mp_rptr;
3231 	mblk_t	*ud_mp = nce->nce_res_mp;
3232 	ptrdiff_t	cmplen;
3233 
3234 	if (nce->nce_flags & NCE_F_MAPPING)
3235 		return (B_TRUE);
3236 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3237 		return (B_TRUE);
3238 
3239 	ip2dbg(("ndp_fastpath_update: trying\n"));
3240 	mp = (mblk_t *)arg;
3241 	mp_rptr = mp->b_rptr;
3242 	cmplen = mp->b_wptr - mp_rptr;
3243 	ASSERT(cmplen >= 0);
3244 	ud_mp_rptr = ud_mp->b_rptr;
3245 	/*
3246 	 * The nce is locked here to prevent any other threads
3247 	 * from accessing and changing nce_res_mp when the IPv6 address
3248 	 * becomes resolved to an lla while we're in the middle
3249 	 * of looking at and comparing the hardware address (lla).
3250 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3251 	 * from examining nce_res_mp atthe same time.
3252 	 */
3253 	mutex_enter(&nce->nce_lock);
3254 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3255 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3256 		mutex_exit(&nce->nce_lock);
3257 		/*
3258 		 * Don't take the ire off the fastpath list yet,
3259 		 * since the response may come later.
3260 		 */
3261 		return (B_FALSE);
3262 	}
3263 	/* Matched - install mp as the fastpath mp */
3264 	ip1dbg(("ndp_fastpath_update: match\n"));
3265 	fp_mp = dupb(mp->b_cont);
3266 	if (fp_mp != NULL) {
3267 		nce->nce_fp_mp = fp_mp;
3268 	}
3269 	mutex_exit(&nce->nce_lock);
3270 	return (B_TRUE);
3271 }
3272 
3273 /*
3274  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3275  * driver.  Note that it assumes IP is exclusive...
3276  */
3277 /* ARGSUSED */
3278 void
3279 ndp_fastpath_flush(nce_t *nce, char *arg)
3280 {
3281 	if (nce->nce_flags & NCE_F_MAPPING)
3282 		return;
3283 	/* No fastpath info? */
3284 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3285 		return;
3286 
3287 	if (nce->nce_ipversion == IPV4_VERSION &&
3288 	    nce->nce_flags & NCE_F_BCAST) {
3289 		/*
3290 		 * IPv4 BROADCAST entries:
3291 		 * We can't delete the nce since it is difficult to
3292 		 * recreate these without going through the
3293 		 * ipif down/up dance.
3294 		 *
3295 		 * All access to nce->nce_fp_mp in the case of these
3296 		 * is protected by nce_lock.
3297 		 */
3298 		mutex_enter(&nce->nce_lock);
3299 		if (nce->nce_fp_mp != NULL) {
3300 			freeb(nce->nce_fp_mp);
3301 			nce->nce_fp_mp = NULL;
3302 			mutex_exit(&nce->nce_lock);
3303 			nce_fastpath(nce);
3304 		} else {
3305 			mutex_exit(&nce->nce_lock);
3306 		}
3307 	} else {
3308 		/* Just delete the NCE... */
3309 		ndp_delete(nce);
3310 	}
3311 }
3312 
3313 /*
3314  * Return a pointer to a given option in the packet.
3315  * Assumes that option part of the packet have already been validated.
3316  */
3317 nd_opt_hdr_t *
3318 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3319 {
3320 	while (optlen > 0) {
3321 		if (opt->nd_opt_type == opt_type)
3322 			return (opt);
3323 		optlen -= 8 * opt->nd_opt_len;
3324 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3325 	}
3326 	return (NULL);
3327 }
3328 
3329 /*
3330  * Verify all option lengths present are > 0, also check to see
3331  * if the option lengths and packet length are consistent.
3332  */
3333 boolean_t
3334 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3335 {
3336 	ASSERT(opt != NULL);
3337 	while (optlen > 0) {
3338 		if (opt->nd_opt_len == 0)
3339 			return (B_FALSE);
3340 		optlen -= 8 * opt->nd_opt_len;
3341 		if (optlen < 0)
3342 			return (B_FALSE);
3343 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3344 	}
3345 	return (B_TRUE);
3346 }
3347 
3348 /*
3349  * ndp_walk function.
3350  * Free a fraction of the NCE cache entries.
3351  * A fraction of zero means to not free any in that category.
3352  */
3353 void
3354 ndp_cache_reclaim(nce_t *nce, char *arg)
3355 {
3356 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3357 	uint_t	rand;
3358 
3359 	if (nce->nce_flags & NCE_F_PERMANENT)
3360 		return;
3361 
3362 	rand = (uint_t)lbolt +
3363 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3364 	if (ncr->ncr_host != 0 &&
3365 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3366 		ndp_delete(nce);
3367 		return;
3368 	}
3369 }
3370 
3371 /*
3372  * ndp_walk function.
3373  * Count the number of NCEs that can be deleted.
3374  * These would be hosts but not routers.
3375  */
3376 void
3377 ndp_cache_count(nce_t *nce, char *arg)
3378 {
3379 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3380 
3381 	if (nce->nce_flags & NCE_F_PERMANENT)
3382 		return;
3383 
3384 	ncc->ncc_total++;
3385 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3386 		ncc->ncc_host++;
3387 }
3388 
3389 #ifdef DEBUG
3390 void
3391 nce_trace_ref(nce_t *nce)
3392 {
3393 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3394 
3395 	if (nce->nce_trace_disable)
3396 		return;
3397 
3398 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3399 		nce->nce_trace_disable = B_TRUE;
3400 		nce_trace_cleanup(nce);
3401 	}
3402 }
3403 
3404 void
3405 nce_untrace_ref(nce_t *nce)
3406 {
3407 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3408 
3409 	if (!nce->nce_trace_disable)
3410 		th_trace_unref(nce);
3411 }
3412 
3413 static void
3414 nce_trace_cleanup(const nce_t *nce)
3415 {
3416 	th_trace_cleanup(nce, nce->nce_trace_disable);
3417 }
3418 #endif
3419 
3420 /*
3421  * Called when address resolution fails due to a timeout.
3422  * Send an ICMP unreachable in response to all queued packets.
3423  */
3424 void
3425 arp_resolv_failed(nce_t *nce)
3426 {
3427 	mblk_t	*mp, *nxt_mp, *first_mp;
3428 	char	buf[INET6_ADDRSTRLEN];
3429 	zoneid_t zoneid = GLOBAL_ZONEID;
3430 	struct in_addr ipv4addr;
3431 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3432 
3433 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3434 	ip3dbg(("arp_resolv_failed: dst %s\n",
3435 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3436 	mutex_enter(&nce->nce_lock);
3437 	mp = nce->nce_qd_mp;
3438 	nce->nce_qd_mp = NULL;
3439 	mutex_exit(&nce->nce_lock);
3440 
3441 	while (mp != NULL) {
3442 		nxt_mp = mp->b_next;
3443 		mp->b_next = NULL;
3444 		mp->b_prev = NULL;
3445 
3446 		first_mp = mp;
3447 		/*
3448 		 * Send icmp unreachable messages
3449 		 * to the hosts.
3450 		 */
3451 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3452 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3453 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3454 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3455 		mp = nxt_mp;
3456 	}
3457 }
3458 
3459 int
3460 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3461     nce_t **newnce, nce_t *src_nce)
3462 {
3463 	int	err;
3464 	nce_t	*nce;
3465 	in6_addr_t addr6;
3466 	ip_stack_t *ipst = ill->ill_ipst;
3467 
3468 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3469 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3470 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3471 	nce = nce_lookup_addr(ill, &addr6, nce);
3472 	if (nce == NULL) {
3473 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3474 	} else {
3475 		*newnce = nce;
3476 		err = EEXIST;
3477 	}
3478 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3479 	return (err);
3480 }
3481 
3482 /*
3483  * NDP Cache Entry creation routine for IPv4.
3484  * Mapped entries are handled in arp.
3485  * This routine must always be called with ndp4->ndp_g_lock held.
3486  * Prior to return, nce_refcnt is incremented.
3487  */
3488 static int
3489 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3490     nce_t **newnce, nce_t *src_nce)
3491 {
3492 	static	nce_t		nce_nil;
3493 	nce_t		*nce;
3494 	mblk_t		*mp;
3495 	mblk_t		*template = NULL;
3496 	nce_t		**ncep;
3497 	ip_stack_t	*ipst = ill->ill_ipst;
3498 	uint16_t	state = ND_INITIAL;
3499 	int		err;
3500 
3501 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3502 	ASSERT(!ill->ill_isv6);
3503 	ASSERT((flags & NCE_F_MAPPING) == 0);
3504 
3505 	if (ill->ill_resolver_mp == NULL)
3506 		return (EINVAL);
3507 	/*
3508 	 * Allocate the mblk to hold the nce.
3509 	 */
3510 	mp = allocb(sizeof (nce_t), BPRI_MED);
3511 	if (mp == NULL)
3512 		return (ENOMEM);
3513 
3514 	nce = (nce_t *)mp->b_rptr;
3515 	mp->b_wptr = (uchar_t *)&nce[1];
3516 	*nce = nce_nil;
3517 	nce->nce_ill = ill;
3518 	nce->nce_ipversion = IPV4_VERSION;
3519 	nce->nce_flags = flags;
3520 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3521 	nce->nce_rcnt = ill->ill_xmit_count;
3522 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3523 	nce->nce_mask = ipv6_all_ones;
3524 	nce->nce_extract_mask = ipv6_all_zeros;
3525 	nce->nce_ll_extract_start = 0;
3526 	nce->nce_qd_mp = NULL;
3527 	nce->nce_mp = mp;
3528 	/* This one is for nce getting created */
3529 	nce->nce_refcnt = 1;
3530 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3531 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3532 
3533 	nce->nce_trace_disable = B_FALSE;
3534 
3535 	if (src_nce != NULL) {
3536 		/*
3537 		 * src_nce has been provided by the caller. The only
3538 		 * caller who provides a non-null, non-broadcast
3539 		 * src_nce is from ip_newroute() which must pass in
3540 		 * a ND_REACHABLE src_nce (this condition is verified
3541 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3542 		 */
3543 		mutex_enter(&src_nce->nce_lock);
3544 		state = src_nce->nce_state;
3545 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3546 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3547 			/*
3548 			 * src_nce has been deleted, or
3549 			 * ip_arp_news is in the middle of
3550 			 * flushing entries in the the nce.
3551 			 * Fail the add, since we don't know
3552 			 * if it is safe to copy the contents of
3553 			 * src_nce
3554 			 */
3555 			DTRACE_PROBE2(nce__bad__src__nce,
3556 			    nce_t *, src_nce, ill_t *, ill);
3557 			mutex_exit(&src_nce->nce_lock);
3558 			err = EINVAL;
3559 			goto err_ret;
3560 		}
3561 		template = copyb(src_nce->nce_res_mp);
3562 		mutex_exit(&src_nce->nce_lock);
3563 		if (template == NULL) {
3564 			err = ENOMEM;
3565 			goto err_ret;
3566 		}
3567 	} else if (flags & NCE_F_BCAST) {
3568 		/*
3569 		 * broadcast nce.
3570 		 */
3571 		template = copyb(ill->ill_bcast_mp);
3572 		if (template == NULL) {
3573 			err = ENOMEM;
3574 			goto err_ret;
3575 		}
3576 		state = ND_REACHABLE;
3577 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3578 		/*
3579 		 * NORESOLVER entries are always created in the REACHABLE
3580 		 * state. We create a nce_res_mp with the IP nexthop address
3581 		 * in the destination address in the DLPI hdr if the
3582 		 * physical length is exactly 4 bytes.
3583 		 *
3584 		 * XXX not clear which drivers set ill_phys_addr_length to
3585 		 * IP_ADDR_LEN.
3586 		 */
3587 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3588 			template = ill_dlur_gen((uchar_t *)addr,
3589 			    ill->ill_phys_addr_length,
3590 			    ill->ill_sap, ill->ill_sap_length);
3591 		} else {
3592 			template = copyb(ill->ill_resolver_mp);
3593 		}
3594 		if (template == NULL) {
3595 			err = ENOMEM;
3596 			goto err_ret;
3597 		}
3598 		state = ND_REACHABLE;
3599 	}
3600 	nce->nce_fp_mp = NULL;
3601 	nce->nce_res_mp = template;
3602 	nce->nce_state = state;
3603 	if (state == ND_REACHABLE) {
3604 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3605 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3606 	} else {
3607 		nce->nce_last = 0;
3608 		if (state == ND_INITIAL)
3609 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3610 	}
3611 
3612 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3613 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3614 	/*
3615 	 * Atomically ensure that the ill is not CONDEMNED, before
3616 	 * adding the NCE.
3617 	 */
3618 	mutex_enter(&ill->ill_lock);
3619 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3620 		mutex_exit(&ill->ill_lock);
3621 		err = EINVAL;
3622 		goto err_ret;
3623 	}
3624 	if ((nce->nce_next = *ncep) != NULL)
3625 		nce->nce_next->nce_ptpn = &nce->nce_next;
3626 	*ncep = nce;
3627 	nce->nce_ptpn = ncep;
3628 	*newnce = nce;
3629 	/* This one is for nce being used by an active thread */
3630 	NCE_REFHOLD(*newnce);
3631 
3632 	/* Bump up the number of nce's referencing this ill */
3633 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
3634 	    (char *), "nce", (void *), nce);
3635 	ill->ill_nce_cnt++;
3636 	mutex_exit(&ill->ill_lock);
3637 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3638 	return (0);
3639 err_ret:
3640 	freeb(mp);
3641 	freemsg(template);
3642 	return (err);
3643 }
3644 
3645 /*
3646  * ndp_walk routine to delete all entries that have a given destination or
3647  * gateway address and cached link layer (MAC) address.  This is used when ARP
3648  * informs us that a network-to-link-layer mapping may have changed.
3649  */
3650 void
3651 nce_delete_hw_changed(nce_t *nce, void *arg)
3652 {
3653 	nce_hw_map_t *hwm = arg;
3654 	mblk_t *mp;
3655 	dl_unitdata_req_t *dlu;
3656 	uchar_t *macaddr;
3657 	ill_t *ill;
3658 	int saplen;
3659 	ipaddr_t nce_addr;
3660 
3661 	if (nce->nce_state != ND_REACHABLE)
3662 		return;
3663 
3664 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3665 	if (nce_addr != hwm->hwm_addr)
3666 		return;
3667 
3668 	mutex_enter(&nce->nce_lock);
3669 	if ((mp = nce->nce_res_mp) == NULL) {
3670 		mutex_exit(&nce->nce_lock);
3671 		return;
3672 	}
3673 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3674 	macaddr = (uchar_t *)(dlu + 1);
3675 	ill = nce->nce_ill;
3676 	if ((saplen = ill->ill_sap_length) > 0)
3677 		macaddr += saplen;
3678 	else
3679 		saplen = -saplen;
3680 
3681 	/*
3682 	 * If the hardware address is unchanged, then leave this one alone.
3683 	 * Note that saplen == abs(saplen) now.
3684 	 */
3685 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3686 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3687 		mutex_exit(&nce->nce_lock);
3688 		return;
3689 	}
3690 	mutex_exit(&nce->nce_lock);
3691 
3692 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3693 	ndp_delete(nce);
3694 }
3695 
3696 /*
3697  * This function verifies whether a given IPv4 address is potentially known to
3698  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3699  * so that it can continue to look for hardware changes on that address.
3700  */
3701 boolean_t
3702 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3703 {
3704 	nce_t		*nce;
3705 	struct in_addr	nceaddr;
3706 	ip_stack_t	*ipst = ns->netstack_ip;
3707 
3708 	if (addr == INADDR_ANY)
3709 		return (B_FALSE);
3710 
3711 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3712 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3713 	for (; nce != NULL; nce = nce->nce_next) {
3714 		/* Note that only v4 mapped entries are in the table. */
3715 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3716 		if (addr == nceaddr.s_addr &&
3717 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3718 			/* Single flag check; no lock needed */
3719 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3720 				break;
3721 		}
3722 	}
3723 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3724 	return (nce != NULL);
3725 }
3726