xref: /titanic_51/usr/src/uts/common/inet/ip/ip_ndp.c (revision e45175738699353ea28df283c6ee436b5f1feac1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ipclassifier.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_rts.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_ndp.h>
66 #include <inet/ipsec_impl.h>
67 #include <inet/ipsec_info.h>
68 #include <inet/sctp_ip.h>
69 
70 /*
71  * Function names with nce_ prefix are static while function
72  * names with ndp_ prefix are used by rest of the IP.
73  *
74  * Lock ordering:
75  *
76  *	ndp_g_lock -> ill_lock -> nce_lock
77  *
78  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
79  * nce_next.  Nce_lock protects the contents of the NCE (particularly
80  * nce_refcnt).
81  */
82 
83 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
84     uint32_t ll_addr_len);
85 static	void	nce_ire_delete(nce_t *nce);
86 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
87 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
88 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
89 static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
90 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
91     uchar_t *addr);
92 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
93 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
94 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
95 static	void	nce_update(nce_t *nce, uint16_t new_state,
96     uchar_t *new_ll_addr);
97 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
98 static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
99     ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
100     const in6_addr_t *target, int flag);
101 static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
102     nce_t **, nce_t *);
103 
104 #ifdef DEBUG
105 static void	nce_trace_cleanup(const nce_t *);
106 #endif
107 
/*
 * Address of the hash-bucket head slot for `addr' in the per-stack
 * IPv4 (ips_ndp4) and IPv6 (ips_ndp6) NCE hash tables.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&(ipst)->ips_ndp4->nce_hash_tbl[				\
	    IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])

#define	NCE_HASH_PTR_V6(ipst, addr)					\
	(&(ipst)->ips_ndp6->nce_hash_tbl[				\
	    NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)])
114 
115 /*
116  * Compute default flags to use for an advertisement of this nce's address.
117  */
118 static int
119 nce_advert_flags(const nce_t *nce)
120 {
121 	int flag = 0;
122 
123 	if (nce->nce_flags & NCE_F_ISROUTER)
124 		flag |= NDP_ISROUTER;
125 	return (flag);
126 }
127 
/*
 * Non-tunable probe interval, based on link capabilities: the short value
 * is used when ill_note_link is set.  The result is handed to
 * NDP_RESTART_TIMER (presumably milliseconds -- confirm against that macro).
 */
#define	ILL_PROBE_INTERVAL(ill)	(((ill)->ill_note_link != 0) ? 150 : 1500)
130 
131 /*
132  * NDP Cache Entry creation routine.
133  * Mapped entries will never do NUD .
134  * This routine must always be called with ndp6->ndp_g_lock held.
135  * Prior to return, nce_refcnt is incremented.
136  */
137 int
138 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
139     const in6_addr_t *mask, const in6_addr_t *extract_mask,
140     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
141     nce_t **newnce)
142 {
143 	static	nce_t		nce_nil;
144 	nce_t		*nce;
145 	mblk_t		*mp;
146 	mblk_t		*template;
147 	nce_t		**ncep;
148 	int		err;
149 	boolean_t	dropped = B_FALSE;
150 	ip_stack_t	*ipst = ill->ill_ipst;
151 
152 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
153 	ASSERT(ill != NULL && ill->ill_isv6);
154 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
155 		ip0dbg(("ndp_add_v6: no addr\n"));
156 		return (EINVAL);
157 	}
158 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
159 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
160 		return (EINVAL);
161 	}
162 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
163 	    (flags & NCE_F_MAPPING)) {
164 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
165 		return (EINVAL);
166 	}
167 	/*
168 	 * Allocate the mblk to hold the nce.
169 	 *
170 	 * XXX This can come out of a separate cache - nce_cache.
171 	 * We don't need the mp anymore as there are no more
172 	 * "qwriter"s
173 	 */
174 	mp = allocb(sizeof (nce_t), BPRI_MED);
175 	if (mp == NULL)
176 		return (ENOMEM);
177 
178 	nce = (nce_t *)mp->b_rptr;
179 	mp->b_wptr = (uchar_t *)&nce[1];
180 	*nce = nce_nil;
181 
182 	/*
183 	 * This one holds link layer address
184 	 */
185 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
186 		template = nce_udreq_alloc(ill);
187 	} else {
188 		if (ill->ill_resolver_mp == NULL) {
189 			freeb(mp);
190 			return (EINVAL);
191 		}
192 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
193 		template = copyb(ill->ill_resolver_mp);
194 	}
195 	if (template == NULL) {
196 		freeb(mp);
197 		return (ENOMEM);
198 	}
199 	nce->nce_ill = ill;
200 	nce->nce_ipversion = IPV6_VERSION;
201 	nce->nce_flags = flags;
202 	nce->nce_state = state;
203 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
204 	nce->nce_rcnt = ill->ill_xmit_count;
205 	nce->nce_addr = *addr;
206 	nce->nce_mask = *mask;
207 	nce->nce_extract_mask = *extract_mask;
208 	nce->nce_ll_extract_start = hw_extract_start;
209 	nce->nce_fp_mp = NULL;
210 	nce->nce_res_mp = template;
211 	if (state == ND_REACHABLE)
212 		nce->nce_last = TICK_TO_MSEC(lbolt64);
213 	else
214 		nce->nce_last = 0;
215 	nce->nce_qd_mp = NULL;
216 	nce->nce_mp = mp;
217 	if (hw_addr != NULL)
218 		nce_set_ll(nce, hw_addr);
219 	/* This one is for nce getting created */
220 	nce->nce_refcnt = 1;
221 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
222 	if (nce->nce_flags & NCE_F_MAPPING) {
223 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
224 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
225 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
226 		ncep = &ipst->ips_ndp6->nce_mask_entries;
227 	} else {
228 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
229 	}
230 
231 	nce->nce_trace_disable = B_FALSE;
232 
233 	/*
234 	 * Atomically ensure that the ill is not CONDEMNED, before
235 	 * adding the NCE.
236 	 */
237 	mutex_enter(&ill->ill_lock);
238 	if (ill->ill_state_flags & ILL_CONDEMNED) {
239 		mutex_exit(&ill->ill_lock);
240 		freeb(mp);
241 		freeb(template);
242 		return (EINVAL);
243 	}
244 	if ((nce->nce_next = *ncep) != NULL)
245 		nce->nce_next->nce_ptpn = &nce->nce_next;
246 	*ncep = nce;
247 	nce->nce_ptpn = ncep;
248 	*newnce = nce;
249 	/* This one is for nce being used by an active thread */
250 	NCE_REFHOLD(*newnce);
251 
252 	/* Bump up the number of nce's referencing this ill */
253 	ill->ill_nce_cnt++;
254 	mutex_exit(&ill->ill_lock);
255 
256 	err = 0;
257 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
258 		mutex_enter(&nce->nce_lock);
259 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
260 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
261 		mutex_exit(&nce->nce_lock);
262 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
263 		    &ipv6_all_zeros, addr, NDP_PROBE);
264 		if (dropped) {
265 			mutex_enter(&nce->nce_lock);
266 			nce->nce_pcnt++;
267 			mutex_exit(&nce->nce_lock);
268 		}
269 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
270 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
271 		err = EINPROGRESS;
272 	} else if (flags & NCE_F_UNSOL_ADV) {
273 		/*
274 		 * We account for the transmit below by assigning one
275 		 * less than the ndd variable. Subsequent decrements
276 		 * are done in ndp_timer.
277 		 */
278 		mutex_enter(&nce->nce_lock);
279 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
280 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
281 		mutex_exit(&nce->nce_lock);
282 		dropped = nce_xmit(ill,
283 		    ND_NEIGHBOR_ADVERT,
284 		    ill,	/* ill to be used for extracting ill_nd_lla */
285 		    B_TRUE,	/* use ill_nd_lla */
286 		    addr,	/* Source and target of the advertisement pkt */
287 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
288 		    nce_advert_flags(nce));
289 		mutex_enter(&nce->nce_lock);
290 		if (dropped)
291 			nce->nce_unsolicit_count++;
292 		if (nce->nce_unsolicit_count != 0) {
293 			nce->nce_timeout_id = timeout(ndp_timer, nce,
294 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
295 		}
296 		mutex_exit(&nce->nce_lock);
297 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
298 	}
299 	/*
300 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
301 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
302 	 * We call nce_fastpath from nce_update if the link layer address of
303 	 * the peer changes from nce_update
304 	 */
305 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
306 		nce_fastpath(nce);
307 	return (err);
308 }
309 
310 int
311 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
312     const in6_addr_t *mask, const in6_addr_t *extract_mask,
313     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
314     nce_t **newnce)
315 {
316 	int	err = 0;
317 	nce_t	*nce;
318 	ip_stack_t	*ipst = ill->ill_ipst;
319 
320 	ASSERT(ill->ill_isv6);
321 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
322 
323 	/* Get head of v6 hash table */
324 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
325 	nce = nce_lookup_addr(ill, addr, nce);
326 	if (nce == NULL) {
327 		err = ndp_add_v6(ill,
328 		    hw_addr,
329 		    addr,
330 		    mask,
331 		    extract_mask,
332 		    hw_extract_start,
333 		    flags,
334 		    state,
335 		    newnce);
336 	} else {
337 		*newnce = nce;
338 		err = EEXIST;
339 	}
340 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
341 	return (err);
342 }
343 
344 /*
345  * Remove all the CONDEMNED nces from the appropriate hash table.
346  * We create a private list of NCEs, these may have ires pointing
347  * to them, so the list will be passed through to clean up dependent
348  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
349  */
350 static void
351 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
352 {
353 	nce_t *nce1;
354 	nce_t **ptpn;
355 
356 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
357 	ASSERT(ndp->ndp_g_walker == 0);
358 	for (; nce; nce = nce1) {
359 		nce1 = nce->nce_next;
360 		mutex_enter(&nce->nce_lock);
361 		if (nce->nce_flags & NCE_F_CONDEMNED) {
362 			ptpn = nce->nce_ptpn;
363 			nce1 = nce->nce_next;
364 			if (nce1 != NULL)
365 				nce1->nce_ptpn = ptpn;
366 			*ptpn = nce1;
367 			nce->nce_ptpn = NULL;
368 			nce->nce_next = NULL;
369 			nce->nce_next = *free_nce_list;
370 			*free_nce_list = nce;
371 		}
372 		mutex_exit(&nce->nce_lock);
373 	}
374 }
375 
376 /*
377  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
378  *    will return this NCE. Also no new IREs will be created that
379  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
380  *    be started (See NDP_RESTART_TIMER).
381  * 2. Cancel any currently running timeouts.
382  * 3. If there is an ndp walker, return. The walker will do the cleanup.
383  *    This ensures that walkers see a consistent list of NCEs while walking.
384  * 4. Otherwise remove the NCE from the list of NCEs
385  * 5. Delete all IREs pointing to this NCE.
386  */
387 void
388 ndp_delete(nce_t *nce)
389 {
390 	nce_t	**ptpn;
391 	nce_t	*nce1;
392 	int	ipversion = nce->nce_ipversion;
393 	ndp_g_t *ndp;
394 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
395 
396 	if (ipversion == IPV4_VERSION)
397 		ndp = ipst->ips_ndp4;
398 	else
399 		ndp = ipst->ips_ndp6;
400 
401 	/* Serialize deletes */
402 	mutex_enter(&nce->nce_lock);
403 	if (nce->nce_flags & NCE_F_CONDEMNED) {
404 		/* Some other thread is doing the delete */
405 		mutex_exit(&nce->nce_lock);
406 		return;
407 	}
408 	/*
409 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
410 	 * refcnt has to be >= 2
411 	 */
412 	ASSERT(nce->nce_refcnt >= 2);
413 	nce->nce_flags |= NCE_F_CONDEMNED;
414 	mutex_exit(&nce->nce_lock);
415 
416 	nce_fastpath_list_delete(nce);
417 
418 	/*
419 	 * Cancel any running timer. Timeout can't be restarted
420 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
421 	 * Passing invalid timeout id is fine.
422 	 */
423 	if (nce->nce_timeout_id != 0) {
424 		(void) untimeout(nce->nce_timeout_id);
425 		nce->nce_timeout_id = 0;
426 	}
427 
428 	mutex_enter(&ndp->ndp_g_lock);
429 	if (nce->nce_ptpn == NULL) {
430 		/*
431 		 * The last ndp walker has already removed this nce from
432 		 * the list after we marked the nce CONDEMNED and before
433 		 * we grabbed the global lock.
434 		 */
435 		mutex_exit(&ndp->ndp_g_lock);
436 		return;
437 	}
438 	if (ndp->ndp_g_walker > 0) {
439 		/*
440 		 * Can't unlink. The walker will clean up
441 		 */
442 		ndp->ndp_g_walker_cleanup = B_TRUE;
443 		mutex_exit(&ndp->ndp_g_lock);
444 		return;
445 	}
446 
447 	/*
448 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
449 	 * the timer since it is marked CONDEMNED.
450 	 */
451 	ptpn = nce->nce_ptpn;
452 	nce1 = nce->nce_next;
453 	if (nce1 != NULL)
454 		nce1->nce_ptpn = ptpn;
455 	*ptpn = nce1;
456 	nce->nce_ptpn = NULL;
457 	nce->nce_next = NULL;
458 	mutex_exit(&ndp->ndp_g_lock);
459 
460 	nce_ire_delete(nce);
461 }
462 
463 void
464 ndp_inactive(nce_t *nce)
465 {
466 	mblk_t		**mpp;
467 	ill_t		*ill;
468 
469 	ASSERT(nce->nce_refcnt == 0);
470 	ASSERT(MUTEX_HELD(&nce->nce_lock));
471 	ASSERT(nce->nce_fastpath == NULL);
472 
473 	/* Free all nce allocated messages */
474 	mpp = &nce->nce_first_mp_to_free;
475 	do {
476 		while (*mpp != NULL) {
477 			mblk_t  *mp;
478 
479 			mp = *mpp;
480 			*mpp = mp->b_next;
481 
482 			inet_freemsg(mp);
483 		}
484 	} while (mpp++ != &nce->nce_last_mp_to_free);
485 
486 #ifdef DEBUG
487 	nce_trace_cleanup(nce);
488 #endif
489 
490 	ill = nce->nce_ill;
491 	mutex_enter(&ill->ill_lock);
492 	ill->ill_nce_cnt--;
493 	/*
494 	 * If the number of nce's associated with this ill have dropped
495 	 * to zero, check whether we need to restart any operation that
496 	 * is waiting for this to happen.
497 	 */
498 	if (ill->ill_nce_cnt == 0) {
499 		/* ipif_ill_refrele_tail drops the ill_lock */
500 		ipif_ill_refrele_tail(ill);
501 	} else {
502 		mutex_exit(&ill->ill_lock);
503 	}
504 	mutex_destroy(&nce->nce_lock);
505 	if (nce->nce_mp != NULL)
506 		inet_freemsg(nce->nce_mp);
507 }
508 
509 /*
510  * ndp_walk routine.  Delete the nce if it is associated with the ill
511  * that is going away.  Always called as a writer.
512  */
513 void
514 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
515 {
516 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
517 		ndp_delete(nce);
518 	}
519 }
520 
521 /*
522  * Walk a list of to be inactive NCEs and blow away all the ires.
523  */
524 static void
525 nce_ire_delete_list(nce_t *nce)
526 {
527 	nce_t *nce_next;
528 
529 	ASSERT(nce != NULL);
530 	while (nce != NULL) {
531 		nce_next = nce->nce_next;
532 		nce->nce_next = NULL;
533 
534 		/*
535 		 * It is possible for the last ndp walker (this thread)
536 		 * to come here after ndp_delete has marked the nce CONDEMNED
537 		 * and before it has removed the nce from the fastpath list
538 		 * or called untimeout. So we need to do it here. It is safe
539 		 * for both ndp_delete and this thread to do it twice or
540 		 * even simultaneously since each of the threads has a
541 		 * reference on the nce.
542 		 */
543 		nce_fastpath_list_delete(nce);
544 		/*
545 		 * Cancel any running timer. Timeout can't be restarted
546 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
547 		 * Passing invalid timeout id is fine.
548 		 */
549 		if (nce->nce_timeout_id != 0) {
550 			(void) untimeout(nce->nce_timeout_id);
551 			nce->nce_timeout_id = 0;
552 		}
553 		/*
554 		 * We might hit this func thus in the v4 case:
555 		 * ipif_down->ipif_ndp_down->ndp_walk
556 		 */
557 
558 		if (nce->nce_ipversion == IPV4_VERSION) {
559 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
560 			    IRE_CACHE, nce_ire_delete1,
561 			    (char *)nce, nce->nce_ill);
562 		} else {
563 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
564 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
565 			    IRE_CACHE, nce_ire_delete1,
566 			    (char *)nce, nce->nce_ill);
567 		}
568 		NCE_REFRELE_NOTR(nce);
569 		nce = nce_next;
570 	}
571 }
572 
573 /*
574  * Delete an ire when the nce goes away.
575  */
576 /* ARGSUSED */
577 static void
578 nce_ire_delete(nce_t *nce)
579 {
580 	if (nce->nce_ipversion == IPV6_VERSION) {
581 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
582 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
583 		NCE_REFRELE_NOTR(nce);
584 	} else {
585 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
586 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
587 		NCE_REFRELE_NOTR(nce);
588 	}
589 }
590 
591 /*
592  * ire_walk routine used to delete every IRE that shares this nce
593  */
594 static void
595 nce_ire_delete1(ire_t *ire, char *nce_arg)
596 {
597 	nce_t	*nce = (nce_t *)nce_arg;
598 
599 	ASSERT(ire->ire_type == IRE_CACHE);
600 
601 	if (ire->ire_nce == nce) {
602 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
603 		ire_delete(ire);
604 	}
605 }
606 
607 /*
608  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
609  */
610 boolean_t
611 ndp_restart_dad(nce_t *nce)
612 {
613 	boolean_t started;
614 	boolean_t dropped;
615 
616 	if (nce == NULL)
617 		return (B_FALSE);
618 	mutex_enter(&nce->nce_lock);
619 	if (nce->nce_state == ND_PROBE) {
620 		mutex_exit(&nce->nce_lock);
621 		started = B_TRUE;
622 	} else if (nce->nce_state == ND_REACHABLE) {
623 		nce->nce_state = ND_PROBE;
624 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
625 		mutex_exit(&nce->nce_lock);
626 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
627 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
628 		if (dropped) {
629 			mutex_enter(&nce->nce_lock);
630 			nce->nce_pcnt++;
631 			mutex_exit(&nce->nce_lock);
632 		}
633 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
634 		started = B_TRUE;
635 	} else {
636 		mutex_exit(&nce->nce_lock);
637 		started = B_FALSE;
638 	}
639 	return (started);
640 }
641 
642 /*
643  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
644  * If one is found, the refcnt on the nce will be incremented.
645  */
646 nce_t *
647 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
648 {
649 	nce_t	*nce;
650 	ip_stack_t	*ipst;
651 
652 	ASSERT(ill != NULL);
653 	ipst = ill->ill_ipst;
654 
655 	ASSERT(ill != NULL && ill->ill_isv6);
656 	if (!caller_holds_lock) {
657 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
658 	}
659 
660 	/* Get head of v6 hash table */
661 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
662 	nce = nce_lookup_addr(ill, addr, nce);
663 	if (nce == NULL)
664 		nce = nce_lookup_mapping(ill, addr);
665 	if (!caller_holds_lock)
666 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
667 	return (nce);
668 }
669 /*
670  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
671  * If one is found, the refcnt on the nce will be incremented.
672  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
673  * so we skip the nce_lookup_mapping call.
674  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
675  */
676 nce_t *
677 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
678 {
679 	nce_t	*nce;
680 	in6_addr_t addr6;
681 	ip_stack_t *ipst = ill->ill_ipst;
682 
683 	if (!caller_holds_lock) {
684 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
685 	}
686 
687 	/* Get head of v4 hash table */
688 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
689 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
690 	nce = nce_lookup_addr(ill, &addr6, nce);
691 	if (!caller_holds_lock)
692 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
693 	return (nce);
694 }
695 
696 /*
697  * Cache entry lookup.  Try to find an nce matching the parameters passed.
698  * Look only for exact entries (no mappings).  If an nce is found, increment
699  * the hold count on that nce. The caller passes in the start of the
700  * appropriate hash table, and must be holding the appropriate global
701  * lock (ndp_g_lock).
702  */
703 static nce_t *
704 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
705 {
706 	ndp_g_t		*ndp;
707 	ip_stack_t	*ipst = ill->ill_ipst;
708 
709 	if (ill->ill_isv6)
710 		ndp = ipst->ips_ndp6;
711 	else
712 		ndp = ipst->ips_ndp4;
713 
714 	ASSERT(ill != NULL);
715 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
716 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
717 		return (NULL);
718 	for (; nce != NULL; nce = nce->nce_next) {
719 		if (nce->nce_ill == ill) {
720 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
721 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
722 			    &ipv6_all_ones)) {
723 				mutex_enter(&nce->nce_lock);
724 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
725 					NCE_REFHOLD_LOCKED(nce);
726 					mutex_exit(&nce->nce_lock);
727 					break;
728 				}
729 				mutex_exit(&nce->nce_lock);
730 			}
731 		}
732 	}
733 	return (nce);
734 }
735 
736 /*
737  * Cache entry lookup.  Try to find an nce matching the parameters passed.
738  * Look only for mappings.
739  */
740 static nce_t *
741 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
742 {
743 	nce_t	*nce;
744 	ip_stack_t	*ipst = ill->ill_ipst;
745 
746 	ASSERT(ill != NULL && ill->ill_isv6);
747 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
748 	if (!IN6_IS_ADDR_MULTICAST(addr))
749 		return (NULL);
750 	nce = ipst->ips_ndp6->nce_mask_entries;
751 	for (; nce != NULL; nce = nce->nce_next)
752 		if (nce->nce_ill == ill &&
753 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
754 			mutex_enter(&nce->nce_lock);
755 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
756 				NCE_REFHOLD_LOCKED(nce);
757 				mutex_exit(&nce->nce_lock);
758 				break;
759 			}
760 			mutex_exit(&nce->nce_lock);
761 		}
762 	return (nce);
763 }
764 
765 /*
766  * Process passed in parameters either from an incoming packet or via
767  * user ioctl.
768  */
769 void
770 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
771 {
772 	ill_t	*ill = nce->nce_ill;
773 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
774 	mblk_t	*mp;
775 	boolean_t ll_updated = B_FALSE;
776 	boolean_t ll_changed;
777 	ip_stack_t	*ipst = ill->ill_ipst;
778 
779 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
780 	/*
781 	 * No updates of link layer address or the neighbor state is
782 	 * allowed, when the cache is in NONUD state.  This still
783 	 * allows for responding to reachability solicitation.
784 	 */
785 	mutex_enter(&nce->nce_lock);
786 	if (nce->nce_state == ND_INCOMPLETE) {
787 		if (hw_addr == NULL) {
788 			mutex_exit(&nce->nce_lock);
789 			return;
790 		}
791 		nce_set_ll(nce, hw_addr);
792 		/*
793 		 * Update nce state and send the queued packets
794 		 * back to ip this time ire will be added.
795 		 */
796 		if (flag & ND_NA_FLAG_SOLICITED) {
797 			nce_update(nce, ND_REACHABLE, NULL);
798 		} else {
799 			nce_update(nce, ND_STALE, NULL);
800 		}
801 		mutex_exit(&nce->nce_lock);
802 		nce_fastpath(nce);
803 		mutex_enter(&nce->nce_lock);
804 		mp = nce->nce_qd_mp;
805 		nce->nce_qd_mp = NULL;
806 		mutex_exit(&nce->nce_lock);
807 		while (mp != NULL) {
808 			mblk_t *nxt_mp, *data_mp;
809 
810 			nxt_mp = mp->b_next;
811 			mp->b_next = NULL;
812 
813 			if (mp->b_datap->db_type == M_CTL)
814 				data_mp = mp->b_cont;
815 			else
816 				data_mp = mp;
817 			if (data_mp->b_prev != NULL) {
818 				ill_t   *inbound_ill;
819 				queue_t *fwdq = NULL;
820 				uint_t ifindex;
821 
822 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
823 				inbound_ill = ill_lookup_on_ifindex(ifindex,
824 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
825 				if (inbound_ill == NULL) {
826 					data_mp->b_prev = NULL;
827 					freemsg(mp);
828 					return;
829 				} else {
830 					fwdq = inbound_ill->ill_rq;
831 				}
832 				data_mp->b_prev = NULL;
833 				/*
834 				 * Send a forwarded packet back into ip_rput_v6
835 				 * just as in ire_send_v6().
836 				 * Extract the queue from b_prev (set in
837 				 * ip_rput_data_v6).
838 				 */
839 				if (fwdq != NULL) {
840 					/*
841 					 * Forwarded packets hop count will
842 					 * get decremented in ip_rput_data_v6
843 					 */
844 					if (data_mp != mp)
845 						freeb(mp);
846 					put(fwdq, data_mp);
847 				} else {
848 					/*
849 					 * Send locally originated packets back
850 					 * into * ip_wput_v6.
851 					 */
852 					put(ill->ill_wq, mp);
853 				}
854 				ill_refrele(inbound_ill);
855 			} else {
856 				put(ill->ill_wq, mp);
857 			}
858 			mp = nxt_mp;
859 		}
860 		return;
861 	}
862 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
863 	if (!is_adv) {
864 		/* If this is a SOLICITATION request only */
865 		if (ll_changed)
866 			nce_update(nce, ND_STALE, hw_addr);
867 		mutex_exit(&nce->nce_lock);
868 		return;
869 	}
870 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
871 		/* If in any other state than REACHABLE, ignore */
872 		if (nce->nce_state == ND_REACHABLE) {
873 			nce_update(nce, ND_STALE, NULL);
874 		}
875 		mutex_exit(&nce->nce_lock);
876 		return;
877 	} else {
878 		if (ll_changed) {
879 			nce_update(nce, ND_UNCHANGED, hw_addr);
880 			ll_updated = B_TRUE;
881 		}
882 		if (flag & ND_NA_FLAG_SOLICITED) {
883 			nce_update(nce, ND_REACHABLE, NULL);
884 		} else {
885 			if (ll_updated) {
886 				nce_update(nce, ND_STALE, NULL);
887 			}
888 		}
889 		mutex_exit(&nce->nce_lock);
890 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
891 		    NCE_F_ISROUTER)) {
892 			ire_t *ire;
893 
894 			/*
895 			 * Router turned to host.  We need to remove the
896 			 * entry as well as any default route that may be
897 			 * using this as a next hop.  This is required by
898 			 * section 7.2.5 of RFC 2461.
899 			 */
900 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
901 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
902 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
903 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
904 			    MATCH_IRE_DEFAULT, ipst);
905 			if (ire != NULL) {
906 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
907 				ire_delete(ire);
908 				ire_refrele(ire);
909 			}
910 			ndp_delete(nce);
911 		}
912 	}
913 }
914 
915 /*
916  * Pass arg1 to the pfi supplied, along with each nce in existence.
917  * ndp_walk() places a REFHOLD on the nce and drops the lock when
918  * walking the hash list.
919  */
920 void
921 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
922     boolean_t trace)
923 {
924 
925 	nce_t	*nce;
926 	nce_t	*nce1;
927 	nce_t	**ncep;
928 	nce_t	*free_nce_list = NULL;
929 
930 	mutex_enter(&ndp->ndp_g_lock);
931 	/* Prevent ndp_delete from unlink and free of NCE */
932 	ndp->ndp_g_walker++;
933 	mutex_exit(&ndp->ndp_g_lock);
934 	for (ncep = ndp->nce_hash_tbl;
935 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
936 		for (nce = *ncep; nce != NULL; nce = nce1) {
937 			nce1 = nce->nce_next;
938 			if (ill == NULL || nce->nce_ill == ill) {
939 				if (trace) {
940 					NCE_REFHOLD(nce);
941 					(*pfi)(nce, arg1);
942 					NCE_REFRELE(nce);
943 				} else {
944 					NCE_REFHOLD_NOTR(nce);
945 					(*pfi)(nce, arg1);
946 					NCE_REFRELE_NOTR(nce);
947 				}
948 			}
949 		}
950 	}
951 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
952 		nce1 = nce->nce_next;
953 		if (ill == NULL || nce->nce_ill == ill) {
954 			if (trace) {
955 				NCE_REFHOLD(nce);
956 				(*pfi)(nce, arg1);
957 				NCE_REFRELE(nce);
958 			} else {
959 				NCE_REFHOLD_NOTR(nce);
960 				(*pfi)(nce, arg1);
961 				NCE_REFRELE_NOTR(nce);
962 			}
963 		}
964 	}
965 	mutex_enter(&ndp->ndp_g_lock);
966 	ndp->ndp_g_walker--;
967 	/*
968 	 * While NCE's are removed from global list they are placed
969 	 * in a private list, to be passed to nce_ire_delete_list().
970 	 * The reason is, there may be ires pointing to this nce
971 	 * which needs to cleaned up.
972 	 */
973 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
974 		/* Time to delete condemned entries */
975 		for (ncep = ndp->nce_hash_tbl;
976 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
977 			nce = *ncep;
978 			if (nce != NULL) {
979 				nce_remove(ndp, nce, &free_nce_list);
980 			}
981 		}
982 		nce = ndp->nce_mask_entries;
983 		if (nce != NULL) {
984 			nce_remove(ndp, nce, &free_nce_list);
985 		}
986 		ndp->ndp_g_walker_cleanup = B_FALSE;
987 	}
988 
989 	mutex_exit(&ndp->ndp_g_lock);
990 
991 	if (free_nce_list != NULL) {
992 		nce_ire_delete_list(free_nce_list);
993 	}
994 }
995 
996 /*
997  * Walk everything.
998  * Note that ill can be NULL hence can't derive the ipst from it.
999  */
1000 void
1001 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1002 {
1003 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1004 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1005 }
1006 
1007 /*
1008  * Process resolve requests.  Handles both mapped entries
1009  * as well as cases that needs to be send out on the wire.
1010  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1011  * or one is created, we defer making ire point to nce until the
1012  * ire is actually added at which point the nce_refcnt on the nce is
1013  * incremented.  This is done primarily to have symmetry between ire_add()
1014  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1015  */
1016 int
1017 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1018 {
1019 	nce_t		*nce;
1020 	int		err = 0;
1021 	uint32_t	ms;
1022 	mblk_t		*mp_nce = NULL;
1023 	ip_stack_t	*ipst = ill->ill_ipst;
1024 
1025 	ASSERT(ill->ill_isv6);
1026 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1027 		err = nce_set_multicast(ill, dst);
1028 		return (err);
1029 	}
1030 	err = ndp_lookup_then_add_v6(ill,
1031 	    NULL,	/* No hardware address */
1032 	    dst,
1033 	    &ipv6_all_ones,
1034 	    &ipv6_all_zeros,
1035 	    0,
1036 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1037 	    ND_INCOMPLETE,
1038 	    &nce);
1039 
1040 	switch (err) {
1041 	case 0:
1042 		/*
1043 		 * New cache entry was created. Make sure that the state
1044 		 * is not ND_INCOMPLETE. It can be in some other state
1045 		 * even before we send out the solicitation as we could
1046 		 * get un-solicited advertisements.
1047 		 *
1048 		 * If this is an XRESOLV interface, simply return 0,
1049 		 * since we don't want to solicit just yet.
1050 		 */
1051 		if (ill->ill_flags & ILLF_XRESOLV) {
1052 			NCE_REFRELE(nce);
1053 			return (0);
1054 		}
1055 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1056 		mutex_enter(&nce->nce_lock);
1057 		if (nce->nce_state != ND_INCOMPLETE) {
1058 			mutex_exit(&nce->nce_lock);
1059 			rw_exit(&ipst->ips_ill_g_lock);
1060 			NCE_REFRELE(nce);
1061 			return (0);
1062 		}
1063 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1064 		if (mp_nce == NULL) {
1065 			/* The caller will free mp */
1066 			mutex_exit(&nce->nce_lock);
1067 			rw_exit(&ipst->ips_ill_g_lock);
1068 			ndp_delete(nce);
1069 			NCE_REFRELE(nce);
1070 			return (ENOMEM);
1071 		}
1072 		ms = nce_solicit(nce, mp_nce);
1073 		rw_exit(&ipst->ips_ill_g_lock);
1074 		if (ms == 0) {
1075 			/* The caller will free mp */
1076 			if (mp_nce != mp)
1077 				freeb(mp_nce);
1078 			mutex_exit(&nce->nce_lock);
1079 			ndp_delete(nce);
1080 			NCE_REFRELE(nce);
1081 			return (EBUSY);
1082 		}
1083 		mutex_exit(&nce->nce_lock);
1084 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1085 		NCE_REFRELE(nce);
1086 		return (EINPROGRESS);
1087 	case EEXIST:
1088 		/* Resolution in progress just queue the packet */
1089 		mutex_enter(&nce->nce_lock);
1090 		if (nce->nce_state == ND_INCOMPLETE) {
1091 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1092 			if (mp_nce == NULL) {
1093 				err = ENOMEM;
1094 			} else {
1095 				nce_queue_mp(nce, mp_nce);
1096 				err = EINPROGRESS;
1097 			}
1098 		} else {
1099 			/*
1100 			 * Any other state implies we have
1101 			 * a nce but IRE needs to be added ...
1102 			 * ire_add_v6() will take care of the
1103 			 * the case when the nce becomes CONDEMNED
1104 			 * before the ire is added to the table.
1105 			 */
1106 			err = 0;
1107 		}
1108 		mutex_exit(&nce->nce_lock);
1109 		NCE_REFRELE(nce);
1110 		break;
1111 	default:
1112 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1113 		break;
1114 	}
1115 	return (err);
1116 }
1117 
1118 /*
1119  * When there is no resolver, the link layer template is passed in
1120  * the IRE.
1121  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1122  * or one is created, we defer making ire point to nce until the
1123  * ire is actually added at which point the nce_refcnt on the nce is
1124  * incremented.  This is done primarily to have symmetry between ire_add()
1125  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1126  */
1127 int
1128 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1129 {
1130 	nce_t		*nce;
1131 	int		err = 0;
1132 
1133 	ASSERT(ill != NULL);
1134 	ASSERT(ill->ill_isv6);
1135 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1136 		err = nce_set_multicast(ill, dst);
1137 		return (err);
1138 	}
1139 
1140 	err = ndp_lookup_then_add_v6(ill,
1141 	    NULL,	/* hardware address */
1142 	    dst,
1143 	    &ipv6_all_ones,
1144 	    &ipv6_all_zeros,
1145 	    0,
1146 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1147 	    ND_REACHABLE,
1148 	    &nce);
1149 
1150 	switch (err) {
1151 	case 0:
1152 		/*
1153 		 * Cache entry with a proper resolver cookie was
1154 		 * created.
1155 		 */
1156 		NCE_REFRELE(nce);
1157 		break;
1158 	case EEXIST:
1159 		err = 0;
1160 		NCE_REFRELE(nce);
1161 		break;
1162 	default:
1163 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1164 		break;
1165 	}
1166 	return (err);
1167 }
1168 
1169 /*
1170  * For each interface an entry is added for the unspecified multicast group.
1171  * Here that mapping is used to form the multicast cache entry for a particular
1172  * multicast destination.
1173  */
1174 static int
1175 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1176 {
1177 	nce_t		*mnce;	/* Multicast mapping entry */
1178 	nce_t		*nce;
1179 	uchar_t		*hw_addr = NULL;
1180 	int		err = 0;
1181 	ip_stack_t	*ipst = ill->ill_ipst;
1182 
1183 	ASSERT(ill != NULL);
1184 	ASSERT(ill->ill_isv6);
1185 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1186 
1187 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1188 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1189 	nce = nce_lookup_addr(ill, dst, nce);
1190 	if (nce != NULL) {
1191 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1192 		NCE_REFRELE(nce);
1193 		return (0);
1194 	}
1195 	/* No entry, now lookup for a mapping this should never fail */
1196 	mnce = nce_lookup_mapping(ill, dst);
1197 	if (mnce == NULL) {
1198 		/* Something broken for the interface. */
1199 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1200 		return (ESRCH);
1201 	}
1202 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1203 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1204 		/*
1205 		 * For IRE_IF_RESOLVER a hardware mapping can be
1206 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1207 		 * in the ill is copied in ndp_add_v6().
1208 		 */
1209 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1210 		if (hw_addr == NULL) {
1211 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1212 			NCE_REFRELE(mnce);
1213 			return (ENOMEM);
1214 		}
1215 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1216 	}
1217 	NCE_REFRELE(mnce);
1218 	/*
1219 	 * IRE_IF_NORESOLVER type simply copies the resolution
1220 	 * cookie passed in.  So no hw_addr is needed.
1221 	 */
1222 	err = ndp_add_v6(ill,
1223 	    hw_addr,
1224 	    dst,
1225 	    &ipv6_all_ones,
1226 	    &ipv6_all_zeros,
1227 	    0,
1228 	    NCE_F_NONUD,
1229 	    ND_REACHABLE,
1230 	    &nce);
1231 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1232 	if (hw_addr != NULL)
1233 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1234 	if (err != 0) {
1235 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1236 		return (err);
1237 	}
1238 	NCE_REFRELE(nce);
1239 	return (0);
1240 }
1241 
1242 /*
1243  * Return the link layer address, and any flags of a nce.
1244  */
1245 int
1246 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1247 {
1248 	nce_t		*nce;
1249 	in6_addr_t	*addr;
1250 	sin6_t		*sin6;
1251 	dl_unitdata_req_t	*dl;
1252 
1253 	ASSERT(ill != NULL && ill->ill_isv6);
1254 	sin6 = (sin6_t *)&lnr->lnr_addr;
1255 	addr =  &sin6->sin6_addr;
1256 
1257 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1258 	if (nce == NULL)
1259 		return (ESRCH);
1260 	/* If in INCOMPLETE state, no link layer address is available yet */
1261 	if (nce->nce_state == ND_INCOMPLETE)
1262 		goto done;
1263 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1264 	if (ill->ill_flags & ILLF_XRESOLV)
1265 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1266 	else
1267 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1268 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1269 	    sizeof (lnr->lnr_hdw_addr));
1270 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1271 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1272 	if (nce->nce_flags & NCE_F_ISROUTER)
1273 		lnr->lnr_flags = NDF_ISROUTER_ON;
1274 	if (nce->nce_flags & NCE_F_ANYCAST)
1275 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1276 done:
1277 	NCE_REFRELE(nce);
1278 	return (0);
1279 }
1280 
1281 /*
1282  * Send Enable/Disable multicast reqs to driver.
1283  */
1284 int
1285 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1286     uint32_t hw_addr_offset, mblk_t *mp)
1287 {
1288 	nce_t		*nce;
1289 	uchar_t		*hw_addr;
1290 	ip_stack_t	*ipst = ill->ill_ipst;
1291 
1292 	ASSERT(ill != NULL && ill->ill_isv6);
1293 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1294 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1295 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1296 		freemsg(mp);
1297 		return (EINVAL);
1298 	}
1299 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1300 	nce = nce_lookup_mapping(ill, addr);
1301 	if (nce == NULL) {
1302 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1303 		freemsg(mp);
1304 		return (ESRCH);
1305 	}
1306 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1307 	/*
1308 	 * Update dl_addr_length and dl_addr_offset for primitives that
1309 	 * have physical addresses as opposed to full saps
1310 	 */
1311 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1312 	case DL_ENABMULTI_REQ:
1313 		/* Track the state if this is the first enabmulti */
1314 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1315 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1316 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1317 		break;
1318 	case DL_DISABMULTI_REQ:
1319 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1320 		break;
1321 	default:
1322 		NCE_REFRELE(nce);
1323 		ip1dbg(("ndp_mcastreq: default\n"));
1324 		return (EINVAL);
1325 	}
1326 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1327 	NCE_REFRELE(nce);
1328 	ill_dlpi_send(ill, mp);
1329 	return (0);
1330 }
1331 
1332 /*
1333  * Send a neighbor solicitation.
1334  * Returns number of milliseconds after which we should either rexmit or abort.
1335  * Return of zero means we should abort.
1336  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1337  *
1338  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1339  * the packet.
1340  * NOTE: This routine does not consume mp.
1341  */
1342 uint32_t
1343 nce_solicit(nce_t *nce, mblk_t *mp)
1344 {
1345 	ill_t		*ill;
1346 	ill_t		*src_ill;
1347 	ip6_t		*ip6h;
1348 	in6_addr_t	src;
1349 	in6_addr_t	dst;
1350 	ipif_t		*ipif;
1351 	ip6i_t		*ip6i;
1352 	boolean_t	dropped = B_FALSE;
1353 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
1354 
1355 	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
1356 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1357 	ill = nce->nce_ill;
1358 	ASSERT(ill != NULL);
1359 
1360 	if (nce->nce_rcnt == 0) {
1361 		return (0);
1362 	}
1363 
1364 	if (mp == NULL) {
1365 		ASSERT(nce->nce_qd_mp != NULL);
1366 		mp = nce->nce_qd_mp;
1367 	} else {
1368 		nce_queue_mp(nce, mp);
1369 	}
1370 
1371 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1372 	if (mp->b_datap->db_type == M_CTL)
1373 		mp = mp->b_cont;
1374 
1375 	ip6h = (ip6_t *)mp->b_rptr;
1376 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1377 		/*
1378 		 * This message should have been pulled up already in
1379 		 * ip_wput_v6. We can't do pullups here because the message
1380 		 * could be from the nce_qd_mp which could have b_next/b_prev
1381 		 * non-NULL.
1382 		 */
1383 		ip6i = (ip6i_t *)ip6h;
1384 		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
1385 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
1386 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1387 	}
1388 	src = ip6h->ip6_src;
1389 	/*
1390 	 * If the src of outgoing packet is one of the assigned interface
1391 	 * addresses use it, otherwise we will pick the source address below.
1392 	 */
1393 	src_ill = ill;
1394 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1395 		if (ill->ill_group != NULL)
1396 			src_ill = ill->ill_group->illgrp_ill;
1397 		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
1398 			for (ipif = src_ill->ill_ipif; ipif != NULL;
1399 			    ipif = ipif->ipif_next) {
1400 				if (IN6_ARE_ADDR_EQUAL(&src,
1401 				    &ipif->ipif_v6lcl_addr)) {
1402 					break;
1403 				}
1404 			}
1405 			if (ipif != NULL)
1406 				break;
1407 		}
1408 		/*
1409 		 * If no relevant ipif can be found, then it's not one of our
1410 		 * addresses.  Reset to :: and let nce_xmit.  If an ipif can be
1411 		 * found, but it's not yet done with DAD verification, then
1412 		 * just postpone this transmission until later.
1413 		 */
1414 		if (src_ill == NULL)
1415 			src = ipv6_all_zeros;
1416 		else if (!ipif->ipif_addr_ready)
1417 			return (ill->ill_reachable_retrans_time);
1418 	}
1419 	dst = nce->nce_addr;
1420 	/*
1421 	 * If source address is unspecified, nce_xmit will choose
1422 	 * one for us and initialize the hardware address also
1423 	 * appropriately.
1424 	 */
1425 	if (IN6_IS_ADDR_UNSPECIFIED(&src))
1426 		src_ill = NULL;
1427 	nce->nce_rcnt--;
1428 	mutex_exit(&nce->nce_lock);
1429 	rw_exit(&ipst->ips_ill_g_lock);
1430 	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
1431 	    &dst, 0);
1432 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1433 	mutex_enter(&nce->nce_lock);
1434 	if (dropped)
1435 		nce->nce_rcnt++;
1436 	return (ill->ill_reachable_retrans_time);
1437 }
1438 
1439 /*
1440  * Attempt to recover an address on an interface that's been marked as a
1441  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1442  * no easy way to just probe the address and have the right thing happen if
1443  * it's no longer in use.  Instead, we just bring it up normally and allow the
1444  * regular interface start-up logic to probe for a remaining duplicate and take
1445  * us back down if necessary.
1446  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1447  * ip_ndp_excl.
1448  */
1449 /* ARGSUSED */
1450 static void
1451 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1452 {
1453 	ill_t	*ill = rq->q_ptr;
1454 	ipif_t	*ipif;
1455 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1456 
1457 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1458 		/*
1459 		 * We do not support recovery of proxy ARP'd interfaces,
1460 		 * because the system lacks a complete proxy ARP mechanism.
1461 		 */
1462 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1463 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1464 			continue;
1465 		}
1466 
1467 		/*
1468 		 * If we have already recovered or if the interface is going
1469 		 * away, then ignore.
1470 		 */
1471 		mutex_enter(&ill->ill_lock);
1472 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1473 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1474 			mutex_exit(&ill->ill_lock);
1475 			continue;
1476 		}
1477 
1478 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1479 		ill->ill_ipif_dup_count--;
1480 		mutex_exit(&ill->ill_lock);
1481 		ipif->ipif_was_dup = B_TRUE;
1482 
1483 		if (ipif_ndp_up(ipif) != EINPROGRESS)
1484 			(void) ipif_up_done_v6(ipif);
1485 	}
1486 	freeb(mp);
1487 }
1488 
1489 /*
1490  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1491  * As long as someone else holds the address, the interface will stay down.
1492  * When that conflict goes away, the interface is brought back up.  This is
1493  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1494  * server will recover from a failure.
1495  *
1496  * For DHCP and temporary addresses, recovery is not done in the kernel.
1497  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1498  *
1499  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1500  */
1501 static void
1502 ipif6_dup_recovery(void *arg)
1503 {
1504 	ipif_t *ipif = arg;
1505 
1506 	ipif->ipif_recovery_id = 0;
1507 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1508 		return;
1509 
1510 	/*
1511 	 * No lock, because this is just an optimization.
1512 	 */
1513 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1514 		return;
1515 
1516 	/* If the link is down, we'll retry this later */
1517 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1518 		return;
1519 
1520 	ndp_do_recovery(ipif);
1521 }
1522 
1523 /*
1524  * Perform interface recovery by forcing the duplicate interfaces up and
1525  * allowing the system to determine which ones should stay up.
1526  *
1527  * Called both by recovery timer expiry and link-up notification.
1528  */
1529 void
1530 ndp_do_recovery(ipif_t *ipif)
1531 {
1532 	ill_t *ill = ipif->ipif_ill;
1533 	mblk_t *mp;
1534 	ip_stack_t *ipst = ill->ill_ipst;
1535 
1536 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1537 	if (mp == NULL) {
1538 		mutex_enter(&ill->ill_lock);
1539 		if (ipif->ipif_recovery_id == 0 &&
1540 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1541 		    IPIF_CONDEMNED))) {
1542 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1543 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1544 		}
1545 		mutex_exit(&ill->ill_lock);
1546 	} else {
1547 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1548 		    sizeof (ipif->ipif_v6lcl_addr));
1549 		ill_refhold(ill);
1550 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1551 		    B_FALSE);
1552 	}
1553 }
1554 
1555 /*
1556  * Find the solicitation in the given message, and extract printable details
1557  * (MAC and IP addresses) from it.
1558  */
1559 static nd_neighbor_solicit_t *
1560 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1561     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1562 {
1563 	nd_neighbor_solicit_t *ns;
1564 	ip6_t *ip6h;
1565 	uchar_t *addr;
1566 	int alen;
1567 
1568 	alen = 0;
1569 	ip6h = (ip6_t *)mp->b_rptr;
1570 	if (dl_mp == NULL) {
1571 		nd_opt_hdr_t *opt;
1572 		int nslen;
1573 
1574 		/*
1575 		 * If it's from the fast-path, then it can't be a probe
1576 		 * message, and thus must include the source linkaddr option.
1577 		 * Extract that here.
1578 		 */
1579 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1580 		nslen = mp->b_wptr - (uchar_t *)ns;
1581 		if ((nslen -= sizeof (*ns)) > 0) {
1582 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1583 			    ND_OPT_SOURCE_LINKADDR);
1584 			if (opt != NULL &&
1585 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1586 			    ill->ill_nd_lla_len) {
1587 				addr = (uchar_t *)(opt + 1);
1588 				alen = ill->ill_nd_lla_len;
1589 			}
1590 		}
1591 		/*
1592 		 * We cheat a bit here for the sake of printing usable log
1593 		 * messages in the rare case where the reply we got was unicast
1594 		 * without a source linkaddr option, and the interface is in
1595 		 * fastpath mode.  (Sigh.)
1596 		 */
1597 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1598 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1599 			struct ether_header *pether;
1600 
1601 			pether = (struct ether_header *)((char *)ip6h -
1602 			    sizeof (*pether));
1603 			addr = pether->ether_shost.ether_addr_octet;
1604 			alen = ETHERADDRL;
1605 		}
1606 	} else {
1607 		dl_unitdata_ind_t *dlu;
1608 
1609 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1610 		alen = dlu->dl_src_addr_length;
1611 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1612 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1613 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1614 			if (ill->ill_sap_length < 0) {
1615 				alen += ill->ill_sap_length;
1616 			} else {
1617 				addr += ill->ill_sap_length;
1618 				alen -= ill->ill_sap_length;
1619 			}
1620 		}
1621 	}
1622 	if (alen > 0) {
1623 		*haddr = addr;
1624 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1625 	} else {
1626 		*haddr = NULL;
1627 		(void) strcpy(hbuf, "?");
1628 	}
1629 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1630 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1631 	return (ns);
1632 }
1633 
1634 /*
1635  * This is for exclusive changes due to NDP duplicate address detection
1636  * failure.
1637  */
1638 /* ARGSUSED */
1639 static void
1640 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1641 {
1642 	ill_t	*ill = rq->q_ptr;
1643 	ipif_t	*ipif;
1644 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1645 	char hbuf[MAC_STR_LEN];
1646 	char sbuf[INET6_ADDRSTRLEN];
1647 	nd_neighbor_solicit_t *ns;
1648 	mblk_t *dl_mp = NULL;
1649 	uchar_t *haddr;
1650 	ip_stack_t *ipst = ill->ill_ipst;
1651 
1652 	if (DB_TYPE(mp) != M_DATA) {
1653 		dl_mp = mp;
1654 		mp = mp->b_cont;
1655 	}
1656 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1657 	    sizeof (sbuf), &haddr);
1658 	if (haddr != NULL &&
1659 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1660 		/*
1661 		 * Ignore conflicts generated by misbehaving switches that just
1662 		 * reflect our own messages back to us.
1663 		 */
1664 		goto ignore_conflict;
1665 	}
1666 
1667 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1668 
1669 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1670 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1671 		    &ns->nd_ns_target)) {
1672 			continue;
1673 		}
1674 
1675 		/* If it's already marked, then don't do anything. */
1676 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1677 			continue;
1678 
1679 		/*
1680 		 * If this is a failure during duplicate recovery, then don't
1681 		 * complain.  It may take a long time to recover.
1682 		 */
1683 		if (!ipif->ipif_was_dup) {
1684 			ipif_get_name(ipif, ibuf, sizeof (ibuf));
1685 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1686 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1687 		}
1688 		mutex_enter(&ill->ill_lock);
1689 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1690 		ipif->ipif_flags |= IPIF_DUPLICATE;
1691 		ill->ill_ipif_dup_count++;
1692 		mutex_exit(&ill->ill_lock);
1693 		(void) ipif_down(ipif, NULL, NULL);
1694 		ipif_down_tail(ipif);
1695 		mutex_enter(&ill->ill_lock);
1696 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1697 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1698 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1699 		    IPIF_CONDEMNED)) &&
1700 		    ipst->ips_ip_dup_recovery > 0) {
1701 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1702 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1703 		}
1704 		mutex_exit(&ill->ill_lock);
1705 	}
1706 ignore_conflict:
1707 	if (dl_mp != NULL)
1708 		freeb(dl_mp);
1709 	freemsg(mp);
1710 }
1711 
1712 /*
1713  * Handle failure by tearing down the ipifs with the specified address.  Note
1714  * that tearing down the ipif also means deleting the nce through ipif_down, so
1715  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1716  * we start a timer on the ipif.
1717  */
1718 static void
1719 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1720 {
1721 	if ((mp = copymsg(mp)) != NULL) {
1722 		if (dl_mp == NULL)
1723 			dl_mp = mp;
1724 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1725 			dl_mp->b_cont = mp;
1726 		if (dl_mp == NULL) {
1727 			freemsg(mp);
1728 		} else {
1729 			ill_refhold(ill);
1730 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1731 			    B_FALSE);
1732 		}
1733 	}
1734 	ndp_delete(nce);
1735 }
1736 
1737 /*
1738  * Handle a discovered conflict: some other system is advertising that it owns
1739  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1740  * interface.
1741  */
1742 static void
1743 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1744 {
1745 	ipif_t *ipif;
1746 	uint32_t now;
1747 	uint_t maxdefense;
1748 	uint_t defs;
1749 	ip_stack_t *ipst = ill->ill_ipst;
1750 
1751 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1752 	    NULL, NULL, ipst);
1753 	if (ipif == NULL)
1754 		return;
1755 	/*
1756 	 * First, figure out if this address is disposable.
1757 	 */
1758 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1759 		maxdefense = ipst->ips_ip_max_temp_defend;
1760 	else
1761 		maxdefense = ipst->ips_ip_max_defend;
1762 
1763 	/*
1764 	 * Now figure out how many times we've defended ourselves.  Ignore
1765 	 * defenses that happened long in the past.
1766 	 */
1767 	now = gethrestime_sec();
1768 	mutex_enter(&nce->nce_lock);
1769 	if ((defs = nce->nce_defense_count) > 0 &&
1770 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1771 		nce->nce_defense_count = defs = 0;
1772 	}
1773 	nce->nce_defense_count++;
1774 	nce->nce_defense_time = now;
1775 	mutex_exit(&nce->nce_lock);
1776 	ipif_refrele(ipif);
1777 
1778 	/*
1779 	 * If we've defended ourselves too many times already, then give up and
1780 	 * tear down the interface(s) using this address.  Otherwise, defend by
1781 	 * sending out an unsolicited Neighbor Advertisement.
1782 	 */
1783 	if (defs >= maxdefense) {
1784 		ip_ndp_failure(ill, mp, dl_mp, nce);
1785 	} else {
1786 		char hbuf[MAC_STR_LEN];
1787 		char sbuf[INET6_ADDRSTRLEN];
1788 		uchar_t *haddr;
1789 
1790 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1791 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1792 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1793 		    hbuf, sbuf, ill->ill_name);
1794 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1795 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1796 		    nce_advert_flags(nce));
1797 	}
1798 }
1799 
1800 static void
1801 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1802 {
1803 	nd_neighbor_solicit_t *ns;
1804 	uint32_t	hlen = ill->ill_nd_lla_len;
1805 	uchar_t		*haddr = NULL;
1806 	icmp6_t		*icmp_nd;
1807 	ip6_t		*ip6h;
1808 	nce_t		*our_nce = NULL;
1809 	in6_addr_t	target;
1810 	in6_addr_t	src;
1811 	int		len;
1812 	int		flag = 0;
1813 	nd_opt_hdr_t	*opt = NULL;
1814 	boolean_t	bad_solicit = B_FALSE;
1815 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1816 
1817 	ip6h = (ip6_t *)mp->b_rptr;
1818 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1819 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1820 	src = ip6h->ip6_src;
1821 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1822 	target = ns->nd_ns_target;
1823 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1824 		if (ip_debug > 2) {
1825 			/* ip1dbg */
1826 			pr_addr_dbg("ndp_input_solicit: Target is"
1827 			    " multicast! %s\n", AF_INET6, &target);
1828 		}
1829 		bad_solicit = B_TRUE;
1830 		goto done;
1831 	}
1832 	if (len > sizeof (nd_neighbor_solicit_t)) {
1833 		/* Options present */
1834 		opt = (nd_opt_hdr_t *)&ns[1];
1835 		len -= sizeof (nd_neighbor_solicit_t);
1836 		if (!ndp_verify_optlen(opt, len)) {
1837 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1838 			bad_solicit = B_TRUE;
1839 			goto done;
1840 		}
1841 	}
1842 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1843 		/* Check to see if this is a valid DAD solicitation */
1844 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1845 			if (ip_debug > 2) {
1846 				/* ip1dbg */
1847 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1848 				    "Destination is not solicited node "
1849 				    "multicast %s\n", AF_INET6,
1850 				    &ip6h->ip6_dst);
1851 			}
1852 			bad_solicit = B_TRUE;
1853 			goto done;
1854 		}
1855 	}
1856 
1857 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1858 	/*
1859 	 * If this is a valid Solicitation, a permanent
1860 	 * entry should exist in the cache
1861 	 */
1862 	if (our_nce == NULL ||
1863 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1864 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1865 		    "ifname=%s ", ill->ill_name));
1866 		if (ip_debug > 2) {
1867 			/* ip1dbg */
1868 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1869 		}
1870 		bad_solicit = B_TRUE;
1871 		goto done;
1872 	}
1873 
1874 	/* At this point we should have a verified NS per spec */
1875 	if (opt != NULL) {
1876 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1877 		if (opt != NULL) {
1878 			haddr = (uchar_t *)&opt[1];
1879 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1880 			    hlen == 0) {
1881 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1882 				bad_solicit = B_TRUE;
1883 				goto done;
1884 			}
1885 		}
1886 	}
1887 
1888 	/* If sending directly to peer, set the unicast flag */
1889 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1890 		flag |= NDP_UNICAST;
1891 
1892 	/*
1893 	 * Create/update the entry for the soliciting node.
1894 	 * or respond to outstanding queries, don't if
1895 	 * the source is unspecified address.
1896 	 */
1897 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1898 		int	err;
1899 		nce_t	*nnce;
1900 
1901 		ASSERT(ill->ill_isv6);
1902 		/*
1903 		 * Regular solicitations *must* include the Source Link-Layer
1904 		 * Address option.  Ignore messages that do not.
1905 		 */
1906 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1907 			ip1dbg(("ndp_input_solicit: source link-layer address "
1908 			    "option missing with a specified source.\n"));
1909 			bad_solicit = B_TRUE;
1910 			goto done;
1911 		}
1912 
1913 		/*
1914 		 * This is a regular solicitation.  If we're still in the
1915 		 * process of verifying the address, then don't respond at all
1916 		 * and don't keep track of the sender.
1917 		 */
1918 		if (our_nce->nce_state == ND_PROBE)
1919 			goto done;
1920 
1921 		/*
1922 		 * If the solicitation doesn't have sender hardware address
1923 		 * (legal for unicast solicitation), then process without
1924 		 * installing the return NCE.  Either we already know it, or
1925 		 * we'll be forced to look it up when (and if) we reply to the
1926 		 * packet.
1927 		 */
1928 		if (haddr == NULL)
1929 			goto no_source;
1930 
1931 		err = ndp_lookup_then_add_v6(ill,
1932 		    haddr,
1933 		    &src,	/* Soliciting nodes address */
1934 		    &ipv6_all_ones,
1935 		    &ipv6_all_zeros,
1936 		    0,
1937 		    0,
1938 		    ND_STALE,
1939 		    &nnce);
1940 		switch (err) {
1941 		case 0:
1942 			/* done with this entry */
1943 			NCE_REFRELE(nnce);
1944 			break;
1945 		case EEXIST:
1946 			/*
1947 			 * B_FALSE indicates this is not an
1948 			 * an advertisement.
1949 			 */
1950 			ndp_process(nnce, haddr, 0, B_FALSE);
1951 			NCE_REFRELE(nnce);
1952 			break;
1953 		default:
1954 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1955 			    err));
1956 			goto done;
1957 		}
1958 no_source:
1959 		flag |= NDP_SOLICITED;
1960 	} else {
1961 		/*
1962 		 * No source link layer address option should be present in a
1963 		 * valid DAD request.
1964 		 */
1965 		if (haddr != NULL) {
1966 			ip1dbg(("ndp_input_solicit: source link-layer address "
1967 			    "option present with an unspecified source.\n"));
1968 			bad_solicit = B_TRUE;
1969 			goto done;
1970 		}
1971 		if (our_nce->nce_state == ND_PROBE) {
1972 			/*
1973 			 * Internally looped-back probes won't have DLPI
1974 			 * attached to them.  External ones (which are sent by
1975 			 * multicast) always will.  Just ignore our own
1976 			 * transmissions.
1977 			 */
1978 			if (dl_mp != NULL) {
1979 				/*
1980 				 * If someone else is probing our address, then
1981 				 * we've crossed wires.  Declare failure.
1982 				 */
1983 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
1984 			}
1985 			goto done;
1986 		}
1987 		/*
1988 		 * This is a DAD probe.  Multicast the advertisement to the
1989 		 * all-nodes address.
1990 		 */
1991 		src = ipv6_all_hosts_mcast;
1992 	}
1993 	flag |= nce_advert_flags(our_nce);
1994 	/* Response to a solicitation */
1995 	(void) nce_xmit(ill,
1996 	    ND_NEIGHBOR_ADVERT,
1997 	    ill,	/* ill to be used for extracting ill_nd_lla */
1998 	    B_TRUE,	/* use ill_nd_lla */
1999 	    &target,	/* Source and target of the advertisement pkt */
2000 	    &src,	/* IP Destination (source of original pkt) */
2001 	    flag);
2002 done:
2003 	if (bad_solicit)
2004 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2005 	if (our_nce != NULL)
2006 		NCE_REFRELE(our_nce);
2007 }
2008 
2009 void
2010 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2011 {
2012 	nd_neighbor_advert_t *na;
2013 	uint32_t	hlen = ill->ill_nd_lla_len;
2014 	uchar_t		*haddr = NULL;
2015 	icmp6_t		*icmp_nd;
2016 	ip6_t		*ip6h;
2017 	nce_t		*dst_nce = NULL;
2018 	in6_addr_t	target;
2019 	nd_opt_hdr_t	*opt = NULL;
2020 	int		len;
2021 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2022 	ip_stack_t	*ipst = ill->ill_ipst;
2023 
2024 	ip6h = (ip6_t *)mp->b_rptr;
2025 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2026 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2027 	na = (nd_neighbor_advert_t *)icmp_nd;
2028 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2029 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2030 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2031 		    "solicited flag is not zero\n"));
2032 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2033 		return;
2034 	}
2035 	target = na->nd_na_target;
2036 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2037 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2038 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2039 		return;
2040 	}
2041 	if (len > sizeof (nd_neighbor_advert_t)) {
2042 		opt = (nd_opt_hdr_t *)&na[1];
2043 		if (!ndp_verify_optlen(opt,
2044 		    len - sizeof (nd_neighbor_advert_t))) {
2045 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2046 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2047 			return;
2048 		}
2049 		/* At this point we have a verified NA per spec */
2050 		len -= sizeof (nd_neighbor_advert_t);
2051 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2052 		if (opt != NULL) {
2053 			haddr = (uchar_t *)&opt[1];
2054 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2055 			    hlen == 0) {
2056 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2057 				BUMP_MIB(mib,
2058 				    ipv6IfIcmpInBadNeighborAdvertisements);
2059 				return;
2060 			}
2061 		}
2062 	}
2063 
2064 	/*
2065 	 * If this interface is part of the group look at all the
2066 	 * ills in the group.
2067 	 */
2068 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2069 	if (ill->ill_group != NULL)
2070 		ill = ill->ill_group->illgrp_ill;
2071 
2072 	for (; ill != NULL; ill = ill->ill_group_next) {
2073 		mutex_enter(&ill->ill_lock);
2074 		if (!ILL_CAN_LOOKUP(ill)) {
2075 			mutex_exit(&ill->ill_lock);
2076 			continue;
2077 		}
2078 		ill_refhold_locked(ill);
2079 		mutex_exit(&ill->ill_lock);
2080 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2081 		/* We have to drop the lock since ndp_process calls put* */
2082 		rw_exit(&ipst->ips_ill_g_lock);
2083 		if (dst_nce != NULL) {
2084 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2085 			    dst_nce->nce_state == ND_PROBE) {
2086 				/*
2087 				 * Someone else sent an advertisement for an
2088 				 * address that we're trying to configure.
2089 				 * Tear it down.  Note that dl_mp might be NULL
2090 				 * if we're getting a unicast reply.  This
2091 				 * isn't typically done (multicast is the norm
2092 				 * in response to a probe), but ip_ndp_failure
2093 				 * will handle the dl_mp == NULL case as well.
2094 				 */
2095 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2096 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2097 				/*
2098 				 * Someone just announced one of our local
2099 				 * addresses.  If it wasn't us, then this is a
2100 				 * conflict.  Defend the address or shut it
2101 				 * down.
2102 				 */
2103 				if (dl_mp != NULL &&
2104 				    (haddr == NULL ||
2105 				    nce_cmp_ll_addr(dst_nce, haddr,
2106 				    ill->ill_nd_lla_len))) {
2107 					ip_ndp_conflict(ill, mp, dl_mp,
2108 					    dst_nce);
2109 				}
2110 			} else {
2111 				if (na->nd_na_flags_reserved &
2112 				    ND_NA_FLAG_ROUTER) {
2113 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2114 				}
2115 				/* B_TRUE indicates this an advertisement */
2116 				ndp_process(dst_nce, haddr,
2117 				    na->nd_na_flags_reserved, B_TRUE);
2118 			}
2119 			NCE_REFRELE(dst_nce);
2120 		}
2121 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2122 		ill_refrele(ill);
2123 	}
2124 	rw_exit(&ipst->ips_ill_g_lock);
2125 }
2126 
2127 /*
2128  * Process NDP neighbor solicitation/advertisement messages.
2129  * The checksum has already checked o.k before reaching here.
2130  */
2131 void
2132 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2133 {
2134 	icmp6_t		*icmp_nd;
2135 	ip6_t		*ip6h;
2136 	int		len;
2137 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2138 
2139 
2140 	if (!pullupmsg(mp, -1)) {
2141 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2142 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2143 		goto done;
2144 	}
2145 	ip6h = (ip6_t *)mp->b_rptr;
2146 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2147 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2148 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2149 		goto done;
2150 	}
2151 	/*
2152 	 * NDP does not accept any extension headers between the
2153 	 * IP header and the ICMP header since e.g. a routing
2154 	 * header could be dangerous.
2155 	 * This assumes that any AH or ESP headers are removed
2156 	 * by ip prior to passing the packet to ndp_input.
2157 	 */
2158 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2159 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2160 		    ip6h->ip6_nxt));
2161 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2162 		goto done;
2163 	}
2164 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2165 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2166 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2167 	if (icmp_nd->icmp6_code != 0) {
2168 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2169 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2170 		goto done;
2171 	}
2172 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2173 	/*
2174 	 * Make sure packet length is large enough for either
2175 	 * a NS or a NA icmp packet.
2176 	 */
2177 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2178 		ip1dbg(("ndp_input: packet too short\n"));
2179 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2180 		goto done;
2181 	}
2182 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2183 		ndp_input_solicit(ill, mp, dl_mp);
2184 	} else {
2185 		ndp_input_advert(ill, mp, dl_mp);
2186 	}
2187 done:
2188 	freemsg(mp);
2189 }
2190 
2191 /*
2192  * nce_xmit is called to form and transmit a ND solicitation or
2193  * advertisement ICMP packet.
2194  *
2195  * If the source address is unspecified and this isn't a probe (used for
2196  * duplicate address detection), an appropriate source address and link layer
2197  * address will be chosen here.  The link layer address option is included if
2198  * the source is specified (i.e., all non-probe packets), and omitted (per the
2199  * specification) otherwise.
2200  *
2201  * It returns B_FALSE only if it does a successful put() to the
2202  * corresponding ill's ill_wq otherwise returns B_TRUE.
2203  */
2204 static boolean_t
2205 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2206     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2207     int flag)
2208 {
2209 	uint32_t	len;
2210 	icmp6_t 	*icmp6;
2211 	mblk_t		*mp;
2212 	ip6_t		*ip6h;
2213 	nd_opt_hdr_t	*opt;
2214 	uint_t		plen;
2215 	ip6i_t		*ip6i;
2216 	ipif_t		*src_ipif = NULL;
2217 	uint8_t		*hw_addr;
2218 	zoneid_t	zoneid = GLOBAL_ZONEID;
2219 
2220 	/*
2221 	 * If we have a unspecified source(sender) address, select a
2222 	 * proper source address for the solicitation here itself so
2223 	 * that we can initialize the h/w address correctly. This is
2224 	 * needed for interface groups as source address can come from
2225 	 * the whole group and the h/w address initialized from ill will
2226 	 * be wrong if the source address comes from a different ill.
2227 	 *
2228 	 * If the sender is specified then we use this address in order
2229 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2230 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2231 	 * by IP (we cannot guarantee that the global zone has an interface
2232 	 * route to the destination).
2233 	 *
2234 	 * Note that the NA never comes here with the unspecified source
2235 	 * address. The following asserts that whenever the source
2236 	 * address is specified, the haddr also should be specified.
2237 	 */
2238 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2239 
2240 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2241 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2242 		/*
2243 		 * Pick a source address for this solicitation, but
2244 		 * restrict the selection to addresses assigned to the
2245 		 * output interface (or interface group).  We do this
2246 		 * because the destination will create a neighbor cache
2247 		 * entry for the source address of this packet, so the
2248 		 * source address had better be a valid neighbor.
2249 		 */
2250 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2251 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2252 		if (src_ipif == NULL) {
2253 			char buf[INET6_ADDRSTRLEN];
2254 
2255 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2256 			    inet_ntop(AF_INET6, (char *)target, buf,
2257 			    sizeof (buf))));
2258 			return (B_TRUE);
2259 		}
2260 		sender = &src_ipif->ipif_v6src_addr;
2261 		hwaddr_ill = src_ipif->ipif_ill;
2262 	} else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2263 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
2264 		/*
2265 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2266 		 * ALL_ZONES if it cannot find a matching ipif for the address
2267 		 * we are trying to use. In this case we err on the side of
2268 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2269 		 */
2270 		if (zoneid == ALL_ZONES)
2271 			zoneid = GLOBAL_ZONEID;
2272 	}
2273 
2274 	/*
2275 	 * Always make sure that the NS/NA packets don't get load
2276 	 * spread. This is needed so that the probe packets sent
2277 	 * by the in.mpathd daemon can really go out on the desired
2278 	 * interface. Probe packets are made to go out on a desired
2279 	 * interface by including a ip6i with ATTACH_IF flag. As these
2280 	 * packets indirectly end up sending/receiving NS/NA packets
2281 	 * (neighbor doing NUD), we have to make sure that NA
2282 	 * also go out on the same interface.
2283 	 */
2284 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2285 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2286 	    plen * 8;
2287 	mp = allocb(len,  BPRI_LO);
2288 	if (mp == NULL) {
2289 		if (src_ipif != NULL)
2290 			ipif_refrele(src_ipif);
2291 		return (B_TRUE);
2292 	}
2293 	bzero((char *)mp->b_rptr, len);
2294 	mp->b_wptr = mp->b_rptr + len;
2295 
2296 	ip6i = (ip6i_t *)mp->b_rptr;
2297 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2298 	ip6i->ip6i_nxt = IPPROTO_RAW;
2299 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2300 	if (flag & NDP_PROBE)
2301 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2302 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2303 
2304 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2305 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2306 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2307 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2308 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2309 	ip6h->ip6_dst = *target;
2310 	icmp6 = (icmp6_t *)&ip6h[1];
2311 
2312 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2313 	    sizeof (nd_neighbor_advert_t));
2314 
2315 	if (operation == ND_NEIGHBOR_SOLICIT) {
2316 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2317 
2318 		if (!(flag & NDP_PROBE))
2319 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2320 		ip6h->ip6_src = *sender;
2321 		ns->nd_ns_target = *target;
2322 		if (!(flag & NDP_UNICAST)) {
2323 			/* Form multicast address of the target */
2324 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2325 			ip6h->ip6_dst.s6_addr32[3] |=
2326 			    ns->nd_ns_target.s6_addr32[3];
2327 		}
2328 	} else {
2329 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2330 
2331 		ASSERT(!(flag & NDP_PROBE));
2332 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2333 		ip6h->ip6_src = *sender;
2334 		na->nd_na_target = *sender;
2335 		if (flag & NDP_ISROUTER)
2336 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2337 		if (flag & NDP_SOLICITED)
2338 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2339 		if (flag & NDP_ORIDE)
2340 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2341 	}
2342 
2343 	hw_addr = NULL;
2344 	if (!(flag & NDP_PROBE)) {
2345 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2346 		    hwaddr_ill->ill_phys_addr;
2347 		if (hw_addr != NULL) {
2348 			/* Fill in link layer address and option len */
2349 			opt->nd_opt_len = (uint8_t)plen;
2350 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2351 		}
2352 	}
2353 	if (hw_addr == NULL) {
2354 		/* If there's no link layer address option, then strip it. */
2355 		len -= plen * 8;
2356 		mp->b_wptr = mp->b_rptr + len;
2357 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2358 	}
2359 
2360 	icmp6->icmp6_type = (uint8_t)operation;
2361 	icmp6->icmp6_code = 0;
2362 	/*
2363 	 * Prepare for checksum by putting icmp length in the icmp
2364 	 * checksum field. The checksum is calculated in ip_wput_v6.
2365 	 */
2366 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2367 
2368 	if (src_ipif != NULL)
2369 		ipif_refrele(src_ipif);
2370 
2371 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2372 	return (B_FALSE);
2373 }
2374 
2375 /*
2376  * Make a link layer address (does not include the SAP) from an nce.
2377  * To form the link layer address, use the last four bytes of ipv6
2378  * address passed in and the fixed offset stored in nce.
2379  */
2380 static void
2381 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2382 {
2383 	uchar_t *mask, *to;
2384 	ill_t	*ill = nce->nce_ill;
2385 	int 	len;
2386 
2387 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2388 		return;
2389 	ASSERT(nce->nce_res_mp != NULL);
2390 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2391 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2392 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2393 	ASSERT(addr != NULL);
2394 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2395 	    addrpos, ill->ill_nd_lla_len);
2396 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2397 	    IPV6_ADDR_LEN);
2398 	mask = (uchar_t *)&nce->nce_extract_mask;
2399 	mask += (IPV6_ADDR_LEN - len);
2400 	addr += (IPV6_ADDR_LEN - len);
2401 	to = addrpos + nce->nce_ll_extract_start;
2402 	while (len-- > 0)
2403 		*to++ |= *mask++ & *addr++;
2404 }
2405 
2406 mblk_t *
2407 nce_udreq_alloc(ill_t *ill)
2408 {
2409 	mblk_t	*template_mp = NULL;
2410 	dl_unitdata_req_t *dlur;
2411 	int	sap_length;
2412 
2413 	ASSERT(ill->ill_isv6);
2414 
2415 	sap_length = ill->ill_sap_length;
2416 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2417 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2418 	if (template_mp == NULL)
2419 		return (NULL);
2420 
2421 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2422 	dlur->dl_priority.dl_min = 0;
2423 	dlur->dl_priority.dl_max = 0;
2424 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2425 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2426 
2427 	/* Copy in the SAP value. */
2428 	NCE_LL_SAP_COPY(ill, template_mp);
2429 
2430 	return (template_mp);
2431 }
2432 
2433 /*
2434  * NDP retransmit timer.
2435  * This timer goes off when:
2436  * a. It is time to retransmit NS for resolver.
2437  * b. It is time to send reachability probes.
 *
 * arg is the nce_t whose timer fired.  The handler takes ill_g_lock (as
 * reader) and nce_lock, holds a reference on the nce, clears
 * nce_timeout_id and then acts on nce_state:
 *
 *	ND_DELAY	move to ND_PROBE, send a unicast NS and restart the
 *			timer with ill_reachable_retrans_time.
 *	ND_PROBE	count down nce_pcnt, retransmitting the NS while it
 *			is positive.  When it reaches zero a non-permanent
 *			entry waits one more interval, while a permanent
 *			entry (one of our own addresses being probed)
 *			becomes ND_REACHABLE.  When it goes negative the
 *			nce is marked ND_UNREACHABLE and deleted.
 *	ND_INCOMPLETE	if packets are queued, call nce_solicit(); restart
 *			the timer with the interval it returns, or, when it
 *			returns 0 and the entry is not ND_REACHABLE, fail
 *			the queued packets and delete the nce.
 *	ND_REACHABLE	send another NA to the all-hosts group when
 *			unsolicited advertisements remain to be sent or the
 *			entry is permanent and a defense interval is set.
 *
 * Every path drops ill_g_lock and nce_lock and releases the reference
 * taken here before returning.
2438  */
2439 void
2440 ndp_timer(void *arg)
2441 {
2442 	nce_t		*nce = arg;
2443 	ill_t		*ill = nce->nce_ill;
2444 	uint32_t	ms;
2445 	char		addrbuf[INET6_ADDRSTRLEN];
2446 	mblk_t		*mp;
2447 	boolean_t	dropped = B_FALSE;
2448 	ip_stack_t	*ipst = ill->ill_ipst;
2449 
2450 	/*
2451 	 * The timer has to be cancelled by ndp_delete before doing the final
2452 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2453 	 * until it clears the timeout_id. Before clearing the timeout_id
2454 	 * bump up the refcnt so that we can continue to use the nce
2455 	 */
2456 	ASSERT(nce != NULL);
2457 
2458 	/*
2459 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2460 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2461 	 */
2462 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2463 	mutex_enter(&nce->nce_lock);
2464 	NCE_REFHOLD_LOCKED(nce);
2465 	nce->nce_timeout_id = 0;
2466 
2467 	/*
2468 	 * Check the reachability state first.
2469 	 */
2470 	switch (nce->nce_state) {
2471 	case ND_DELAY:
2472 		rw_exit(&ipst->ips_ill_g_lock);
2473 		nce->nce_state = ND_PROBE;
2474 		mutex_exit(&nce->nce_lock);
		/*
		 * The sender is left unspecified and this is not a probe, so
		 * nce_xmit() selects the source address itself.
		 */
2475 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2476 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2477 		if (ip_debug > 3) {
2478 			/* ip2dbg */
2479 			pr_addr_dbg("ndp_timer: state for %s changed "
2480 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2481 		}
2482 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2483 		NCE_REFRELE(nce);
2484 		return;
2485 	case ND_PROBE:
2486 		/* must be retransmit timer */
2487 		rw_exit(&ipst->ips_ill_g_lock);
2488 		nce->nce_pcnt--;
2489 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2490 		    nce->nce_pcnt >= -1);
2491 		if (nce->nce_pcnt > 0) {
2492 			/*
2493 			 * As per RFC2461, the nce gets deleted after
2494 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2495 			 * Note that the first unicast solicitation is sent
2496 			 * during the DELAY state.
2497 			 */
2498 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2499 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2500 			    addrbuf, sizeof (addrbuf))));
2501 			mutex_exit(&nce->nce_lock);
			/*
			 * Permanent entries are sent with NDP_PROBE, all
			 * others as a unicast solicitation.
			 */
2502 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2503 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2504 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2505 			    NDP_UNICAST);
2506 			if (dropped) {
				/* Nothing was sent; undo the decrement above. */
2507 				mutex_enter(&nce->nce_lock);
2508 				nce->nce_pcnt++;
2509 				mutex_exit(&nce->nce_lock);
2510 			}
2511 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2512 		} else if (nce->nce_pcnt < 0) {
2513 			/* No hope, delete the nce */
2514 			nce->nce_state = ND_UNREACHABLE;
2515 			mutex_exit(&nce->nce_lock);
2516 			if (ip_debug > 2) {
2517 				/* ip1dbg */
2518 				pr_addr_dbg("ndp_timer: Delete IRE for"
2519 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2520 			}
2521 			ndp_delete(nce);
2522 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2523 			/* Wait RetransTimer, before deleting the entry */
2524 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2525 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2526 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2527 			mutex_exit(&nce->nce_lock);
2528 			/* Wait one interval before killing */
2529 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2530 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2531 			ipif_t *ipif;
2532 
2533 			/*
2534 			 * We're done probing, and we can now declare this
2535 			 * address to be usable.  Let IP know that it's ok to
2536 			 * use.
2537 			 */
2538 			nce->nce_state = ND_REACHABLE;
2539 			mutex_exit(&nce->nce_lock);
2540 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2541 			    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
2542 			if (ipif != NULL) {
2543 				if (ipif->ipif_was_dup) {
2544 					char ibuf[LIFNAMSIZ + 10];
2545 					char sbuf[INET6_ADDRSTRLEN];
2546 
2547 					ipif->ipif_was_dup = B_FALSE;
2548 					(void) inet_ntop(AF_INET6,
2549 					    &ipif->ipif_v6lcl_addr,
2550 					    sbuf, sizeof (sbuf));
2551 					ipif_get_name(ipif, ibuf,
2552 					    sizeof (ibuf));
2553 					cmn_err(CE_NOTE, "recovered address "
2554 					    "%s on %s", sbuf, ibuf);
2555 				}
				/*
				 * Send the routing socket and SCTP
				 * notifications only the first time an up
				 * address becomes ready.
				 */
2556 				if ((ipif->ipif_flags & IPIF_UP) &&
2557 				    !ipif->ipif_addr_ready) {
2558 					ip_rts_ifmsg(ipif);
2559 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2560 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2561 				}
2562 				ipif->ipif_addr_ready = 1;
2563 				ipif_refrele(ipif);
2564 			}
2565 			/* Begin defending our new address */
2566 			nce->nce_unsolicit_count = 0;
2567 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2568 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2569 			    nce_advert_flags(nce));
2570 			if (dropped) {
				/*
				 * The advertisement was not sent; leave one
				 * pending so the ND_REACHABLE case retries it.
				 */
2571 				nce->nce_unsolicit_count = 1;
2572 				NDP_RESTART_TIMER(nce,
2573 				    ipst->ips_ip_ndp_unsolicit_interval);
2574 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2575 				NDP_RESTART_TIMER(nce,
2576 				    ipst->ips_ip_ndp_defense_interval);
2577 			}
2578 		} else {
2579 			/*
2580 			 * This is an address we're probing to be our own, but
2581 			 * the ill is down.  Wait until it comes back before
2582 			 * doing anything, but switch to reachable state so
2583 			 * that the restart will work.
2584 			 */
2585 			nce->nce_state = ND_REACHABLE;
2586 			mutex_exit(&nce->nce_lock);
2587 		}
2588 		NCE_REFRELE(nce);
2589 		return;
2590 	case ND_INCOMPLETE:
2591 		/*
2592 		 * Must be resolvers retransmit timer.
2593 		 */
2594 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2595 			ip6i_t	*ip6i;
2596 			ip6_t	*ip6h;
2597 			mblk_t *data_mp;
2598 
2599 			/*
2600 			 * Walk the list of packets queued, and see if there
2601 			 * are any multipathing probe packets. Such packets
2602 			 * are always queued at the head. Since this is a
2603 			 * retransmit timer firing, mark such packets as
2604 			 * delayed in ND resolution. This info will be used
2605 			 * in ip_wput_v6(). Multipathing probe packets will
2606 			 * always have an ip6i_t. Once we hit a packet without
2607 			 * it, we can break out of this loop.
2608 			 */
2609 			if (mp->b_datap->db_type == M_CTL)
2610 				data_mp = mp->b_cont;
2611 			else
2612 				data_mp = mp;
2613 
			/* A leading ip6i_t is recognized by IPPROTO_RAW. */
2614 			ip6h = (ip6_t *)data_mp->b_rptr;
2615 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2616 				break;
2617 
2618 			/*
2619 			 * This message should have been pulled up already in
2620 			 * ip_wput_v6. We can't do pullups here because the
2621 			 * b_next/b_prev is non-NULL.
2622 			 */
2623 			ip6i = (ip6i_t *)ip6h;
2624 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2625 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2626 
2627 			/* Mark this packet as delayed due to ND resolution */
2628 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2629 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2630 		}
2631 		if (nce->nce_qd_mp != NULL) {
			/*
			 * NOTE(review): a return of 0 presumably means
			 * nce_solicit() has no retransmissions left; confirm
			 * against nce_solicit().
			 */
2632 			ms = nce_solicit(nce, NULL);
2633 			rw_exit(&ipst->ips_ill_g_lock);
2634 			if (ms == 0) {
2635 				if (nce->nce_state != ND_REACHABLE) {
2636 					mutex_exit(&nce->nce_lock);
2637 					nce_resolv_failed(nce);
2638 					ndp_delete(nce);
2639 				} else {
2640 					mutex_exit(&nce->nce_lock);
2641 				}
2642 			} else {
2643 				mutex_exit(&nce->nce_lock);
2644 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2645 			}
2646 			NCE_REFRELE(nce);
2647 			return;
2648 		}
		/* Nothing is queued, so there is nothing to resolve for. */
2649 		mutex_exit(&nce->nce_lock);
2650 		rw_exit(&ipst->ips_ill_g_lock);
2651 		NCE_REFRELE(nce);
2652 		break;
2653 	case ND_REACHABLE :
2654 		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * Advertise again if unsolicited advertisements remain to be
		 * sent, or if the entry is permanent and a defense interval
		 * is configured.
		 */
2655 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2656 		    nce->nce_unsolicit_count != 0) ||
2657 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2658 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2659 			if (nce->nce_unsolicit_count > 0)
2660 				nce->nce_unsolicit_count--;
2661 			mutex_exit(&nce->nce_lock);
2662 			dropped = nce_xmit(ill,
2663 			    ND_NEIGHBOR_ADVERT,
2664 			    ill,	/* ill to be used for hw addr */
2665 			    B_FALSE,	/* use ill_phys_addr */
2666 			    &nce->nce_addr,
2667 			    &ipv6_all_hosts_mcast,
2668 			    nce_advert_flags(nce));
2669 			if (dropped) {
				/* Nothing was sent; restore the count. */
2670 				mutex_enter(&nce->nce_lock);
2671 				nce->nce_unsolicit_count++;
2672 				mutex_exit(&nce->nce_lock);
2673 			}
			/*
			 * Use the unsolicit interval while advertisements
			 * remain, the defense interval afterwards.
			 */
2674 			if (nce->nce_unsolicit_count != 0) {
2675 				NDP_RESTART_TIMER(nce,
2676 				    ipst->ips_ip_ndp_unsolicit_interval);
2677 			} else {
2678 				NDP_RESTART_TIMER(nce,
2679 				    ipst->ips_ip_ndp_defense_interval);
2680 			}
2681 		} else {
2682 			mutex_exit(&nce->nce_lock);
2683 		}
2684 		NCE_REFRELE(nce);
2685 		break;
2686 	default:
		/* No timer work is defined for any other state. */
2687 		rw_exit(&ipst->ips_ill_g_lock);
2688 		mutex_exit(&nce->nce_lock);
2689 		NCE_REFRELE(nce);
2690 		break;
2691 	}
2692 }
2693 
2694 /*
2695  * Set a link layer address from the ll_addr passed in.
2696  * Copy SAP from ill.
2697  */
2698 static void
2699 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2700 {
2701 	ill_t	*ill = nce->nce_ill;
2702 	uchar_t	*woffset;
2703 
2704 	ASSERT(ll_addr != NULL);
2705 	/* Always called before fast_path_probe */
2706 	ASSERT(nce->nce_fp_mp == NULL);
2707 	if (ill->ill_sap_length != 0) {
2708 		/*
2709 		 * Copy the SAP type specified in the
2710 		 * request into the xmit template.
2711 		 */
2712 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2713 	}
2714 	if (ill->ill_phys_addr_length > 0) {
2715 		/*
2716 		 * The bcopy() below used to be called for the physical address
2717 		 * length rather than the link layer address length. For
2718 		 * ethernet and many other media, the phys_addr and lla are
2719 		 * identical.
2720 		 * However, with xresolv interfaces being introduced, the
2721 		 * phys_addr and lla are no longer the same, and the physical
2722 		 * address may not have any useful meaning, so we use the lla
2723 		 * for IPv6 address resolution and destination addressing.
2724 		 *
2725 		 * For PPP or other interfaces with a zero length
2726 		 * physical address, don't do anything here.
2727 		 * The bcopy() with a zero phys_addr length was previously
2728 		 * a no-op for interfaces with a zero-length physical address.
2729 		 * Using the lla for them would change the way they operate.
2730 		 * Doing nothing in such cases preserves expected behavior.
2731 		 */
2732 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2733 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2734 	}
2735 }
2736 
2737 static boolean_t
2738 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2739 {
2740 	ill_t	*ill = nce->nce_ill;
2741 	uchar_t	*ll_offset;
2742 
2743 	ASSERT(nce->nce_res_mp != NULL);
2744 	if (ll_addr == NULL)
2745 		return (B_FALSE);
2746 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2747 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2748 		return (B_TRUE);
2749 	return (B_FALSE);
2750 }
2751 
2752 /*
2753  * Updates the link layer address or the reachability state of
2754  * a cache entry.  Reset probe counter if needed.
2755  */
2756 static void
2757 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2758 {
2759 	ill_t	*ill = nce->nce_ill;
2760 	boolean_t need_stop_timer = B_FALSE;
2761 	boolean_t need_fastpath_update = B_FALSE;
2762 
2763 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2764 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2765 	/*
2766 	 * If this interface does not do NUD, there is no point
2767 	 * in allowing an update to the cache entry.  Although
2768 	 * we will respond to NS.
2769 	 * The only time we accept an update for a resolver when
2770 	 * NUD is turned off is when it has just been created.
2771 	 * Non-Resolvers will always be created as REACHABLE.
2772 	 */
2773 	if (new_state != ND_UNCHANGED) {
2774 		if ((nce->nce_flags & NCE_F_NONUD) &&
2775 		    (nce->nce_state != ND_INCOMPLETE))
2776 			return;
2777 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2778 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2779 		need_stop_timer = B_TRUE;
2780 		if (new_state == ND_REACHABLE)
2781 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2782 		else {
2783 			/* We force NUD in this case */
2784 			nce->nce_last = 0;
2785 		}
2786 		nce->nce_state = new_state;
2787 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2788 	}
2789 	/*
2790 	 * In case of fast path we need to free the the fastpath
2791 	 * M_DATA and do another probe.  Otherwise we can just
2792 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2793 	 * whatever packets that happens to be transmitting at the time.
2794 	 */
2795 	if (new_ll_addr != NULL) {
2796 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2797 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2798 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2799 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2800 		if (nce->nce_fp_mp != NULL) {
2801 			freemsg(nce->nce_fp_mp);
2802 			nce->nce_fp_mp = NULL;
2803 		}
2804 		need_fastpath_update = B_TRUE;
2805 	}
2806 	mutex_exit(&nce->nce_lock);
2807 	if (need_stop_timer) {
2808 		(void) untimeout(nce->nce_timeout_id);
2809 		nce->nce_timeout_id = 0;
2810 	}
2811 	if (need_fastpath_update)
2812 		nce_fastpath(nce);
2813 	mutex_enter(&nce->nce_lock);
2814 }
2815 
2816 void
2817 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2818 {
2819 	uint_t	count = 0;
2820 	mblk_t  **mpp;
2821 
2822 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2823 
2824 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2825 	    mpp = &(*mpp)->b_next) {
2826 		if (++count >
2827 		    nce->nce_ill->ill_max_buf) {
2828 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2829 
2830 			nce->nce_qd_mp->b_next = NULL;
2831 			nce->nce_qd_mp->b_prev = NULL;
2832 			freemsg(nce->nce_qd_mp);
2833 			nce->nce_qd_mp = tmp;
2834 		}
2835 	}
2836 	/* put this on the list */
2837 	if (head_insert) {
2838 		mp->b_next = nce->nce_qd_mp;
2839 		nce->nce_qd_mp = mp;
2840 	} else {
2841 		*mpp = mp;
2842 	}
2843 }
2844 
2845 static void
2846 nce_queue_mp(nce_t *nce, mblk_t *mp)
2847 {
2848 	boolean_t head_insert = B_FALSE;
2849 	ip6_t	*ip6h;
2850 	ip6i_t	*ip6i;
2851 	mblk_t *data_mp;
2852 
2853 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2854 
2855 	if (mp->b_datap->db_type == M_CTL)
2856 		data_mp = mp->b_cont;
2857 	else
2858 		data_mp = mp;
2859 	ip6h = (ip6_t *)data_mp->b_rptr;
2860 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2861 		/*
2862 		 * This message should have been pulled up already in
2863 		 * ip_wput_v6. We can't do pullups here because the message
2864 		 * could be from the nce_qd_mp which could have b_next/b_prev
2865 		 * non-NULL.
2866 		 */
2867 		ip6i = (ip6i_t *)ip6h;
2868 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2869 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
2870 		/*
2871 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2872 		 * This has 2 aspects mentioned below.
2873 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2874 		 * This ensures that next retransmit of ND solicitation
2875 		 * will use the interface specified by the probe packet,
2876 		 * for both NS and NA. This corresponds to the src address
2877 		 * in the IPv6 packet. If we insert at tail, we will be
2878 		 * depending on the packet at the head for successful
2879 		 * ND resolution. This is not reliable, because the interface
2880 		 * on which the NA arrives could be different from the interface
2881 		 * on which the NS was sent, and if the receiving interface is
2882 		 * failed, it will appear that the sending interface is also
2883 		 * failed, causing in.mpathd to misdiagnose this as link
2884 		 * failure.
2885 		 * 2. Drop the original packet, if the ND resolution did not
2886 		 * succeed in the first attempt. However we will create the
2887 		 * nce and the ire, as soon as the ND resolution succeeds.
2888 		 * We don't gain anything by queueing multiple probe packets
2889 		 * and sending them back-to-back once resolution succeeds.
2890 		 * It is sufficient to send just 1 packet after ND resolution
2891 		 * succeeds. Since mpathd is sending down probe packets at a
2892 		 * constant rate, we don't need to send the queued packet. We
2893 		 * need to queue it only for NDP resolution. The benefit of
2894 		 * dropping the probe packets that were delayed in ND
2895 		 * resolution, is that in.mpathd will not see inflated
2896 		 * RTT. If the ND resolution does not succeed within
2897 		 * in.mpathd's failure detection time, mpathd may detect
2898 		 * a failure, and it does not matter whether the packet
2899 		 * was queued or dropped.
2900 		 */
2901 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2902 			head_insert = B_TRUE;
2903 	}
2904 
2905 	nce_queue_mp_common(nce, mp, head_insert);
2906 }
2907 
2908 /*
2909  * Called when address resolution failed due to a timeout.
2910  * Send an ICMP unreachable in response to all queued packets.
2911  */
2912 void
2913 nce_resolv_failed(nce_t *nce)
2914 {
2915 	mblk_t	*mp, *nxt_mp, *first_mp;
2916 	char	buf[INET6_ADDRSTRLEN];
2917 	ip6_t *ip6h;
2918 	zoneid_t zoneid = GLOBAL_ZONEID;
2919 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
2920 
2921 	ip1dbg(("nce_resolv_failed: dst %s\n",
2922 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
2923 	mutex_enter(&nce->nce_lock);
2924 	mp = nce->nce_qd_mp;
2925 	nce->nce_qd_mp = NULL;
2926 	mutex_exit(&nce->nce_lock);
2927 	while (mp != NULL) {
2928 		nxt_mp = mp->b_next;
2929 		mp->b_next = NULL;
2930 		mp->b_prev = NULL;
2931 
2932 		first_mp = mp;
2933 		if (mp->b_datap->db_type == M_CTL) {
2934 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
2935 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
2936 			zoneid = io->ipsec_out_zoneid;
2937 			ASSERT(zoneid != ALL_ZONES);
2938 			mp = mp->b_cont;
2939 		}
2940 
2941 		ip6h = (ip6_t *)mp->b_rptr;
2942 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
2943 			ip6i_t *ip6i;
2944 			/*
2945 			 * This message should have been pulled up already
2946 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
2947 			 * the header is pulled up.
2948 			 */
2949 			ip6i = (ip6i_t *)ip6h;
2950 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
2951 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2952 			mp->b_rptr += sizeof (ip6i_t);
2953 		}
2954 		/*
2955 		 * Ignore failure since icmp_unreachable_v6 will silently
2956 		 * drop packets with an unspecified source address.
2957 		 */
2958 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
2959 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
2960 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
2961 		mp = nxt_mp;
2962 	}
2963 }
2964 
2965 /*
2966  * Called by SIOCSNDP* ioctl to add/change an nce entry
2967  * and the corresponding attributes.
2968  * Disallow states other than ND_REACHABLE or ND_STALE.
2969  */
2970 int
2971 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2972 {
2973 	sin6_t		*sin6;
2974 	in6_addr_t	*addr;
2975 	nce_t		*nce;
2976 	int		err;
2977 	uint16_t	new_flags = 0;
2978 	uint16_t	old_flags = 0;
2979 	int		inflags = lnr->lnr_flags;
2980 	ip_stack_t	*ipst = ill->ill_ipst;
2981 
2982 	ASSERT(ill->ill_isv6);
2983 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2984 	    (lnr->lnr_state_create != ND_STALE))
2985 		return (EINVAL);
2986 
2987 	sin6 = (sin6_t *)&lnr->lnr_addr;
2988 	addr = &sin6->sin6_addr;
2989 
2990 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2991 	/* We know it can not be mapping so just look in the hash table */
2992 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
2993 	nce = nce_lookup_addr(ill, addr, nce);
2994 	if (nce != NULL)
2995 		new_flags = nce->nce_flags;
2996 
2997 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2998 	case NDF_ISROUTER_ON:
2999 		new_flags |= NCE_F_ISROUTER;
3000 		break;
3001 	case NDF_ISROUTER_OFF:
3002 		new_flags &= ~NCE_F_ISROUTER;
3003 		break;
3004 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3005 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3006 		if (nce != NULL)
3007 			NCE_REFRELE(nce);
3008 		return (EINVAL);
3009 	}
3010 
3011 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3012 	case NDF_ANYCAST_ON:
3013 		new_flags |= NCE_F_ANYCAST;
3014 		break;
3015 	case NDF_ANYCAST_OFF:
3016 		new_flags &= ~NCE_F_ANYCAST;
3017 		break;
3018 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3019 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3020 		if (nce != NULL)
3021 			NCE_REFRELE(nce);
3022 		return (EINVAL);
3023 	}
3024 
3025 	if (nce == NULL) {
3026 		err = ndp_add_v6(ill,
3027 		    (uchar_t *)lnr->lnr_hdw_addr,
3028 		    addr,
3029 		    &ipv6_all_ones,
3030 		    &ipv6_all_zeros,
3031 		    0,
3032 		    new_flags,
3033 		    lnr->lnr_state_create,
3034 		    &nce);
3035 		if (err != 0) {
3036 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3037 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3038 			return (err);
3039 		}
3040 	}
3041 	old_flags = nce->nce_flags;
3042 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3043 		/*
3044 		 * Router turned to host, delete all ires.
3045 		 * XXX Just delete the entry, but we need to add too.
3046 		 */
3047 		nce->nce_flags &= ~NCE_F_ISROUTER;
3048 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3049 		ndp_delete(nce);
3050 		NCE_REFRELE(nce);
3051 		return (0);
3052 	}
3053 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3054 
3055 	mutex_enter(&nce->nce_lock);
3056 	nce->nce_flags = new_flags;
3057 	mutex_exit(&nce->nce_lock);
3058 	/*
3059 	 * Note that we ignore the state at this point, which
3060 	 * should be either STALE or REACHABLE.  Instead we let
3061 	 * the link layer address passed in to determine the state
3062 	 * much like incoming packets.
3063 	 */
3064 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3065 	NCE_REFRELE(nce);
3066 	return (0);
3067 }
3068 
3069 /*
3070  * If the device driver supports it, we make nce_fp_mp to have
3071  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3072  * The caller ensures there is hold on nce for this function.
3073  * Note that since ill_fastpath_probe() copies the mblk there is
3074  * no need for the hold beyond this function.
3075  */
3076 void
3077 nce_fastpath(nce_t *nce)
3078 {
3079 	ill_t	*ill = nce->nce_ill;
3080 	int res;
3081 
3082 	ASSERT(ill != NULL);
3083 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3084 
3085 	if (nce->nce_fp_mp != NULL) {
3086 		/* Already contains fastpath info */
3087 		return;
3088 	}
3089 	if (nce->nce_res_mp != NULL) {
3090 		nce_fastpath_list_add(nce);
3091 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3092 		/*
3093 		 * EAGAIN is an indication of a transient error
3094 		 * i.e. allocation failure etc. leave the nce in the list it
3095 		 * will be updated when another probe happens for another ire
3096 		 * if not it will be taken out of the list when the ire is
3097 		 * deleted.
3098 		 */
3099 
3100 		if (res != 0 && res != EAGAIN)
3101 			nce_fastpath_list_delete(nce);
3102 	}
3103 }
3104 
3105 /*
3106  * Drain the list of nce's waiting for fastpath response.
3107  */
3108 void
3109 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3110     void *arg)
3111 {
3112 
3113 	nce_t *next_nce;
3114 	nce_t *current_nce;
3115 	nce_t *first_nce;
3116 	nce_t *prev_nce = NULL;
3117 
3118 	mutex_enter(&ill->ill_lock);
3119 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3120 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3121 		next_nce = current_nce->nce_fastpath;
3122 		/*
3123 		 * Take it off the list if we're flushing, or if the callback
3124 		 * routine tells us to do so.  Otherwise, leave the nce in the
3125 		 * fastpath list to handle any pending response from the lower
3126 		 * layer.  We can't drain the list when the callback routine
3127 		 * comparison failed, because the response is asynchronous in
3128 		 * nature, and may not arrive in the same order as the list
3129 		 * insertion.
3130 		 */
3131 		if (func == NULL || func(current_nce, arg)) {
3132 			current_nce->nce_fastpath = NULL;
3133 			if (current_nce == first_nce)
3134 				ill->ill_fastpath_list = first_nce = next_nce;
3135 			else
3136 				prev_nce->nce_fastpath = next_nce;
3137 		} else {
3138 			/* previous element that is still in the list */
3139 			prev_nce = current_nce;
3140 		}
3141 		current_nce = next_nce;
3142 	}
3143 	mutex_exit(&ill->ill_lock);
3144 }
3145 
3146 /*
3147  * Add nce to the nce fastpath list.
3148  */
3149 void
3150 nce_fastpath_list_add(nce_t *nce)
3151 {
3152 	ill_t *ill;
3153 
3154 	ill = nce->nce_ill;
3155 
3156 	mutex_enter(&ill->ill_lock);
3157 	mutex_enter(&nce->nce_lock);
3158 
3159 	/*
3160 	 * if nce has not been deleted and
3161 	 * is not already in the list add it.
3162 	 */
3163 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3164 	    (nce->nce_fastpath == NULL)) {
3165 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3166 		ill->ill_fastpath_list = nce;
3167 	}
3168 
3169 	mutex_exit(&nce->nce_lock);
3170 	mutex_exit(&ill->ill_lock);
3171 }
3172 
3173 /*
3174  * remove nce from the nce fastpath list.
3175  */
3176 void
3177 nce_fastpath_list_delete(nce_t *nce)
3178 {
3179 	nce_t *nce_ptr;
3180 
3181 	ill_t *ill;
3182 
3183 	ill = nce->nce_ill;
3184 	ASSERT(ill != NULL);
3185 
3186 	mutex_enter(&ill->ill_lock);
3187 	if (nce->nce_fastpath == NULL)
3188 		goto done;
3189 
3190 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3191 
3192 	if (ill->ill_fastpath_list == nce) {
3193 		ill->ill_fastpath_list = nce->nce_fastpath;
3194 	} else {
3195 		nce_ptr = ill->ill_fastpath_list;
3196 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3197 			if (nce_ptr->nce_fastpath == nce) {
3198 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3199 				break;
3200 			}
3201 			nce_ptr = nce_ptr->nce_fastpath;
3202 		}
3203 	}
3204 
3205 	nce->nce_fastpath = NULL;
3206 done:
3207 	mutex_exit(&ill->ill_lock);
3208 }
3209 
3210 /*
3211  * Update all NCE's that are not in fastpath mode and
3212  * have an nce_fp_mp that matches mp. mp->b_cont contains
3213  * the fastpath header.
3214  *
3215  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3216  */
3217 boolean_t
3218 ndp_fastpath_update(nce_t *nce, void *arg)
3219 {
3220 	mblk_t 	*mp, *fp_mp;
3221 	uchar_t	*mp_rptr, *ud_mp_rptr;
3222 	mblk_t	*ud_mp = nce->nce_res_mp;
3223 	ptrdiff_t	cmplen;
3224 
3225 	if (nce->nce_flags & NCE_F_MAPPING)
3226 		return (B_TRUE);
3227 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3228 		return (B_TRUE);
3229 
3230 	ip2dbg(("ndp_fastpath_update: trying\n"));
3231 	mp = (mblk_t *)arg;
3232 	mp_rptr = mp->b_rptr;
3233 	cmplen = mp->b_wptr - mp_rptr;
3234 	ASSERT(cmplen >= 0);
3235 	ud_mp_rptr = ud_mp->b_rptr;
3236 	/*
3237 	 * The nce is locked here to prevent any other threads
3238 	 * from accessing and changing nce_res_mp when the IPv6 address
3239 	 * becomes resolved to an lla while we're in the middle
3240 	 * of looking at and comparing the hardware address (lla).
3241 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3242 	 * from examining nce_res_mp atthe same time.
3243 	 */
3244 	mutex_enter(&nce->nce_lock);
3245 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3246 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3247 		mutex_exit(&nce->nce_lock);
3248 		/*
3249 		 * Don't take the ire off the fastpath list yet,
3250 		 * since the response may come later.
3251 		 */
3252 		return (B_FALSE);
3253 	}
3254 	/* Matched - install mp as the fastpath mp */
3255 	ip1dbg(("ndp_fastpath_update: match\n"));
3256 	fp_mp = dupb(mp->b_cont);
3257 	if (fp_mp != NULL) {
3258 		nce->nce_fp_mp = fp_mp;
3259 	}
3260 	mutex_exit(&nce->nce_lock);
3261 	return (B_TRUE);
3262 }
3263 
3264 /*
3265  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3266  * driver.  Note that it assumes IP is exclusive...
3267  */
3268 /* ARGSUSED */
3269 void
3270 ndp_fastpath_flush(nce_t *nce, char *arg)
3271 {
3272 	if (nce->nce_flags & NCE_F_MAPPING)
3273 		return;
3274 	/* No fastpath info? */
3275 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3276 		return;
3277 
3278 	if (nce->nce_ipversion == IPV4_VERSION &&
3279 	    nce->nce_flags & NCE_F_BCAST) {
3280 		/*
3281 		 * IPv4 BROADCAST entries:
3282 		 * We can't delete the nce since it is difficult to
3283 		 * recreate these without going through the
3284 		 * ipif down/up dance.
3285 		 *
3286 		 * All access to nce->nce_fp_mp in the case of these
3287 		 * is protected by nce_lock.
3288 		 */
3289 		mutex_enter(&nce->nce_lock);
3290 		if (nce->nce_fp_mp != NULL) {
3291 			freeb(nce->nce_fp_mp);
3292 			nce->nce_fp_mp = NULL;
3293 			mutex_exit(&nce->nce_lock);
3294 			nce_fastpath(nce);
3295 		} else {
3296 			mutex_exit(&nce->nce_lock);
3297 		}
3298 	} else {
3299 		/* Just delete the NCE... */
3300 		ndp_delete(nce);
3301 	}
3302 }
3303 
3304 /*
3305  * Return a pointer to a given option in the packet.
3306  * Assumes that option part of the packet have already been validated.
3307  */
3308 nd_opt_hdr_t *
3309 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3310 {
3311 	while (optlen > 0) {
3312 		if (opt->nd_opt_type == opt_type)
3313 			return (opt);
3314 		optlen -= 8 * opt->nd_opt_len;
3315 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3316 	}
3317 	return (NULL);
3318 }
3319 
3320 /*
3321  * Verify all option lengths present are > 0, also check to see
3322  * if the option lengths and packet length are consistent.
3323  */
3324 boolean_t
3325 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3326 {
3327 	ASSERT(opt != NULL);
3328 	while (optlen > 0) {
3329 		if (opt->nd_opt_len == 0)
3330 			return (B_FALSE);
3331 		optlen -= 8 * opt->nd_opt_len;
3332 		if (optlen < 0)
3333 			return (B_FALSE);
3334 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3335 	}
3336 	return (B_TRUE);
3337 }
3338 
3339 /*
3340  * ndp_walk function.
3341  * Free a fraction of the NCE cache entries.
3342  * A fraction of zero means to not free any in that category.
3343  */
3344 void
3345 ndp_cache_reclaim(nce_t *nce, char *arg)
3346 {
3347 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3348 	uint_t	rand;
3349 
3350 	if (nce->nce_flags & NCE_F_PERMANENT)
3351 		return;
3352 
3353 	rand = (uint_t)lbolt +
3354 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3355 	if (ncr->ncr_host != 0 &&
3356 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3357 		ndp_delete(nce);
3358 		return;
3359 	}
3360 }
3361 
3362 /*
3363  * ndp_walk function.
3364  * Count the number of NCEs that can be deleted.
3365  * These would be hosts but not routers.
3366  */
3367 void
3368 ndp_cache_count(nce_t *nce, char *arg)
3369 {
3370 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3371 
3372 	if (nce->nce_flags & NCE_F_PERMANENT)
3373 		return;
3374 
3375 	ncc->ncc_total++;
3376 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3377 		ncc->ncc_host++;
3378 }
3379 
3380 #ifdef DEBUG
3381 void
3382 nce_trace_ref(nce_t *nce)
3383 {
3384 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3385 
3386 	if (nce->nce_trace_disable)
3387 		return;
3388 
3389 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3390 		nce->nce_trace_disable = B_TRUE;
3391 		nce_trace_cleanup(nce);
3392 	}
3393 }
3394 
3395 void
3396 nce_untrace_ref(nce_t *nce)
3397 {
3398 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3399 
3400 	if (!nce->nce_trace_disable)
3401 		th_trace_unref(nce);
3402 }
3403 
3404 static void
3405 nce_trace_cleanup(const nce_t *nce)
3406 {
3407 	th_trace_cleanup(nce, nce->nce_trace_disable);
3408 }
3409 #endif
3410 
3411 /*
3412  * Called when address resolution fails due to a timeout.
3413  * Send an ICMP unreachable in response to all queued packets.
3414  */
3415 void
3416 arp_resolv_failed(nce_t *nce)
3417 {
3418 	mblk_t	*mp, *nxt_mp, *first_mp;
3419 	char	buf[INET6_ADDRSTRLEN];
3420 	zoneid_t zoneid = GLOBAL_ZONEID;
3421 	struct in_addr ipv4addr;
3422 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3423 
3424 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3425 	ip3dbg(("arp_resolv_failed: dst %s\n",
3426 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3427 	mutex_enter(&nce->nce_lock);
3428 	mp = nce->nce_qd_mp;
3429 	nce->nce_qd_mp = NULL;
3430 	mutex_exit(&nce->nce_lock);
3431 
3432 	while (mp != NULL) {
3433 		nxt_mp = mp->b_next;
3434 		mp->b_next = NULL;
3435 		mp->b_prev = NULL;
3436 
3437 		first_mp = mp;
3438 		/*
3439 		 * Send icmp unreachable messages
3440 		 * to the hosts.
3441 		 */
3442 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3443 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3444 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3445 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3446 		mp = nxt_mp;
3447 	}
3448 }
3449 
3450 int
3451 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3452     nce_t **newnce, nce_t *src_nce)
3453 {
3454 	int	err;
3455 	nce_t	*nce;
3456 	in6_addr_t addr6;
3457 	ip_stack_t *ipst = ill->ill_ipst;
3458 
3459 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3460 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3461 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3462 	nce = nce_lookup_addr(ill, &addr6, nce);
3463 	if (nce == NULL) {
3464 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3465 	} else {
3466 		*newnce = nce;
3467 		err = EEXIST;
3468 	}
3469 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3470 	return (err);
3471 }
3472 
3473 /*
3474  * NDP Cache Entry creation routine for IPv4.
3475  * Mapped entries are handled in arp.
3476  * This routine must always be called with ndp4->ndp_g_lock held.
3477  * Prior to return, nce_refcnt is incremented.
3478  */
3479 static int
3480 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3481     nce_t **newnce, nce_t *src_nce)
3482 {
3483 	static	nce_t		nce_nil;
3484 	nce_t		*nce;
3485 	mblk_t		*mp;
3486 	mblk_t		*template = NULL;
3487 	nce_t		**ncep;
3488 	ip_stack_t	*ipst = ill->ill_ipst;
3489 	uint16_t	state = ND_INITIAL;
3490 	int		err;
3491 
3492 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3493 	ASSERT(!ill->ill_isv6);
3494 	ASSERT((flags & NCE_F_MAPPING) == 0);
3495 
3496 	if (ill->ill_resolver_mp == NULL)
3497 		return (EINVAL);
3498 	/*
3499 	 * Allocate the mblk to hold the nce.
3500 	 */
3501 	mp = allocb(sizeof (nce_t), BPRI_MED);
3502 	if (mp == NULL)
3503 		return (ENOMEM);
3504 
3505 	nce = (nce_t *)mp->b_rptr;
3506 	mp->b_wptr = (uchar_t *)&nce[1];
3507 	*nce = nce_nil;
3508 	nce->nce_ill = ill;
3509 	nce->nce_ipversion = IPV4_VERSION;
3510 	nce->nce_flags = flags;
3511 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3512 	nce->nce_rcnt = ill->ill_xmit_count;
3513 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3514 	nce->nce_mask = ipv6_all_ones;
3515 	nce->nce_extract_mask = ipv6_all_zeros;
3516 	nce->nce_ll_extract_start = 0;
3517 	nce->nce_qd_mp = NULL;
3518 	nce->nce_mp = mp;
3519 	/* This one is for nce getting created */
3520 	nce->nce_refcnt = 1;
3521 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3522 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3523 
3524 	nce->nce_trace_disable = B_FALSE;
3525 
3526 	if (src_nce != NULL) {
3527 		/*
3528 		 * src_nce has been provided by the caller. The only
3529 		 * caller who provides a non-null, non-broadcast
3530 		 * src_nce is from ip_newroute() which must pass in
3531 		 * a ND_REACHABLE src_nce (this condition is verified
3532 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3533 		 */
3534 		mutex_enter(&src_nce->nce_lock);
3535 		state = src_nce->nce_state;
3536 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3537 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3538 			/*
3539 			 * src_nce has been deleted, or
3540 			 * ip_arp_news is in the middle of
3541 			 * flushing entries in the the nce.
3542 			 * Fail the add, since we don't know
3543 			 * if it is safe to copy the contents of
3544 			 * src_nce
3545 			 */
3546 			DTRACE_PROBE2(nce__bad__src__nce,
3547 			    nce_t *, src_nce, ill_t *, ill);
3548 			mutex_exit(&src_nce->nce_lock);
3549 			err = EINVAL;
3550 			goto err_ret;
3551 		}
3552 		template = copyb(src_nce->nce_res_mp);
3553 		mutex_exit(&src_nce->nce_lock);
3554 		if (template == NULL) {
3555 			err = ENOMEM;
3556 			goto err_ret;
3557 		}
3558 	} else if (flags & NCE_F_BCAST) {
3559 		/*
3560 		 * broadcast nce.
3561 		 */
3562 		template = copyb(ill->ill_bcast_mp);
3563 		if (template == NULL) {
3564 			err = ENOMEM;
3565 			goto err_ret;
3566 		}
3567 		state = ND_REACHABLE;
3568 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3569 		/*
3570 		 * NORESOLVER entries are always created in the REACHABLE
3571 		 * state. We create a nce_res_mp with the IP nexthop address
3572 		 * in the destination address in the DLPI hdr if the
3573 		 * physical length is exactly 4 bytes.
3574 		 *
3575 		 * XXX not clear which drivers set ill_phys_addr_length to
3576 		 * IP_ADDR_LEN.
3577 		 */
3578 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3579 			template = ill_dlur_gen((uchar_t *)addr,
3580 			    ill->ill_phys_addr_length,
3581 			    ill->ill_sap, ill->ill_sap_length);
3582 		} else {
3583 			template = copyb(ill->ill_resolver_mp);
3584 		}
3585 		if (template == NULL) {
3586 			err = ENOMEM;
3587 			goto err_ret;
3588 		}
3589 		state = ND_REACHABLE;
3590 	}
3591 	nce->nce_fp_mp = NULL;
3592 	nce->nce_res_mp = template;
3593 	nce->nce_state = state;
3594 	if (state == ND_REACHABLE) {
3595 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3596 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3597 	} else {
3598 		nce->nce_last = 0;
3599 		if (state == ND_INITIAL)
3600 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3601 	}
3602 
3603 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3604 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3605 	/*
3606 	 * Atomically ensure that the ill is not CONDEMNED, before
3607 	 * adding the NCE.
3608 	 */
3609 	mutex_enter(&ill->ill_lock);
3610 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3611 		mutex_exit(&ill->ill_lock);
3612 		err = EINVAL;
3613 		goto err_ret;
3614 	}
3615 	if ((nce->nce_next = *ncep) != NULL)
3616 		nce->nce_next->nce_ptpn = &nce->nce_next;
3617 	*ncep = nce;
3618 	nce->nce_ptpn = ncep;
3619 	*newnce = nce;
3620 	/* This one is for nce being used by an active thread */
3621 	NCE_REFHOLD(*newnce);
3622 
3623 	/* Bump up the number of nce's referencing this ill */
3624 	ill->ill_nce_cnt++;
3625 	mutex_exit(&ill->ill_lock);
3626 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3627 	return (0);
3628 err_ret:
3629 	freeb(mp);
3630 	freemsg(template);
3631 	return (err);
3632 }
3633 
3634 void
3635 ndp_flush_qd_mp(nce_t *nce)
3636 {
3637 	mblk_t *qd_mp, *qd_next;
3638 
3639 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3640 	qd_mp = nce->nce_qd_mp;
3641 	nce->nce_qd_mp = NULL;
3642 	while (qd_mp != NULL) {
3643 		qd_next = qd_mp->b_next;
3644 		qd_mp->b_next = NULL;
3645 		qd_mp->b_prev = NULL;
3646 		freemsg(qd_mp);
3647 		qd_mp = qd_next;
3648 	}
3649 }
3650 
3651 
3652 /*
3653  * ndp_walk routine to delete all entries that have a given destination or
3654  * gateway address and cached link layer (MAC) address.  This is used when ARP
3655  * informs us that a network-to-link-layer mapping may have changed.
3656  */
3657 void
3658 nce_delete_hw_changed(nce_t *nce, void *arg)
3659 {
3660 	nce_hw_map_t *hwm = arg;
3661 	mblk_t *mp;
3662 	dl_unitdata_req_t *dlu;
3663 	uchar_t *macaddr;
3664 	ill_t *ill;
3665 	int saplen;
3666 	ipaddr_t nce_addr;
3667 
3668 	if (nce->nce_state != ND_REACHABLE)
3669 		return;
3670 
3671 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3672 	if (nce_addr != hwm->hwm_addr)
3673 		return;
3674 
3675 	mutex_enter(&nce->nce_lock);
3676 	if ((mp = nce->nce_res_mp) == NULL) {
3677 		mutex_exit(&nce->nce_lock);
3678 		return;
3679 	}
3680 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3681 	macaddr = (uchar_t *)(dlu + 1);
3682 	ill = nce->nce_ill;
3683 	if ((saplen = ill->ill_sap_length) > 0)
3684 		macaddr += saplen;
3685 	else
3686 		saplen = -saplen;
3687 
3688 	/*
3689 	 * If the hardware address is unchanged, then leave this one alone.
3690 	 * Note that saplen == abs(saplen) now.
3691 	 */
3692 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3693 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3694 		mutex_exit(&nce->nce_lock);
3695 		return;
3696 	}
3697 	mutex_exit(&nce->nce_lock);
3698 
3699 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3700 	ndp_delete(nce);
3701 }
3702 
3703 /*
3704  * This function verifies whether a given IPv4 address is potentially known to
3705  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3706  * so that it can continue to look for hardware changes on that address.
3707  */
3708 boolean_t
3709 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3710 {
3711 	nce_t		*nce;
3712 	struct in_addr	nceaddr;
3713 	ip_stack_t	*ipst = ns->netstack_ip;
3714 
3715 	if (addr == INADDR_ANY)
3716 		return (B_FALSE);
3717 
3718 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3719 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3720 	for (; nce != NULL; nce = nce->nce_next) {
3721 		/* Note that only v4 mapped entries are in the table. */
3722 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3723 		if (addr == nceaddr.s_addr &&
3724 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3725 			/* Single flag check; no lock needed */
3726 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3727 				break;
3728 		}
3729 	}
3730 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3731 	return (nce != NULL);
3732 }
3733