xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ndp.c (revision 8a2b682e57a046b828f37bcde1776f131ef4629f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2019, Joyent, Inc.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/stropts.h>
32 #include <sys/strsun.h>
33 #include <sys/sysmacros.h>
34 #include <sys/errno.h>
35 #include <sys/dlpi.h>
36 #include <sys/socket.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/cmn_err.h>
40 #include <sys/debug.h>
41 #include <sys/vtrace.h>
42 #include <sys/kmem.h>
43 #include <sys/zone.h>
44 #include <sys/ethernet.h>
45 #include <sys/sdt.h>
46 #include <sys/mac.h>
47 
48 #include <net/if.h>
49 #include <net/if_types.h>
50 #include <net/if_dl.h>
51 #include <net/route.h>
52 #include <netinet/in.h>
53 #include <netinet/ip6.h>
54 #include <netinet/icmp6.h>
55 
56 #include <inet/common.h>
57 #include <inet/mi.h>
58 #include <inet/mib2.h>
59 #include <inet/nd.h>
60 #include <inet/ip.h>
61 #include <inet/ip_impl.h>
62 #include <inet/ipclassifier.h>
63 #include <inet/ip_if.h>
64 #include <inet/ip_ire.h>
65 #include <inet/ip_rts.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_ndp.h>
68 #include <inet/sctp_ip.h>
69 #include <inet/ip_arp.h>
70 #include <inet/ip2mac_impl.h>
71 
/*
 * Select the IPv6 or the IPv4 flavour of a per-stack interval setting.
 *
 * ANNOUNCE_INTERVAL yields ips_ip_ndp_unsolicit_interval when `isv6' is
 * non-zero and ips_ip_arp_publish_interval otherwise; DEFENSE_INTERVAL does
 * the same for ips_ndp_defend_interval / ips_arp_defend_interval.
 *
 * Both macros expand to a reference through a variable named `ipst', which
 * must therefore be in scope at the point of use.  The argument is
 * parenthesized so that any expression (e.g. one containing ?:) can be
 * passed without being re-associated with the macro's own conditional.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
82 
/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * The macro is true when the first two bytes at `ptr' are 169 and 254.  The
 * argument is parenthesized before the cast so that pointer expressions such
 * as `p + 1' are evaluated in the caller's pointer type first and only then
 * reinterpreted as bytes.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
				((uchar_t *)(ptr))[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK \
	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
102 
103 /*
104  * Lock ordering:
105  *
106  *	ndp_g_lock -> ill_lock -> ncec_lock
107  *
108  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
109  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
110  * ncec_refcnt).
111  */
112 
113 static	void	nce_cleanup_list(ncec_t *ncec);
114 static	void	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116     ncec_t *);
117 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
118 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119     uint16_t ncec_flags, nce_t **newnce);
120 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121     uint16_t ncec_flags, nce_t **newnce);
122 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
123     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124     const in6_addr_t *target, int flag);
125 static void	ncec_refhold_locked(ncec_t *);
126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129     uint16_t, uint16_t, nce_t **);
130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
131 static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
132 static void nce_inactive(nce_t *);
133 extern nce_t	*nce_lookup(ill_t *, const in6_addr_t *);
134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136     uint16_t, uint16_t, nce_t **);
137 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138     uint16_t, uint16_t, nce_t **);
139 static int  nce_add_v6_postprocess(nce_t *);
140 static int  nce_add_v4_postprocess(nce_t *);
141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 static void nce_resolv_ipmp_ok(ncec_t *);
144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 static void nce_start_timer(ncec_t *, uint_t);
146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 static void nce_fastpath_trigger(nce_t *);
148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149 
150 #ifdef DEBUG
151 static void	ncec_trace_cleanup(const ncec_t *);
152 #endif
153 
154 #define	NCE_HASH_PTR_V4(ipst, addr)					\
155 	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
156 
157 #define	NCE_HASH_PTR_V6(ipst, addr)				 \
158 	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
159 		NCE_TABLE_SIZE)]))
160 
161 extern kmem_cache_t *ncec_cache;
162 extern kmem_cache_t *nce_cache;
163 
164 /*
165  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
166  * If src_ill is not null, the ncec_addr is bound to src_ill. The
167  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
168  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
169  * IPMP cast_ill (in the IPMP case).
170  *
171  * Note that the probe interval is based on the src_ill for IPv6, and
172  * the ncec_xmit_interval for IPv4.
173  */
174 static void
175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
176 {
177 	boolean_t dropped;
178 	uint32_t probe_interval;
179 
180 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
181 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
182 	if (ncec->ncec_ipversion == IPV6_VERSION) {
183 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
184 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
185 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
186 		probe_interval = ILL_PROBE_INTERVAL(src_ill);
187 	} else {
188 		/* IPv4 DAD delay the initial probe. */
189 		if (send_probe)
190 			dropped = arp_probe(ncec);
191 		else
192 			dropped = B_TRUE;
193 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
194 		    !send_probe);
195 	}
196 	if (!dropped) {
197 		mutex_enter(&ncec->ncec_lock);
198 		ncec->ncec_pcnt--;
199 		mutex_exit(&ncec->ncec_lock);
200 	}
201 	nce_restart_timer(ncec, probe_interval);
202 }
203 
204 /*
205  * Compute default flags to use for an advertisement of this ncec's address.
206  */
207 static int
208 nce_advert_flags(const ncec_t *ncec)
209 {
210 	int flag = 0;
211 
212 	if (ncec->ncec_flags & NCE_F_ISROUTER)
213 		flag |= NDP_ISROUTER;
214 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
215 		flag |= NDP_ORIDE;
216 
217 	return (flag);
218 }
219 
220 /*
221  * NDP Cache Entry creation routine.
222  * This routine must always be called with ndp6->ndp_g_lock held.
223  */
224 int
225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
226     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
227 {
228 	int		err;
229 	nce_t		*nce;
230 
231 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
232 	ASSERT(ill != NULL && ill->ill_isv6);
233 
234 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
235 	    &nce);
236 	if (err != 0)
237 		return (err);
238 	ASSERT(newnce != NULL);
239 	*newnce = nce;
240 	return (err);
241 }
242 
243 /*
244  * Post-processing routine to be executed after nce_add_v6(). This function
245  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
246  * and must be called without any locks held.
247  */
248 int
249 nce_add_v6_postprocess(nce_t *nce)
250 {
251 	ncec_t		*ncec = nce->nce_common;
252 	boolean_t	dropped = B_FALSE;
253 	uchar_t		*hw_addr = ncec->ncec_lladdr;
254 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
255 	ill_t		*ill = ncec->ncec_ill;
256 	int		err = 0;
257 	uint16_t	flags = ncec->ncec_flags;
258 	ip_stack_t	*ipst = ill->ill_ipst;
259 	boolean_t	trigger_fastpath = B_TRUE;
260 
261 	/*
262 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
263 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
264 	 * We call nce_fastpath from nce_update if the link layer address of
265 	 * the peer changes from nce_update
266 	 */
267 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
268 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
269 		trigger_fastpath = B_FALSE;
270 
271 	if (trigger_fastpath)
272 		nce_fastpath_trigger(nce);
273 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
274 		ill_t *hwaddr_ill;
275 		/*
276 		 * Unicast entry that needs DAD.
277 		 */
278 		if (IS_IPMP(ill)) {
279 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
280 			    hw_addr, hw_addr_len);
281 		} else {
282 			hwaddr_ill = ill;
283 		}
284 		nce_dad(ncec, hwaddr_ill, B_TRUE);
285 		err = EINPROGRESS;
286 	} else if (flags & NCE_F_UNSOL_ADV) {
287 		/*
288 		 * We account for the transmit below by assigning one
289 		 * less than the ndd variable. Subsequent decrements
290 		 * are done in nce_timer.
291 		 */
292 		mutex_enter(&ncec->ncec_lock);
293 		ncec->ncec_unsolicit_count =
294 		    ipst->ips_ip_ndp_unsolicit_count - 1;
295 		mutex_exit(&ncec->ncec_lock);
296 		dropped = ndp_xmit(ill,
297 		    ND_NEIGHBOR_ADVERT,
298 		    hw_addr,
299 		    hw_addr_len,
300 		    &ncec->ncec_addr,	/* Source and target of the adv */
301 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
302 		    nce_advert_flags(ncec));
303 		mutex_enter(&ncec->ncec_lock);
304 		if (dropped)
305 			ncec->ncec_unsolicit_count++;
306 		else
307 			ncec->ncec_last_time_defended = ddi_get_lbolt();
308 		if (ncec->ncec_unsolicit_count != 0) {
309 			nce_start_timer(ncec,
310 			    ipst->ips_ip_ndp_unsolicit_interval);
311 		}
312 		mutex_exit(&ncec->ncec_lock);
313 	}
314 	return (err);
315 }
316 
317 /*
318  * Atomically lookup and add (if needed) Neighbor Cache information for
319  * an address.
320  *
321  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
322  * are always added pointing at the ipmp_ill. Thus, when the ill passed
323  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
324  * entries will be created, both pointing at the same ncec_t. The nce_t
325  * entries will have their nce_ill set to the ipmp_ill and the under_ill
326  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
327  * Local addresses are always created on the ill passed to nce_add_v6.
328  */
329 int
330 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
331     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
332 {
333 	int		err = 0;
334 	ip_stack_t	*ipst = ill->ill_ipst;
335 	nce_t		*nce, *upper_nce = NULL;
336 	ill_t		*in_ill = ill;
337 	boolean_t	need_ill_refrele = B_FALSE;
338 
339 	if (flags & NCE_F_MCAST) {
340 		/*
341 		 * hw_addr will be figured out in nce_set_multicast_v6;
342 		 * caller has to select the cast_ill
343 		 */
344 		ASSERT(hw_addr == NULL);
345 		ASSERT(!IS_IPMP(ill));
346 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
347 		return (err);
348 	}
349 	ASSERT(ill->ill_isv6);
350 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
351 		ill = ipmp_ill_hold_ipmp_ill(ill);
352 		if (ill == NULL)
353 			return (ENXIO);
354 		need_ill_refrele = B_TRUE;
355 	}
356 
357 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
358 	nce = nce_lookup_addr(ill, addr);
359 	if (nce == NULL) {
360 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
361 		    &nce);
362 	} else {
363 		err = EEXIST;
364 	}
365 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
366 	if (err == 0)
367 		err = nce_add_v6_postprocess(nce);
368 	if (in_ill != ill && nce != NULL) {
369 		nce_t *under_nce = NULL;
370 
371 		/*
372 		 * in_ill was the under_ill. Try to create the under_nce.
373 		 * Hold the ill_g_lock to prevent changes to group membership
374 		 * until we are done.
375 		 */
376 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
377 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
378 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
379 			    ill_t *, ill);
380 			rw_exit(&ipst->ips_ill_g_lock);
381 			err = ENXIO;
382 			nce_refrele(nce);
383 			nce = NULL;
384 			goto bail;
385 		}
386 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
387 		if (under_nce == NULL) {
388 			rw_exit(&ipst->ips_ill_g_lock);
389 			err = EINVAL;
390 			nce_refrele(nce);
391 			nce = NULL;
392 			goto bail;
393 		}
394 		rw_exit(&ipst->ips_ill_g_lock);
395 		upper_nce = nce;
396 		nce = under_nce; /* will be returned to caller */
397 		if (NCE_ISREACHABLE(nce->nce_common))
398 			nce_fastpath_trigger(under_nce);
399 	}
400 	/* nce_refrele is deferred until the lock is dropped  */
401 	if (nce != NULL) {
402 		if (newnce != NULL)
403 			*newnce = nce;
404 		else
405 			nce_refrele(nce);
406 	}
407 bail:
408 	if (upper_nce != NULL)
409 		nce_refrele(upper_nce);
410 	if (need_ill_refrele)
411 		ill_refrele(ill);
412 	return (err);
413 }
414 
415 /*
416  * Remove all the CONDEMNED nces from the appropriate hash table.
417  * We create a private list of NCEs, these may have ires pointing
418  * to them, so the list will be passed through to clean up dependent
419  * ires and only then we can do ncec_refrele() which can make NCE inactive.
420  */
421 static void
422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
423 {
424 	ncec_t *ncec1;
425 	ncec_t **ptpn;
426 
427 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
428 	ASSERT(ndp->ndp_g_walker == 0);
429 	for (; ncec; ncec = ncec1) {
430 		ncec1 = ncec->ncec_next;
431 		mutex_enter(&ncec->ncec_lock);
432 		if (NCE_ISCONDEMNED(ncec)) {
433 			ptpn = ncec->ncec_ptpn;
434 			ncec1 = ncec->ncec_next;
435 			if (ncec1 != NULL)
436 				ncec1->ncec_ptpn = ptpn;
437 			*ptpn = ncec1;
438 			ncec->ncec_ptpn = NULL;
439 			ncec->ncec_next = NULL;
440 			ncec->ncec_next = *free_nce_list;
441 			*free_nce_list = ncec;
442 		}
443 		mutex_exit(&ncec->ncec_lock);
444 	}
445 }
446 
447 /*
448  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
449  *    will return this NCE. Also no new timeouts will
450  *    be started (See nce_restart_timer).
451  * 2. Cancel any currently running timeouts.
452  * 3. If there is an ndp walker, return. The walker will do the cleanup.
453  *    This ensures that walkers see a consistent list of NCEs while walking.
454  * 4. Otherwise remove the NCE from the list of NCEs
455  */
456 void
457 ncec_delete(ncec_t *ncec)
458 {
459 	ncec_t	**ptpn;
460 	ncec_t	*ncec1;
461 	int	ipversion = ncec->ncec_ipversion;
462 	ndp_g_t *ndp;
463 	ip_stack_t	*ipst = ncec->ncec_ipst;
464 
465 	if (ipversion == IPV4_VERSION)
466 		ndp = ipst->ips_ndp4;
467 	else
468 		ndp = ipst->ips_ndp6;
469 
470 	/* Serialize deletes */
471 	mutex_enter(&ncec->ncec_lock);
472 	if (NCE_ISCONDEMNED(ncec)) {
473 		/* Some other thread is doing the delete */
474 		mutex_exit(&ncec->ncec_lock);
475 		return;
476 	}
477 	/*
478 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
479 	 * refcnt has to be >= 2
480 	 */
481 	ASSERT(ncec->ncec_refcnt >= 2);
482 	ncec->ncec_flags |= NCE_F_CONDEMNED;
483 	mutex_exit(&ncec->ncec_lock);
484 
485 	/* Count how many condemned ires for kmem_cache callback */
486 	atomic_inc_32(&ipst->ips_num_nce_condemned);
487 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
488 
489 	/* Complete any waiting callbacks */
490 	ncec_cb_dispatch(ncec);
491 
492 	/*
493 	 * Cancel any running timer. Timeout can't be restarted
494 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
495 	 * Passing invalid timeout id is fine.
496 	 */
497 	if (ncec->ncec_timeout_id != 0) {
498 		(void) untimeout(ncec->ncec_timeout_id);
499 		ncec->ncec_timeout_id = 0;
500 	}
501 
502 	mutex_enter(&ndp->ndp_g_lock);
503 	if (ncec->ncec_ptpn == NULL) {
504 		/*
505 		 * The last ndp walker has already removed this ncec from
506 		 * the list after we marked the ncec CONDEMNED and before
507 		 * we grabbed the global lock.
508 		 */
509 		mutex_exit(&ndp->ndp_g_lock);
510 		return;
511 	}
512 	if (ndp->ndp_g_walker > 0) {
513 		/*
514 		 * Can't unlink. The walker will clean up
515 		 */
516 		ndp->ndp_g_walker_cleanup = B_TRUE;
517 		mutex_exit(&ndp->ndp_g_lock);
518 		return;
519 	}
520 
521 	/*
522 	 * Now remove the ncec from the list. nce_restart_timer won't restart
523 	 * the timer since it is marked CONDEMNED.
524 	 */
525 	ptpn = ncec->ncec_ptpn;
526 	ncec1 = ncec->ncec_next;
527 	if (ncec1 != NULL)
528 		ncec1->ncec_ptpn = ptpn;
529 	*ptpn = ncec1;
530 	ncec->ncec_ptpn = NULL;
531 	ncec->ncec_next = NULL;
532 	mutex_exit(&ndp->ndp_g_lock);
533 
534 	/* Removed from ncec_ptpn/ncec_next list */
535 	ncec_refrele_notr(ncec);
536 }
537 
538 void
539 ncec_inactive(ncec_t *ncec)
540 {
541 	mblk_t		**mpp;
542 	ill_t		*ill = ncec->ncec_ill;
543 	ip_stack_t	*ipst = ncec->ncec_ipst;
544 
545 	ASSERT(ncec->ncec_refcnt == 0);
546 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
547 
548 	/* Count how many condemned nces for kmem_cache callback */
549 	if (NCE_ISCONDEMNED(ncec))
550 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
551 
552 	/* Free all allocated messages */
553 	mpp = &ncec->ncec_qd_mp;
554 	while (*mpp != NULL) {
555 		mblk_t  *mp;
556 
557 		mp = *mpp;
558 		*mpp = mp->b_next;
559 
560 		inet_freemsg(mp);
561 	}
562 	/*
563 	 * must have been cleaned up in ncec_delete
564 	 */
565 	ASSERT(list_is_empty(&ncec->ncec_cb));
566 	list_destroy(&ncec->ncec_cb);
567 	/*
568 	 * free the ncec_lladdr if one was allocated in nce_add_common()
569 	 */
570 	if (ncec->ncec_lladdr_length > 0)
571 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
572 
573 #ifdef DEBUG
574 	ncec_trace_cleanup(ncec);
575 #endif
576 
577 	mutex_enter(&ill->ill_lock);
578 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
579 	    (char *), "ncec", (void *), ncec);
580 	ill->ill_ncec_cnt--;
581 	ncec->ncec_ill = NULL;
582 	/*
583 	 * If the number of ncec's associated with this ill have dropped
584 	 * to zero, check whether we need to restart any operation that
585 	 * is waiting for this to happen.
586 	 */
587 	if (ILL_DOWN_OK(ill)) {
588 		/* ipif_ill_refrele_tail drops the ill_lock */
589 		ipif_ill_refrele_tail(ill);
590 	} else {
591 		mutex_exit(&ill->ill_lock);
592 	}
593 
594 	mutex_destroy(&ncec->ncec_lock);
595 	kmem_cache_free(ncec_cache, ncec);
596 }
597 
598 /*
599  * ncec_walk routine.  Delete the ncec if it is associated with the ill
600  * that is going away.  Always called as a writer.
601  */
602 void
603 ncec_delete_per_ill(ncec_t *ncec, void *arg)
604 {
605 	if ((ncec != NULL) && ncec->ncec_ill == arg) {
606 		ncec_delete(ncec);
607 	}
608 }
609 
610 /*
611  * Neighbor Cache cleanup logic for a list of ncec_t entries.
612  */
613 static void
614 nce_cleanup_list(ncec_t *ncec)
615 {
616 	ncec_t *ncec_next;
617 
618 	ASSERT(ncec != NULL);
619 	while (ncec != NULL) {
620 		ncec_next = ncec->ncec_next;
621 		ncec->ncec_next = NULL;
622 
623 		/*
624 		 * It is possible for the last ndp walker (this thread)
625 		 * to come here after ncec_delete has marked the ncec CONDEMNED
626 		 * and before it has removed the ncec from the fastpath list
627 		 * or called untimeout. So we need to do it here. It is safe
628 		 * for both ncec_delete and this thread to do it twice or
629 		 * even simultaneously since each of the threads has a
630 		 * reference on the ncec.
631 		 */
632 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
633 		/*
634 		 * Cancel any running timer. Timeout can't be restarted
635 		 * since CONDEMNED is set. The ncec_lock can't be
636 		 * held across untimeout though passing invalid timeout
637 		 * id is fine.
638 		 */
639 		if (ncec->ncec_timeout_id != 0) {
640 			(void) untimeout(ncec->ncec_timeout_id);
641 			ncec->ncec_timeout_id = 0;
642 		}
643 		/* Removed from ncec_ptpn/ncec_next list */
644 		ncec_refrele_notr(ncec);
645 		ncec = ncec_next;
646 	}
647 }
648 
649 /*
650  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
651  */
652 boolean_t
653 nce_restart_dad(ncec_t *ncec)
654 {
655 	boolean_t started;
656 	ill_t *ill, *hwaddr_ill;
657 
658 	if (ncec == NULL)
659 		return (B_FALSE);
660 	ill = ncec->ncec_ill;
661 	mutex_enter(&ncec->ncec_lock);
662 	if (ncec->ncec_state == ND_PROBE) {
663 		mutex_exit(&ncec->ncec_lock);
664 		started = B_TRUE;
665 	} else if (ncec->ncec_state == ND_REACHABLE) {
666 		ASSERT(ncec->ncec_lladdr != NULL);
667 		ncec->ncec_state = ND_PROBE;
668 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
669 		/*
670 		 * Slight cheat here: we don't use the initial probe delay
671 		 * for IPv4 in this obscure case.
672 		 */
673 		mutex_exit(&ncec->ncec_lock);
674 		if (IS_IPMP(ill)) {
675 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
676 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
677 		} else {
678 			hwaddr_ill = ill;
679 		}
680 		nce_dad(ncec, hwaddr_ill, B_TRUE);
681 		started = B_TRUE;
682 	} else {
683 		mutex_exit(&ncec->ncec_lock);
684 		started = B_FALSE;
685 	}
686 	return (started);
687 }
688 
689 /*
690  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
691  * If one is found, the refcnt on the ncec will be incremented.
692  */
693 ncec_t *
694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
695 {
696 	ncec_t		*ncec;
697 	ip_stack_t	*ipst = ill->ill_ipst;
698 
699 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
700 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
701 
702 	/* Get head of v6 hash table */
703 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
704 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
705 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
706 	rw_exit(&ipst->ips_ill_g_lock);
707 	return (ncec);
708 }
709 /*
710  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
711  * If one is found, the refcnt on the ncec will be incremented.
712  */
713 ncec_t *
714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
715 {
716 	ncec_t	*ncec = NULL;
717 	in6_addr_t addr6;
718 	ip_stack_t *ipst = ill->ill_ipst;
719 
720 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
721 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
722 
723 	/* Get head of v4 hash table */
724 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
725 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
726 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
727 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
728 	rw_exit(&ipst->ips_ill_g_lock);
729 	return (ncec);
730 }
731 
732 /*
733  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
734  * If an ncec is found, increment the hold count on that ncec.
735  * The caller passes in the start of the appropriate hash table, and must
736  * be holding the appropriate global lock (ndp_g_lock). In addition, since
737  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
738  * must be held as reader.
739  *
740  * This function always matches across the ipmp group.
741  */
742 ncec_t *
743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
744 {
745 	ndp_g_t		*ndp;
746 	ip_stack_t	*ipst = ill->ill_ipst;
747 
748 	if (ill->ill_isv6)
749 		ndp = ipst->ips_ndp6;
750 	else
751 		ndp = ipst->ips_ndp4;
752 
753 	ASSERT(ill != NULL);
754 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
755 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
756 		return (NULL);
757 	for (; ncec != NULL; ncec = ncec->ncec_next) {
758 		if (ncec->ncec_ill == ill ||
759 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
760 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
761 				mutex_enter(&ncec->ncec_lock);
762 				if (!NCE_ISCONDEMNED(ncec)) {
763 					ncec_refhold_locked(ncec);
764 					mutex_exit(&ncec->ncec_lock);
765 					break;
766 				}
767 				mutex_exit(&ncec->ncec_lock);
768 			}
769 		}
770 	}
771 	return (ncec);
772 }
773 
774 /*
775  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
776  * entries for ill only, i.e., when ill is part of an ipmp group,
777  * nce_lookup_v4 will never try to match across the group.
778  */
779 nce_t *
780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
781 {
782 	nce_t *nce;
783 	in6_addr_t addr6;
784 	ip_stack_t *ipst = ill->ill_ipst;
785 
786 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
787 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
788 	nce = nce_lookup_addr(ill, &addr6);
789 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
790 	return (nce);
791 }
792 
793 /*
794  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
795  * entries for ill only, i.e., when ill is part of an ipmp group,
796  * nce_lookup_v6 will never try to match across the group.
797  */
798 nce_t *
799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
800 {
801 	nce_t *nce;
802 	ip_stack_t *ipst = ill->ill_ipst;
803 
804 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
805 	nce = nce_lookup_addr(ill, addr6);
806 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
807 	return (nce);
808 }
809 
810 static nce_t *
811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
812 {
813 	nce_t *nce;
814 
815 	ASSERT(ill != NULL);
816 #ifdef DEBUG
817 	if (ill->ill_isv6)
818 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
819 	else
820 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
821 #endif
822 	mutex_enter(&ill->ill_lock);
823 	nce = nce_lookup(ill, addr);
824 	mutex_exit(&ill->ill_lock);
825 	return (nce);
826 }
827 
828 
829 /*
830  * Router turned to host.  We need to make sure that cached copies of the ncec
831  * are not used for forwarding packets if they were derived from the default
832  * route, and that the default route itself is removed, as  required by
833  * section 7.2.5 of RFC 2461.
834  *
835  * Note that the ncec itself probably has valid link-layer information for the
836  * nexthop, so that there is no reason to delete the ncec, as long as the
837  * ISROUTER flag is turned off.
838  */
839 static void
840 ncec_router_to_host(ncec_t *ncec)
841 {
842 	ire_t		*ire;
843 	ip_stack_t	*ipst = ncec->ncec_ipst;
844 
845 	mutex_enter(&ncec->ncec_lock);
846 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
847 	mutex_exit(&ncec->ncec_lock);
848 
849 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
850 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
851 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
852 	if (ire != NULL) {
853 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
854 		ire_delete(ire);
855 		ire_refrele(ire);
856 	}
857 }
858 
859 /*
860  * Process passed in parameters either from an incoming packet or via
861  * user ioctl.
862  */
863 void
864 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
865 {
866 	ill_t	*ill = ncec->ncec_ill;
867 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
868 	boolean_t ll_updated = B_FALSE;
869 	boolean_t ll_changed;
870 	nce_t	*nce;
871 
872 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
873 	/*
874 	 * No updates of link layer address or the neighbor state is
875 	 * allowed, when the cache is in NONUD state.  This still
876 	 * allows for responding to reachability solicitation.
877 	 */
878 	mutex_enter(&ncec->ncec_lock);
879 	if (ncec->ncec_state == ND_INCOMPLETE) {
880 		if (hw_addr == NULL) {
881 			mutex_exit(&ncec->ncec_lock);
882 			return;
883 		}
884 		nce_set_ll(ncec, hw_addr);
885 		/*
886 		 * Update ncec state and send the queued packets
887 		 * back to ip this time ire will be added.
888 		 */
889 		if (flag & ND_NA_FLAG_SOLICITED) {
890 			nce_update(ncec, ND_REACHABLE, NULL);
891 		} else {
892 			nce_update(ncec, ND_STALE, NULL);
893 		}
894 		mutex_exit(&ncec->ncec_lock);
895 		nce = nce_fastpath(ncec, B_TRUE, NULL);
896 		nce_resolv_ok(ncec);
897 		if (nce != NULL)
898 			nce_refrele(nce);
899 		return;
900 	}
901 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
902 	if (!is_adv) {
903 		/* If this is a SOLICITATION request only */
904 		if (ll_changed)
905 			nce_update(ncec, ND_STALE, hw_addr);
906 		mutex_exit(&ncec->ncec_lock);
907 		ncec_cb_dispatch(ncec);
908 		return;
909 	}
910 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
911 		/* If in any other state than REACHABLE, ignore */
912 		if (ncec->ncec_state == ND_REACHABLE) {
913 			nce_update(ncec, ND_STALE, NULL);
914 		}
915 		mutex_exit(&ncec->ncec_lock);
916 		ncec_cb_dispatch(ncec);
917 		return;
918 	} else {
919 		if (ll_changed) {
920 			nce_update(ncec, ND_UNCHANGED, hw_addr);
921 			ll_updated = B_TRUE;
922 		}
923 		if (flag & ND_NA_FLAG_SOLICITED) {
924 			nce_update(ncec, ND_REACHABLE, NULL);
925 		} else {
926 			if (ll_updated) {
927 				nce_update(ncec, ND_STALE, NULL);
928 			}
929 		}
930 		mutex_exit(&ncec->ncec_lock);
931 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
932 		    NCE_F_ISROUTER)) {
933 			ncec_router_to_host(ncec);
934 		} else {
935 			ncec_cb_dispatch(ncec);
936 		}
937 	}
938 }
939 
940 /*
941  * Pass arg1 to the cbf supplied, along with each ncec in existence.
942  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
943  * walking the hash list.
944  */
945 void
946 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
947     void *arg1, boolean_t trace)
948 {
949 	ncec_t	*ncec;
950 	ncec_t	*ncec1;
951 	ncec_t	**ncep;
952 	ncec_t	*free_nce_list = NULL;
953 
954 	mutex_enter(&ndp->ndp_g_lock);
955 	/* Prevent ncec_delete from unlink and free of NCE */
956 	ndp->ndp_g_walker++;
957 	mutex_exit(&ndp->ndp_g_lock);
958 	for (ncep = ndp->nce_hash_tbl;
959 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
960 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
961 			ncec1 = ncec->ncec_next;
962 			if (ill == NULL || ncec->ncec_ill == ill) {
963 				if (trace) {
964 					ncec_refhold(ncec);
965 					(*cbf)(ncec, arg1);
966 					ncec_refrele(ncec);
967 				} else {
968 					ncec_refhold_notr(ncec);
969 					(*cbf)(ncec, arg1);
970 					ncec_refrele_notr(ncec);
971 				}
972 			}
973 		}
974 	}
975 	mutex_enter(&ndp->ndp_g_lock);
976 	ndp->ndp_g_walker--;
977 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
978 		/* Time to delete condemned entries */
979 		for (ncep = ndp->nce_hash_tbl;
980 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
981 			ncec = *ncep;
982 			if (ncec != NULL) {
983 				nce_remove(ndp, ncec, &free_nce_list);
984 			}
985 		}
986 		ndp->ndp_g_walker_cleanup = B_FALSE;
987 	}
988 
989 	mutex_exit(&ndp->ndp_g_lock);
990 
991 	if (free_nce_list != NULL) {
992 		nce_cleanup_list(free_nce_list);
993 	}
994 }
995 
996 /*
997  * Walk everything.
998  * Note that ill can be NULL hence can't derive the ipst from it.
999  */
1000 void
1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 {
1003 	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004 	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 }
1006 
/*
 * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
 * NCEs, and the number to reclaim if we hit the limit.  Used by
 * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
 * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
 *
 * Both values are sampled by nce_too_many_mcast() each time it runs.
 */

/*
 * Maximum number of multicast NCEs on an ill.  Compared against
 * ill_mcast_nces; reclaim kicks in once the count is no longer below this.
 */
uint_t ip_max_ill_mcast_nces = 16384;
/*
 * Number of NCEs to delete if we hit the maximum above.  0 means *don't* and
 * return an error.  Non-zero means delete so many, and if the number is >=
 * the max above, that means delete them all (the value is clamped to the
 * maximum before use).
 */
uint_t ip_ill_mcast_reclaim = 256;
1022 
1023 /*
1024  * Encapsulate multicast ill capping in a function, for easier DTrace
1025  * detections.  Return a list of refheld NCEs to destroy-via-refrele.  That
1026  * list can be NULL, but can only be non-NULL if we successfully reclaimed.
1027  *
1028  * NOTE:  This function must be called while holding the ill_lock AND
1029  * JUST PRIOR to making the insertion into the ill_nce list.
1030  *
1031  * We can't release the ones we delete ourselves because the ill_lock is held
1032  * by the caller. They are, instead, passed back in a list_t for deletion
1033  * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
1034  *
1035  * While this covers nce_t, ncec_t gets done even further down the road.  See
1036  * nce_graveyard_free() for why.
1037  */
1038 static boolean_t
1039 nce_too_many_mcast(ill_t *ill, list_t *graveyard)
1040 {
1041 	uint_t reclaim_count, max_count, reclaimed = 0;
1042 	boolean_t too_many;
1043 	nce_t *nce, *deadman;
1044 
1045 	ASSERT(graveyard != NULL);
1046 	ASSERT(list_is_empty(graveyard));
1047 	ASSERT(MUTEX_HELD(&ill->ill_lock));
1048 
1049 	/*
1050 	 * NOTE: Some grinning weirdo may have lowered the global max beyond
1051 	 * what this ill currently has.  The behavior in this case will be
1052 	 * trim-back just by the reclaim amount for any new ones.
1053 	 */
1054 	max_count = ip_max_ill_mcast_nces;
1055 	reclaim_count = min(ip_ill_mcast_reclaim, max_count);
1056 
1057 	/* All good? */
1058 	if (ill->ill_mcast_nces < max_count)
1059 		return (B_FALSE);	/* Yes, all good. */
1060 
1061 	if (reclaim_count == 0)
1062 		return (B_TRUE);	/* Don't bother - we're stuck. */
1063 
1064 	/* We need to reclaim now.  Exploit our held ill_lock. */
1065 
1066 	/*
1067 	 * Start at the tail and work backwards, new nces are head-inserted,
1068 	 * so we'll be reaping the oldest entries.
1069 	 */
1070 	nce = list_tail(&ill->ill_nce);
1071 	while (reclaimed < reclaim_count) {
1072 		/* Skip ahead to a multicast NCE. */
1073 		while (nce != NULL &&
1074 		    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
1075 			nce = list_prev(&ill->ill_nce, nce);
1076 		}
1077 		if (nce == NULL)
1078 			break;
1079 
1080 		/*
1081 		 * NOTE: For now, we just delete the first one(s) we find.
1082 		 * This is not optimal, and may require some inspection of nce
1083 		 * & its ncec to be better.
1084 		 */
1085 		deadman = nce;
1086 		nce = list_prev(&ill->ill_nce, nce);
1087 
1088 		/* nce_delete() requires caller holds... */
1089 		nce_refhold(deadman);
1090 		nce_delete(deadman);	/* Bumps down ill_mcast_nces. */
1091 
1092 		/* Link the dead ones singly, still refheld... */
1093 		list_insert_tail(graveyard, deadman);
1094 		reclaimed++;
1095 	}
1096 
1097 	if (reclaimed != reclaim_count) {
1098 		/* We didn't have enough to reach reclaim_count. Why?!? */
1099 		DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
1100 		    uint_t, reclaimed, uint_t, reclaim_count);
1101 
1102 		/* In case for some REALLY weird reason we found none! */
1103 		too_many = (reclaimed == 0);
1104 	} else {
1105 		too_many = B_FALSE;
1106 	}
1107 
1108 	return (too_many);
1109 }
1110 
1111 static void
1112 ncec_mcast_reap_one(ncec_t *ncec, void *arg)
1113 {
1114 	boolean_t reapit;
1115 	ill_t *ill = (ill_t *)arg;
1116 
1117 	/* Obvious no-lock-needed checks... */
1118 	if (ncec == NULL || ncec->ncec_ill != ill ||
1119 	    (ncec->ncec_flags & NCE_F_MCAST) == 0)
1120 		return;
1121 
1122 	mutex_enter(&ncec->ncec_lock);
1123 	/*
1124 	 * It's refheld by the walk infrastructure. It has one reference for
1125 	 * being in the ndp_g_hash, and if an nce_t exists, that's one more.
1126 	 * We want ones without an nce_t, so 2 is the magic number.  If it's
1127 	 * LESS than 2, we have much bigger problems anyway.
1128 	 */
1129 	ASSERT(ncec->ncec_refcnt >= 2);
1130 	reapit = (ncec->ncec_refcnt == 2);
1131 	mutex_exit(&ncec->ncec_lock);
1132 
1133 	if (reapit) {
1134 		IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
1135 		ncec_delete(ncec);
1136 	}
1137 }
1138 
1139 /*
1140  * Attempt to reap stray multicast ncec_t structures left in the wake of
1141  * nce_graveyard_free(). This is a taskq servicing routine, as it's well
1142  * outside any netstack-global locks being held - ndp_g_lock in this case.  We
1143  * have a reference hold on the ill, which will prevent any unplumbing races.
1144  */
1145 static void
1146 ncec_mcast_reap(void *arg)
1147 {
1148 	ill_t *ill = (ill_t *)arg;
1149 
1150 	IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
1151 	ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
1152 	mutex_enter(&ill->ill_lock);
1153 	ill->ill_mcast_ncec_cleanup = B_FALSE;
1154 	/*
1155 	 * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
1156 	 * below for why.
1157 	 */
1158 	ill->ill_refcnt--;
1159 	if (ill->ill_refcnt == 0)
1160 		ipif_ill_refrele_tail(ill);	/* Drops ill_lock. */
1161 	else
1162 		mutex_exit(&ill->ill_lock);
1163 }
1164 
1165 /*
1166  * Free a list (including handling an empty list or NULL list) of
1167  * reference-held NCEs that were reaped from a nce_too_many_mcast()
1168  * call. Separate because the caller must have dropped ndp_g_lock first.
1169  *
1170  * This also schedules a taskq task to unlink underlying NCECs from the
1171  * ndp_g_hash, which are protected by ndp_g_lock.
1172  */
1173 static void
1174 nce_graveyard_free(list_t *graveyard)
1175 {
1176 	nce_t *deadman, *current;
1177 	ill_t *ill;
1178 	boolean_t doit;
1179 
1180 	if (graveyard == NULL)
1181 		return;
1182 
1183 	current = list_head(graveyard);
1184 	if (current == NULL) {
1185 		list_destroy(graveyard);
1186 		return;
1187 	}
1188 
1189 	ill = current->nce_ill;
1190 	/*
1191 	 * Normally one should ill_refhold(ill) here.  There's no _notr()
1192 	 * variant like there is for ire_t, dce_t, or even ncec_t, but this is
1193 	 * the ONLY case that'll break the mh_trace that IP debugging uses for
1194 	 * reference counts (i.e. they assume same thread releases as
1195 	 * holds). Instead, we inline ill_refhold() here.  We must do the same
1196 	 * in the release done by the ncec_mcast_reap() above.
1197 	 */
1198 	mutex_enter(&ill->ill_lock);
1199 	ill->ill_refcnt++;
1200 	mutex_exit(&ill->ill_lock);
1201 
1202 	while (current != NULL) {
1203 		ASSERT3P(ill, ==, current->nce_ill);
1204 		deadman = current;
1205 		current = list_next(graveyard, deadman);
1206 		list_remove(graveyard, deadman);
1207 		ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
1208 		    0);
1209 		nce_refrele(deadman);
1210 	}
1211 	list_destroy(graveyard);
1212 
1213 	mutex_enter(&ill->ill_lock);
1214 	if (ill->ill_mcast_ncec_cleanup)
1215 		doit = B_FALSE;
1216 	else {
1217 		ill->ill_mcast_ncec_cleanup = B_TRUE;
1218 		doit = B_TRUE;
1219 	}
1220 	mutex_exit(&ill->ill_lock);
1221 	if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
1222 	    ill, TQ_NOSLEEP) == TASKQID_INVALID) {
1223 		mutex_enter(&ill->ill_lock);
1224 		if (doit) {
1225 			IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
1226 			ill->ill_mcast_ncec_cleanup = B_FALSE;
1227 		}
1228 		/* There's no _notr() for ill_refrele(), so inline it here. */
1229 		ill->ill_refcnt--;
1230 		if (ill->ill_refcnt == 0)
1231 			ipif_ill_refrele_tail(ill);	/* Drops ill_lock */
1232 		else
1233 			mutex_exit(&ill->ill_lock);
1234 	}
1235 }
1236 
1237 /*
1238  * For each interface an entry is added for the unspecified multicast group.
1239  * Here that mapping is used to form the multicast cache entry for a particular
1240  * multicast destination.
1241  */
1242 static int
1243 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1244     uint16_t flags, nce_t **newnce)
1245 {
1246 	uchar_t		*hw_addr;
1247 	int		err = 0;
1248 	ip_stack_t	*ipst = ill->ill_ipst;
1249 	nce_t		*nce;
1250 
1251 	ASSERT(ill != NULL);
1252 	ASSERT(ill->ill_isv6);
1253 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1254 
1255 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1256 	nce = nce_lookup_addr(ill, dst);
1257 	if (nce != NULL) {
1258 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1259 		goto done;
1260 	}
1261 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1262 		/*
1263 		 * For IRE_IF_RESOLVER a hardware mapping can be
1264 		 * generated.
1265 		 */
1266 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1267 		if (hw_addr == NULL) {
1268 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1269 			return (ENOMEM);
1270 		}
1271 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1272 	} else {
1273 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
1274 		hw_addr = NULL;
1275 	}
1276 	ASSERT((flags & NCE_F_MCAST) != 0);
1277 	ASSERT((flags & NCE_F_NONUD) != 0);
1278 	/* nce_state will be computed by nce_add_common() */
1279 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1280 	    ND_UNCHANGED, &nce);
1281 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1282 	if (err == 0)
1283 		err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
1284 	if (hw_addr != NULL)
1285 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1286 	if (err != 0) {
1287 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1288 		return (err);
1289 	}
1290 done:
1291 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1292 	if (newnce != NULL)
1293 		*newnce = nce;
1294 	else
1295 		nce_refrele(nce);
1296 	return (0);
1297 }
1298 
1299 /*
1300  * Return the link layer address, and any flags of a ncec.
1301  */
1302 int
1303 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1304 {
1305 	ncec_t		*ncec;
1306 	in6_addr_t	*addr;
1307 	sin6_t		*sin6;
1308 
1309 	ASSERT(ill != NULL && ill->ill_isv6);
1310 	sin6 = (sin6_t *)&lnr->lnr_addr;
1311 	addr =  &sin6->sin6_addr;
1312 
1313 	/*
1314 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1315 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1316 	 * addresses for the data addresses on an IPMP interface even though
1317 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1318 	 */
1319 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1320 	if (ncec == NULL)
1321 		return (ESRCH);
1322 	/* If no link layer address is available yet, return ESRCH */
1323 	if (!NCE_ISREACHABLE(ncec)) {
1324 		ncec_refrele(ncec);
1325 		return (ESRCH);
1326 	}
1327 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1328 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1329 	    lnr->lnr_hdw_len);
1330 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1331 		lnr->lnr_flags = NDF_ISROUTER_ON;
1332 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1333 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1334 	if (ncec->ncec_flags & NCE_F_STATIC)
1335 		lnr->lnr_flags |= NDF_STATIC;
1336 	ncec_refrele(ncec);
1337 	return (0);
1338 }
1339 
1340 /*
1341  * Finish setting up the Enable/Disable multicast for the driver.
1342  */
1343 mblk_t *
1344 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1345     uint32_t hw_addr_offset, mblk_t *mp)
1346 {
1347 	uchar_t		*hw_addr;
1348 	ipaddr_t	v4group;
1349 	uchar_t		*addr;
1350 
1351 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1352 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1353 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1354 
1355 		ASSERT(CLASSD(v4group));
1356 		ASSERT(!(ill->ill_isv6));
1357 
1358 		addr = (uchar_t *)&v4group;
1359 	} else {
1360 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1361 		ASSERT(ill->ill_isv6);
1362 
1363 		addr = (uchar_t *)v6group;
1364 	}
1365 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1366 	if (hw_addr == NULL) {
1367 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1368 		freemsg(mp);
1369 		return (NULL);
1370 	}
1371 
1372 	ip_mcast_mapping(ill, addr, hw_addr);
1373 	return (mp);
1374 }
1375 
1376 void
1377 ip_ndp_resolve(ncec_t *ncec)
1378 {
1379 	in_addr_t	sender4 = INADDR_ANY;
1380 	in6_addr_t	sender6 = ipv6_all_zeros;
1381 	ill_t		*src_ill;
1382 	uint32_t	ms;
1383 
1384 	src_ill = nce_resolve_src(ncec, &sender6);
1385 	if (src_ill == NULL) {
1386 		/* Make sure we try again later */
1387 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1388 		nce_restart_timer(ncec, (clock_t)ms);
1389 		return;
1390 	}
1391 	if (ncec->ncec_ipversion == IPV4_VERSION)
1392 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1393 	mutex_enter(&ncec->ncec_lock);
1394 	if (ncec->ncec_ipversion == IPV6_VERSION)
1395 		ms = ndp_solicit(ncec, sender6, src_ill);
1396 	else
1397 		ms = arp_request(ncec, sender4, src_ill);
1398 	mutex_exit(&ncec->ncec_lock);
1399 	if (ms == 0) {
1400 		if (ncec->ncec_state != ND_REACHABLE) {
1401 			if (ncec->ncec_ipversion == IPV6_VERSION)
1402 				ndp_resolv_failed(ncec);
1403 			else
1404 				arp_resolv_failed(ncec);
1405 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1406 			nce_make_unreachable(ncec);
1407 			ncec_delete(ncec);
1408 		}
1409 	} else {
1410 		nce_restart_timer(ncec, (clock_t)ms);
1411 	}
1412 done:
1413 	ill_refrele(src_ill);
1414 }
1415 
1416 /*
1417  * Send an IPv6 neighbor solicitation.
1418  * Returns number of milliseconds after which we should either rexmit or abort.
1419  * Return of zero means we should abort.
1420  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1421  * The optional source address is used as a hint to ndp_solicit for
1422  * which source to use in the packet.
1423  *
1424  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1425  * the packet.
1426  */
1427 uint32_t
1428 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1429 {
1430 	in6_addr_t	dst;
1431 	boolean_t	dropped = B_FALSE;
1432 
1433 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1434 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1435 
1436 	if (ncec->ncec_rcnt == 0)
1437 		return (0);
1438 
1439 	dst = ncec->ncec_addr;
1440 	ncec->ncec_rcnt--;
1441 	mutex_exit(&ncec->ncec_lock);
1442 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1443 	    ill->ill_phys_addr_length, &src, &dst, 0);
1444 	mutex_enter(&ncec->ncec_lock);
1445 	if (dropped)
1446 		ncec->ncec_rcnt++;
1447 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1448 }
1449 
1450 /*
1451  * Attempt to recover an address on an interface that's been marked as a
1452  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1453  * no easy way to just probe the address and have the right thing happen if
1454  * it's no longer in use.  Instead, we just bring it up normally and allow the
1455  * regular interface start-up logic to probe for a remaining duplicate and take
1456  * us back down if necessary.
1457  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1458  * ip_ndp_excl.
1459  */
1460 /* ARGSUSED */
1461 void
1462 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1463 {
1464 	ill_t	*ill = rq->q_ptr;
1465 	ipif_t	*ipif;
1466 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1467 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1468 	boolean_t addr_equal;
1469 
1470 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1471 		/*
1472 		 * We do not support recovery of proxy ARP'd interfaces,
1473 		 * because the system lacks a complete proxy ARP mechanism.
1474 		 */
1475 		if (ill->ill_isv6) {
1476 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1477 			    addr6);
1478 		} else {
1479 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1480 		}
1481 
1482 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1483 			continue;
1484 
1485 		/*
1486 		 * If we have already recovered or if the interface is going
1487 		 * away, then ignore.
1488 		 */
1489 		mutex_enter(&ill->ill_lock);
1490 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1491 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1492 			mutex_exit(&ill->ill_lock);
1493 			continue;
1494 		}
1495 
1496 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1497 		ill->ill_ipif_dup_count--;
1498 		mutex_exit(&ill->ill_lock);
1499 		ipif->ipif_was_dup = B_TRUE;
1500 
1501 		if (ill->ill_isv6) {
1502 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1503 			(void) ipif_up_done_v6(ipif);
1504 		} else {
1505 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1506 			    EINPROGRESS);
1507 			(void) ipif_up_done(ipif);
1508 		}
1509 	}
1510 	freeb(mp);
1511 }
1512 
1513 /*
1514  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1515  * As long as someone else holds the address, the interface will stay down.
1516  * When that conflict goes away, the interface is brought back up.  This is
1517  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1518  * server will recover from a failure.
1519  *
1520  * For DHCP and temporary addresses, recovery is not done in the kernel.
1521  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1522  *
1523  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1524  */
1525 void
1526 ipif_dup_recovery(void *arg)
1527 {
1528 	ipif_t *ipif = arg;
1529 
1530 	ipif->ipif_recovery_id = 0;
1531 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1532 		return;
1533 
1534 	/*
1535 	 * No lock, because this is just an optimization.
1536 	 */
1537 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1538 		return;
1539 
1540 	/* If the link is down, we'll retry this later */
1541 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1542 		return;
1543 
1544 	ipif_do_recovery(ipif);
1545 }
1546 
1547 /*
1548  * Perform interface recovery by forcing the duplicate interfaces up and
1549  * allowing the system to determine which ones should stay up.
1550  *
1551  * Called both by recovery timer expiry and link-up notification.
1552  */
1553 void
1554 ipif_do_recovery(ipif_t *ipif)
1555 {
1556 	ill_t *ill = ipif->ipif_ill;
1557 	mblk_t *mp;
1558 	ip_stack_t *ipst = ill->ill_ipst;
1559 	size_t mp_size;
1560 
1561 	if (ipif->ipif_isv6)
1562 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1563 	else
1564 		mp_size = sizeof (ipif->ipif_lcl_addr);
1565 	mp = allocb(mp_size, BPRI_MED);
1566 	if (mp == NULL) {
1567 		mutex_enter(&ill->ill_lock);
1568 		if (ipst->ips_ip_dup_recovery > 0 &&
1569 		    ipif->ipif_recovery_id == 0 &&
1570 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1571 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1572 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1573 		}
1574 		mutex_exit(&ill->ill_lock);
1575 	} else {
1576 		/*
1577 		 * A recovery timer may still be running if we got here from
1578 		 * ill_restart_dad(); cancel that timer.
1579 		 */
1580 		if (ipif->ipif_recovery_id != 0)
1581 			(void) untimeout(ipif->ipif_recovery_id);
1582 		ipif->ipif_recovery_id = 0;
1583 
1584 		if (ipif->ipif_isv6) {
1585 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1586 			    sizeof (ipif->ipif_v6lcl_addr));
1587 		} else  {
1588 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1589 			    sizeof (ipif->ipif_lcl_addr));
1590 		}
1591 		ill_refhold(ill);
1592 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1593 		    B_FALSE);
1594 	}
1595 }
1596 
1597 /*
1598  * Find the MAC and IP addresses in an NA/NS message.
1599  */
1600 static void
1601 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1602     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1603 {
1604 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1605 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1606 	uchar_t *addr;
1607 	int alen;
1608 
1609 	/* icmp_inbound_v6 ensures this */
1610 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1611 
1612 	addr = ira->ira_l2src;
1613 	alen = ill->ill_phys_addr_length;
1614 	if (alen > 0) {
1615 		*haddr = addr;
1616 		*haddrlenp = alen;
1617 	} else {
1618 		*haddr = NULL;
1619 		*haddrlenp = 0;
1620 	}
1621 
1622 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1623 	*targp = ns->nd_ns_target;
1624 }
1625 
1626 /*
1627  * This is for exclusive changes due to NDP duplicate address detection
1628  * failure.
1629  */
1630 /* ARGSUSED */
1631 static void
1632 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1633 {
1634 	ill_t	*ill = rq->q_ptr;
1635 	ipif_t	*ipif;
1636 	uchar_t	*haddr;
1637 	uint_t	haddrlen;
1638 	ip_stack_t *ipst = ill->ill_ipst;
1639 	in6_addr_t targ;
1640 	ip_recv_attr_t iras;
1641 	mblk_t	*attrmp;
1642 
1643 	attrmp = mp;
1644 	mp = mp->b_cont;
1645 	attrmp->b_cont = NULL;
1646 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1647 		/* The ill or ip_stack_t disappeared on us */
1648 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1649 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1650 		freemsg(mp);
1651 		ira_cleanup(&iras, B_TRUE);
1652 		return;
1653 	}
1654 
1655 	ASSERT(ill == iras.ira_rill);
1656 
1657 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1658 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1659 		/*
1660 		 * Ignore conflicts generated by misbehaving switches that
1661 		 * just reflect our own messages back to us.  For IPMP, we may
1662 		 * see reflections across any ill in the illgrp.
1663 		 *
1664 		 * RFC2462 and revisions tried to detect both the case
1665 		 * when a statically configured IPv6 address is a duplicate,
1666 		 * and the case when the L2 address itself is a duplicate. The
1667 		 * later is important because, with stateles address autoconf,
1668 		 * if the L2 address is a duplicate, the resulting IPv6
1669 		 * address(es) would also be duplicates. We rely on DAD of the
1670 		 * IPv6 address itself to detect the latter case.
1671 		 */
1672 		/* For an under ill_grp can change under lock */
1673 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1674 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1675 		    IS_UNDER_IPMP(ill) &&
1676 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1677 		    haddrlen) != NULL) {
1678 			rw_exit(&ipst->ips_ill_g_lock);
1679 			goto ignore_conflict;
1680 		}
1681 		rw_exit(&ipst->ips_ill_g_lock);
1682 	}
1683 
1684 	/*
1685 	 * Look up the appropriate ipif.
1686 	 */
1687 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1688 	if (ipif == NULL)
1689 		goto ignore_conflict;
1690 
1691 	/* Reload the ill to match the ipif */
1692 	ill = ipif->ipif_ill;
1693 
1694 	/* If it's already duplicate or ineligible, then don't do anything. */
1695 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1696 		ipif_refrele(ipif);
1697 		goto ignore_conflict;
1698 	}
1699 
1700 	/*
1701 	 * If this is a failure during duplicate recovery, then don't
1702 	 * complain.  It may take a long time to recover.
1703 	 */
1704 	if (!ipif->ipif_was_dup) {
1705 		char ibuf[LIFNAMSIZ];
1706 		char hbuf[MAC_STR_LEN];
1707 		char sbuf[INET6_ADDRSTRLEN];
1708 
1709 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1710 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1711 		    " disabled", ibuf,
1712 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1713 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1714 	}
1715 	mutex_enter(&ill->ill_lock);
1716 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1717 	ipif->ipif_flags |= IPIF_DUPLICATE;
1718 	ill->ill_ipif_dup_count++;
1719 	mutex_exit(&ill->ill_lock);
1720 	(void) ipif_down(ipif, NULL, NULL);
1721 	(void) ipif_down_tail(ipif);
1722 	mutex_enter(&ill->ill_lock);
1723 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1724 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1725 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1726 	    ipst->ips_ip_dup_recovery > 0) {
1727 		ASSERT(ipif->ipif_recovery_id == 0);
1728 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1729 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1730 	}
1731 	mutex_exit(&ill->ill_lock);
1732 	ipif_refrele(ipif);
1733 
1734 ignore_conflict:
1735 	freemsg(mp);
1736 	ira_cleanup(&iras, B_TRUE);
1737 }
1738 
1739 /*
1740  * Handle failure by tearing down the ipifs with the specified address.  Note
1741  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1742  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1743  * we start a timer on the ipif.
1744  * Caller has to free mp;
1745  */
1746 static void
1747 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1748 {
1749 	const uchar_t	*haddr;
1750 	ill_t		*ill = ira->ira_rill;
1751 
1752 	/*
1753 	 * Ignore conflicts generated by misbehaving switches that just
1754 	 * reflect our own messages back to us.
1755 	 */
1756 
1757 	/* icmp_inbound_v6 ensures this */
1758 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1759 	haddr = ira->ira_l2src;
1760 	if (haddr != NULL &&
1761 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1762 		return;
1763 	}
1764 
1765 	if ((mp = copymsg(mp)) != NULL) {
1766 		mblk_t	*attrmp;
1767 
1768 		attrmp = ip_recv_attr_to_mblk(ira);
1769 		if (attrmp == NULL) {
1770 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1771 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1772 			freemsg(mp);
1773 		} else {
1774 			ASSERT(attrmp->b_cont == NULL);
1775 			attrmp->b_cont = mp;
1776 			mp = attrmp;
1777 			ill_refhold(ill);
1778 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1779 			    B_FALSE);
1780 		}
1781 	}
1782 }
1783 
1784 /*
1785  * Handle a discovered conflict: some other system is advertising that it owns
1786  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1787  * interface.
1788  *
1789  * Handles both IPv4 and IPv6
1790  */
1791 boolean_t
1792 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1793 {
1794 	ipif_t		*ipif;
1795 	clock_t		now;
1796 	uint_t		maxdefense;
1797 	uint_t		defs;
1798 	ill_t		*ill = ira->ira_ill;
1799 	ip_stack_t	*ipst = ill->ill_ipst;
1800 	uint32_t	elapsed;
1801 	boolean_t	isv6 = ill->ill_isv6;
1802 	ipaddr_t	ncec_addr;
1803 
1804 	if (isv6) {
1805 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1806 		    ipst);
1807 	} else {
1808 		if (arp_no_defense) {
1809 			/*
1810 			 * Yes, there is a conflict, but no, we do not
1811 			 * defend ourself.
1812 			 */
1813 			return (B_TRUE);
1814 		}
1815 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1816 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1817 		    ipst);
1818 	}
1819 	if (ipif == NULL)
1820 		return (B_FALSE);
1821 
1822 	/*
1823 	 * First, figure out if this address is disposable.
1824 	 */
1825 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1826 		maxdefense = ipst->ips_ip_max_temp_defend;
1827 	else
1828 		maxdefense = ipst->ips_ip_max_defend;
1829 
1830 	/*
1831 	 * Now figure out how many times we've defended ourselves.  Ignore
1832 	 * defenses that happened long in the past.
1833 	 */
1834 	now = ddi_get_lbolt();
1835 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1836 	mutex_enter(&ncec->ncec_lock);
1837 	if ((defs = ncec->ncec_defense_count) > 0 &&
1838 	    elapsed > ipst->ips_ip_defend_interval) {
1839 		/*
1840 		 * ip_defend_interval has elapsed.
1841 		 * reset the defense count.
1842 		 */
1843 		ncec->ncec_defense_count = defs = 0;
1844 	}
1845 	ncec->ncec_defense_count++;
1846 	ncec->ncec_last_time_defended = now;
1847 	mutex_exit(&ncec->ncec_lock);
1848 	ipif_refrele(ipif);
1849 
1850 	/*
1851 	 * If we've defended ourselves too many times already, then give up and
1852 	 * tear down the interface(s) using this address.
1853 	 * Otherwise, caller has to defend by sending out an announce.
1854 	 */
1855 	if (defs >= maxdefense) {
1856 		if (isv6)
1857 			ndp_failure(mp, ira);
1858 		else
1859 			arp_failure(mp, ira);
1860 	} else {
1861 		return (B_TRUE); /* caller must defend this address */
1862 	}
1863 	return (B_FALSE);
1864 }
1865 
1866 /*
1867  * Handle reception of Neighbor Solicitation messages.
1868  */
1869 static void
1870 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1871 {
1872 	ill_t		*ill = ira->ira_ill, *under_ill;
1873 	nd_neighbor_solicit_t *ns;
1874 	uint32_t	hlen = ill->ill_phys_addr_length;
1875 	uchar_t		*haddr = NULL;
1876 	icmp6_t		*icmp_nd;
1877 	ip6_t		*ip6h;
1878 	ncec_t		*our_ncec = NULL;
1879 	in6_addr_t	target;
1880 	in6_addr_t	src;
1881 	int		len;
1882 	int		flag = 0;
1883 	nd_opt_hdr_t	*opt = NULL;
1884 	boolean_t	bad_solicit = B_FALSE;
1885 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1886 	boolean_t	need_ill_refrele = B_FALSE;
1887 
1888 	ip6h = (ip6_t *)mp->b_rptr;
1889 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1890 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1891 	src = ip6h->ip6_src;
1892 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1893 	target = ns->nd_ns_target;
1894 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1895 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1896 		if (ip_debug > 2) {
1897 			/* ip1dbg */
1898 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1899 			    AF_INET6, &target);
1900 		}
1901 		bad_solicit = B_TRUE;
1902 		goto done;
1903 	}
1904 	if (len > sizeof (nd_neighbor_solicit_t)) {
1905 		/* Options present */
1906 		opt = (nd_opt_hdr_t *)&ns[1];
1907 		len -= sizeof (nd_neighbor_solicit_t);
1908 		if (!ndp_verify_optlen(opt, len)) {
1909 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1910 			bad_solicit = B_TRUE;
1911 			goto done;
1912 		}
1913 	}
1914 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1915 		/* Check to see if this is a valid DAD solicitation */
1916 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1917 			if (ip_debug > 2) {
1918 				/* ip1dbg */
1919 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1920 				    "Destination is not solicited node "
1921 				    "multicast %s\n", AF_INET6,
1922 				    &ip6h->ip6_dst);
1923 			}
1924 			bad_solicit = B_TRUE;
1925 			goto done;
1926 		}
1927 	}
1928 
1929 	/*
1930 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1931 	 * received this packet if it's multicast) is not the ill tied to
1932 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1933 	 * to ensure we find the associated NCE.
1934 	 */
1935 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1936 	/*
1937 	 * If this is a valid Solicitation for an address we are publishing,
1938 	 * then a PUBLISH entry should exist in the cache
1939 	 */
1940 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1941 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1942 		    "ifname=%s ", ill->ill_name));
1943 		if (ip_debug > 2) {
1944 			/* ip1dbg */
1945 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1946 		}
1947 		if (our_ncec == NULL)
1948 			bad_solicit = B_TRUE;
1949 		goto done;
1950 	}
1951 
1952 	/* At this point we should have a verified NS per spec */
1953 	if (opt != NULL) {
1954 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1955 		if (opt != NULL) {
1956 			haddr = (uchar_t *)&opt[1];
1957 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1958 			    hlen == 0) {
1959 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1960 				bad_solicit = B_TRUE;
1961 				goto done;
1962 			}
1963 		}
1964 	}
1965 
1966 	/* If sending directly to peer, set the unicast flag */
1967 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1968 		flag |= NDP_UNICAST;
1969 
1970 	/*
1971 	 * Create/update the entry for the soliciting node on the ipmp_ill.
1972 	 * or respond to outstanding queries, don't if
1973 	 * the source is unspecified address.
1974 	 */
1975 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1976 		int	err;
1977 		nce_t	*nnce;
1978 
1979 		ASSERT(ill->ill_isv6);
1980 		/*
1981 		 * Regular solicitations *must* include the Source Link-Layer
1982 		 * Address option.  Ignore messages that do not.
1983 		 */
1984 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1985 			ip1dbg(("ndp_input_solicit: source link-layer address "
1986 			    "option missing with a specified source.\n"));
1987 			bad_solicit = B_TRUE;
1988 			goto done;
1989 		}
1990 
1991 		/*
1992 		 * This is a regular solicitation.  If we're still in the
1993 		 * process of verifying the address, then don't respond at all
1994 		 * and don't keep track of the sender.
1995 		 */
1996 		if (our_ncec->ncec_state == ND_PROBE)
1997 			goto done;
1998 
1999 		/*
2000 		 * If the solicitation doesn't have sender hardware address
2001 		 * (legal for unicast solicitation), then process without
2002 		 * installing the return NCE.  Either we already know it, or
2003 		 * we'll be forced to look it up when (and if) we reply to the
2004 		 * packet.
2005 		 */
2006 		if (haddr == NULL)
2007 			goto no_source;
2008 
2009 		under_ill = ill;
2010 		if (IS_UNDER_IPMP(under_ill)) {
2011 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
2012 			if (ill == NULL)
2013 				ill = under_ill;
2014 			else
2015 				need_ill_refrele = B_TRUE;
2016 		}
2017 		err = nce_lookup_then_add_v6(ill,
2018 		    haddr, hlen,
2019 		    &src,	/* Soliciting nodes address */
2020 		    0,
2021 		    ND_STALE,
2022 		    &nnce);
2023 
2024 		if (need_ill_refrele) {
2025 			ill_refrele(ill);
2026 			ill = under_ill;
2027 			need_ill_refrele =  B_FALSE;
2028 		}
2029 		switch (err) {
2030 		case 0:
2031 			/* done with this entry */
2032 			nce_refrele(nnce);
2033 			break;
2034 		case EEXIST:
2035 			/*
2036 			 * B_FALSE indicates this is not an an advertisement.
2037 			 */
2038 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
2039 			nce_refrele(nnce);
2040 			break;
2041 		default:
2042 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
2043 			    err));
2044 			goto done;
2045 		}
2046 no_source:
2047 		flag |= NDP_SOLICITED;
2048 	} else {
2049 		/*
2050 		 * No source link layer address option should be present in a
2051 		 * valid DAD request.
2052 		 */
2053 		if (haddr != NULL) {
2054 			ip1dbg(("ndp_input_solicit: source link-layer address "
2055 			    "option present with an unspecified source.\n"));
2056 			bad_solicit = B_TRUE;
2057 			goto done;
2058 		}
2059 		if (our_ncec->ncec_state == ND_PROBE) {
2060 			/*
2061 			 * Internally looped-back probes will have
2062 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
2063 			 * transmissions.
2064 			 */
2065 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
2066 				/*
2067 				 * If someone else is probing our address, then
2068 				 * we've crossed wires.  Declare failure.
2069 				 */
2070 				ndp_failure(mp, ira);
2071 			}
2072 			goto done;
2073 		}
2074 		/*
2075 		 * This is a DAD probe.  Multicast the advertisement to the
2076 		 * all-nodes address.
2077 		 */
2078 		src = ipv6_all_hosts_mcast;
2079 	}
2080 	flag |= nce_advert_flags(our_ncec);
2081 	(void) ndp_xmit(ill,
2082 	    ND_NEIGHBOR_ADVERT,
2083 	    our_ncec->ncec_lladdr,
2084 	    our_ncec->ncec_lladdr_length,
2085 	    &target,	/* Source and target of the advertisement pkt */
2086 	    &src,	/* IP Destination (source of original pkt) */
2087 	    flag);
2088 done:
2089 	if (bad_solicit)
2090 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2091 	if (our_ncec != NULL)
2092 		ncec_refrele(our_ncec);
2093 }
2094 
2095 /*
2096  * Handle reception of Neighbor Solicitation messages
2097  */
2098 void
2099 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
2100 {
2101 	ill_t		*ill = ira->ira_ill;
2102 	nd_neighbor_advert_t *na;
2103 	uint32_t	hlen = ill->ill_phys_addr_length;
2104 	uchar_t		*haddr = NULL;
2105 	icmp6_t		*icmp_nd;
2106 	ip6_t		*ip6h;
2107 	ncec_t		*dst_ncec = NULL;
2108 	in6_addr_t	target;
2109 	nd_opt_hdr_t	*opt = NULL;
2110 	int		len;
2111 	ip_stack_t	*ipst = ill->ill_ipst;
2112 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2113 
2114 	ip6h = (ip6_t *)mp->b_rptr;
2115 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2116 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2117 	na = (nd_neighbor_advert_t *)icmp_nd;
2118 
2119 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2120 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2121 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2122 		    "solicited flag is not zero\n"));
2123 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2124 		return;
2125 	}
2126 	target = na->nd_na_target;
2127 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
2128 	    IN6_IS_ADDR_LOOPBACK(&target)) {
2129 		if (ip_debug > 2) {
2130 			/* ip1dbg */
2131 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
2132 			    AF_INET6, &target);
2133 		}
2134 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2135 		return;
2136 	}
2137 	if (len > sizeof (nd_neighbor_advert_t)) {
2138 		opt = (nd_opt_hdr_t *)&na[1];
2139 		if (!ndp_verify_optlen(opt,
2140 		    len - sizeof (nd_neighbor_advert_t))) {
2141 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2142 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2143 			return;
2144 		}
2145 		/* At this point we have a verified NA per spec */
2146 		len -= sizeof (nd_neighbor_advert_t);
2147 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2148 		if (opt != NULL) {
2149 			haddr = (uchar_t *)&opt[1];
2150 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2151 			    hlen == 0) {
2152 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2153 				BUMP_MIB(mib,
2154 				    ipv6IfIcmpInBadNeighborAdvertisements);
2155 				return;
2156 			}
2157 		}
2158 	}
2159 
2160 	/*
2161 	 * NOTE: we match across the illgrp since we need to do DAD for all of
2162 	 * our local addresses, and those are spread across all the active
2163 	 * ills in the group.
2164 	 */
2165 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
2166 		return;
2167 
2168 	if (NCE_PUBLISH(dst_ncec)) {
2169 		/*
2170 		 * Someone just advertised an addresses that we publish. First,
2171 		 * check it it was us -- if so, we can safely ignore it.
2172 		 * We don't get the haddr from the ira_l2src because, in the
2173 		 * case that the packet originated from us, on an IPMP group,
2174 		 * the ira_l2src may would be the link-layer address of the
2175 		 * cast_ill used to send the packet, which may not be the same
2176 		 * as the dst_ncec->ncec_lladdr of the address.
2177 		 */
2178 		if (haddr != NULL) {
2179 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
2180 				goto out;
2181 
2182 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
2183 				goto out;   /* from us -- no conflict */
2184 
2185 			/*
2186 			 * If we're in an IPMP group, check if this is an echo
2187 			 * from another ill in the group.  Use the double-
2188 			 * checked locking pattern to avoid grabbing
2189 			 * ill_g_lock in the non-IPMP case.
2190 			 */
2191 			if (IS_UNDER_IPMP(ill)) {
2192 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2193 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
2194 				    ill->ill_grp, haddr, hlen) != NULL) {
2195 					rw_exit(&ipst->ips_ill_g_lock);
2196 					goto out;
2197 				}
2198 				rw_exit(&ipst->ips_ill_g_lock);
2199 			}
2200 		}
2201 
2202 		/*
2203 		 * This appears to be a real conflict.  If we're trying to
2204 		 * configure this NCE (ND_PROBE), then shut it down.
2205 		 * Otherwise, handle the discovered conflict.
2206 		 */
2207 		if (dst_ncec->ncec_state == ND_PROBE) {
2208 			ndp_failure(mp, ira);
2209 		} else {
2210 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
2211 				char hbuf[MAC_STR_LEN];
2212 				char sbuf[INET6_ADDRSTRLEN];
2213 
2214 				cmn_err(CE_WARN,
2215 				    "node '%s' is using %s on %s",
2216 				    inet_ntop(AF_INET6, &target, sbuf,
2217 				    sizeof (sbuf)),
2218 				    haddr == NULL ? "<none>" :
2219 				    mac_colon_addr(haddr, hlen, hbuf,
2220 				    sizeof (hbuf)), ill->ill_name);
2221 				/*
2222 				 * RFC 4862, Section 5.4.4 does not mandate
2223 				 * any specific behavior when an NA matches
2224 				 * a non-tentative address assigned to the
2225 				 * receiver. We make the choice of defending
2226 				 * our address, based on the assumption that
2227 				 * the sender has not detected the Duplicate.
2228 				 *
2229 				 * ncec_last_time_defended has been adjusted
2230 				 * in ip_nce_conflict()
2231 				 */
2232 				(void) ndp_announce(dst_ncec);
2233 			}
2234 		}
2235 	} else {
2236 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2237 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2238 
2239 		/* B_TRUE indicates this an advertisement */
2240 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2241 	}
2242 out:
2243 	ncec_refrele(dst_ncec);
2244 }
2245 
2246 /*
2247  * Process NDP neighbor solicitation/advertisement messages.
2248  * The checksum has already checked o.k before reaching here.
2249  * Information about the datalink header is contained in ira_l2src, but
2250  * that should be ignored for loopback packets.
2251  */
2252 void
2253 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2254 {
2255 	ill_t		*ill = ira->ira_rill;
2256 	icmp6_t		*icmp_nd;
2257 	ip6_t		*ip6h;
2258 	int		len;
2259 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2260 	ill_t		*orig_ill = NULL;
2261 
2262 	/*
2263 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2264 	 * and make it be the IPMP upper so avoid being confused by a packet
2265 	 * addressed to a unicast address on a different ill.
2266 	 */
2267 	if (IS_UNDER_IPMP(ill)) {
2268 		orig_ill = ill;
2269 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2270 		if (ill == NULL) {
2271 			ill = orig_ill;
2272 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2273 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2274 			    mp, ill);
2275 			freemsg(mp);
2276 			return;
2277 		}
2278 		ASSERT(ill != orig_ill);
2279 		orig_ill = ira->ira_ill;
2280 		ira->ira_ill = ill;
2281 		mib = ill->ill_icmp6_mib;
2282 	}
2283 	if (!pullupmsg(mp, -1)) {
2284 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2285 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2286 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2287 		goto done;
2288 	}
2289 	ip6h = (ip6_t *)mp->b_rptr;
2290 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2291 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2292 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2293 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2294 		goto done;
2295 	}
2296 	/*
2297 	 * NDP does not accept any extension headers between the
2298 	 * IP header and the ICMP header since e.g. a routing
2299 	 * header could be dangerous.
2300 	 * This assumes that any AH or ESP headers are removed
2301 	 * by ip prior to passing the packet to ndp_input.
2302 	 */
2303 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2304 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2305 		    ip6h->ip6_nxt));
2306 		ip_drop_input("Wrong next header", mp, ill);
2307 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2308 		goto done;
2309 	}
2310 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2311 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2312 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2313 	if (icmp_nd->icmp6_code != 0) {
2314 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2315 		ip_drop_input("code non-zero", mp, ill);
2316 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2317 		goto done;
2318 	}
2319 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2320 	/*
2321 	 * Make sure packet length is large enough for either
2322 	 * a NS or a NA icmp packet.
2323 	 */
2324 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2325 		ip1dbg(("ndp_input: packet too short\n"));
2326 		ip_drop_input("packet too short", mp, ill);
2327 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2328 		goto done;
2329 	}
2330 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2331 		ndp_input_solicit(mp, ira);
2332 	} else {
2333 		ndp_input_advert(mp, ira);
2334 	}
2335 done:
2336 	freemsg(mp);
2337 	if (orig_ill != NULL) {
2338 		ill_refrele(ill);
2339 		ira->ira_ill = orig_ill;
2340 	}
2341 }
2342 
2343 /*
2344  * ndp_xmit is called to form and transmit a ND solicitation or
2345  * advertisement ICMP packet.
2346  *
2347  * If the source address is unspecified and this isn't a probe (used for
2348  * duplicate address detection), an appropriate source address and link layer
2349  * address will be chosen here.  The link layer address option is included if
2350  * the source is specified (i.e., all non-probe packets), and omitted (per the
2351  * specification) otherwise.
2352  *
2353  * It returns B_FALSE only if it does a successful put() to the
2354  * corresponding ill's ill_wq otherwise returns B_TRUE.
2355  */
2356 static boolean_t
2357 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2358     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2359 {
2360 	uint32_t	len;
2361 	icmp6_t		*icmp6;
2362 	mblk_t		*mp;
2363 	ip6_t		*ip6h;
2364 	nd_opt_hdr_t	*opt;
2365 	uint_t		plen;
2366 	zoneid_t	zoneid = GLOBAL_ZONEID;
2367 	ill_t		*hwaddr_ill = ill;
2368 	ip_xmit_attr_t	ixas;
2369 	ip_stack_t	*ipst = ill->ill_ipst;
2370 	boolean_t	need_refrele = B_FALSE;
2371 	boolean_t	probe = B_FALSE;
2372 
2373 	if (IS_UNDER_IPMP(ill)) {
2374 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2375 		/*
2376 		 * We send non-probe packets on the upper IPMP interface.
2377 		 * ip_output_simple() will use cast_ill for sending any
2378 		 * multicast packets. Note that we can't follow the same
2379 		 * logic for probe packets because all interfaces in the ipmp
2380 		 * group may have failed, so that we really want to only try
2381 		 * to send the ND packet on the ill corresponding to the src
2382 		 * address.
2383 		 */
2384 		if (!probe) {
2385 			ill = ipmp_ill_hold_ipmp_ill(ill);
2386 			if (ill != NULL)
2387 				need_refrele = B_TRUE;
2388 			else
2389 				ill = hwaddr_ill;
2390 		}
2391 	}
2392 
2393 	/*
2394 	 * If we have a unspecified source(sender) address, select a
2395 	 * proper source address for the solicitation here itself so
2396 	 * that we can initialize the h/w address correctly.
2397 	 *
2398 	 * If the sender is specified then we use this address in order
2399 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2400 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2401 	 * by IP (we cannot guarantee that the global zone has an interface
2402 	 * route to the destination).
2403 	 *
2404 	 * Note that the NA never comes here with the unspecified source
2405 	 * address.
2406 	 */
2407 
2408 	/*
2409 	 * Probes will have unspec src at this point.
2410 	 */
2411 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2412 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2413 		/*
2414 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2415 		 * ALL_ZONES if it cannot find a matching ipif for the address
2416 		 * we are trying to use. In this case we err on the side of
2417 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2418 		 */
2419 		if (zoneid == ALL_ZONES)
2420 			zoneid = GLOBAL_ZONEID;
2421 	}
2422 
2423 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2424 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2425 	mp = allocb(len,  BPRI_LO);
2426 	if (mp == NULL) {
2427 		if (need_refrele)
2428 			ill_refrele(ill);
2429 		return (B_TRUE);
2430 	}
2431 
2432 	bzero((char *)mp->b_rptr, len);
2433 	mp->b_wptr = mp->b_rptr + len;
2434 
2435 	bzero(&ixas, sizeof (ixas));
2436 	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2437 
2438 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2439 	ixas.ixa_ipst = ipst;
2440 	ixas.ixa_cred = kcred;
2441 	ixas.ixa_cpid = NOPID;
2442 	ixas.ixa_tsl = NULL;
2443 	ixas.ixa_zoneid = zoneid;
2444 
2445 	ip6h = (ip6_t *)mp->b_rptr;
2446 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2447 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2448 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2449 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2450 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2451 	ip6h->ip6_dst = *target;
2452 	icmp6 = (icmp6_t *)&ip6h[1];
2453 
2454 	if (hw_addr_len != 0) {
2455 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2456 		    sizeof (nd_neighbor_advert_t));
2457 	} else {
2458 		opt = NULL;
2459 	}
2460 	if (operation == ND_NEIGHBOR_SOLICIT) {
2461 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2462 
2463 		if (opt != NULL && !(flag & NDP_PROBE)) {
2464 			/*
2465 			 * Note that we don't send out SLLA for ND probes
2466 			 * per RFC 4862, even though we do send out the src
2467 			 * haddr for IPv4 DAD probes, even though both IPv4
2468 			 * and IPv6 go out with the unspecified/INADDR_ANY
2469 			 * src IP addr.
2470 			 */
2471 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2472 		}
2473 		ip6h->ip6_src = *sender;
2474 		ns->nd_ns_target = *target;
2475 		if (!(flag & NDP_UNICAST)) {
2476 			/* Form multicast address of the target */
2477 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2478 			ip6h->ip6_dst.s6_addr32[3] |=
2479 			    ns->nd_ns_target.s6_addr32[3];
2480 		}
2481 	} else {
2482 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2483 
2484 		ASSERT(!(flag & NDP_PROBE));
2485 		if (opt != NULL)
2486 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2487 		ip6h->ip6_src = *sender;
2488 		na->nd_na_target = *sender;
2489 		if (flag & NDP_ISROUTER)
2490 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2491 		if (flag & NDP_SOLICITED)
2492 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2493 		if (flag & NDP_ORIDE)
2494 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2495 	}
2496 
2497 	if (!(flag & NDP_PROBE)) {
2498 		if (hw_addr != NULL && opt != NULL) {
2499 			/* Fill in link layer address and option len */
2500 			opt->nd_opt_len = (uint8_t)plen;
2501 			bcopy(hw_addr, &opt[1], hw_addr_len);
2502 		}
2503 	}
2504 	if (opt != NULL && opt->nd_opt_type == 0) {
2505 		/* If there's no link layer address option, then strip it. */
2506 		len -= plen * 8;
2507 		mp->b_wptr = mp->b_rptr + len;
2508 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2509 	}
2510 
2511 	icmp6->icmp6_type = (uint8_t)operation;
2512 	icmp6->icmp6_code = 0;
2513 	/*
2514 	 * Prepare for checksum by putting icmp length in the icmp
2515 	 * checksum field. The checksum is calculated in ip_output.c.
2516 	 */
2517 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2518 
2519 	(void) ip_output_simple(mp, &ixas);
2520 	ixa_cleanup(&ixas);
2521 	if (need_refrele)
2522 		ill_refrele(ill);
2523 	return (B_FALSE);
2524 }
2525 
2526 /*
2527  * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2528  * The datapath uses this as an indication that there
2529  * is a problem (as opposed to a NCE that was just
2530  * reclaimed due to lack of memory.
2531  * Note that static ARP entries never become unreachable.
2532  */
2533 void
2534 nce_make_unreachable(ncec_t *ncec)
2535 {
2536 	mutex_enter(&ncec->ncec_lock);
2537 	ncec->ncec_state = ND_UNREACHABLE;
2538 	mutex_exit(&ncec->ncec_lock);
2539 }
2540 
2541 /*
2542  * NCE retransmit timer. Common to IPv4 and IPv6.
2543  * This timer goes off when:
2544  * a. It is time to retransmit a resolution for resolver.
2545  * b. It is time to send reachability probes.
2546  */
2547 void
2548 nce_timer(void *arg)
2549 {
2550 	ncec_t		*ncec = arg;
2551 	ill_t		*ill = ncec->ncec_ill, *src_ill;
2552 	char		addrbuf[INET6_ADDRSTRLEN];
2553 	boolean_t	dropped = B_FALSE;
2554 	ip_stack_t	*ipst = ncec->ncec_ipst;
2555 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2556 	in_addr_t	sender4 = INADDR_ANY;
2557 	in6_addr_t	sender6 = ipv6_all_zeros;
2558 
2559 	/*
2560 	 * The timer has to be cancelled by ncec_delete before doing the final
2561 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2562 	 * until it clears the timeout_id. Before clearing the timeout_id
2563 	 * bump up the refcnt so that we can continue to use the ncec
2564 	 */
2565 	ASSERT(ncec != NULL);
2566 	mutex_enter(&ncec->ncec_lock);
2567 	ncec_refhold_locked(ncec);
2568 	ncec->ncec_timeout_id = 0;
2569 	mutex_exit(&ncec->ncec_lock);
2570 
2571 	src_ill = nce_resolve_src(ncec, &sender6);
2572 	/* if we could not find a sender address, return */
2573 	if (src_ill == NULL) {
2574 		if (!isv6) {
2575 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2576 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2577 			    &sender4, addrbuf, sizeof (addrbuf))));
2578 		} else {
2579 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2580 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2581 		}
2582 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2583 		ncec_refrele(ncec);
2584 		return;
2585 	}
2586 	if (!isv6)
2587 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2588 
2589 	mutex_enter(&ncec->ncec_lock);
2590 	/*
2591 	 * Check the reachability state.
2592 	 */
2593 	switch (ncec->ncec_state) {
2594 	case ND_DELAY:
2595 		ASSERT(ncec->ncec_lladdr != NULL);
2596 		ncec->ncec_state = ND_PROBE;
2597 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2598 		if (isv6) {
2599 			mutex_exit(&ncec->ncec_lock);
2600 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2601 			    src_ill->ill_phys_addr,
2602 			    src_ill->ill_phys_addr_length,
2603 			    &sender6, &ncec->ncec_addr,
2604 			    NDP_UNICAST);
2605 		} else {
2606 			dropped = (arp_request(ncec, sender4, src_ill) == 0);
2607 			mutex_exit(&ncec->ncec_lock);
2608 		}
2609 		if (!dropped) {
2610 			mutex_enter(&ncec->ncec_lock);
2611 			ncec->ncec_pcnt--;
2612 			mutex_exit(&ncec->ncec_lock);
2613 		}
2614 		if (ip_debug > 3) {
2615 			/* ip2dbg */
2616 			pr_addr_dbg("nce_timer: state for %s changed "
2617 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2618 		}
2619 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2620 		break;
2621 	case ND_PROBE:
2622 		/* must be retransmit timer */
2623 		ASSERT(ncec->ncec_pcnt >= -1);
2624 		if (ncec->ncec_pcnt > 0) {
2625 			/*
2626 			 * As per RFC2461, the ncec gets deleted after
2627 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2628 			 * Note that the first unicast solicitation is sent
2629 			 * during the DELAY state.
2630 			 */
2631 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2632 			    ncec->ncec_pcnt,
2633 			    inet_ntop((isv6? AF_INET6 : AF_INET),
2634 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2635 			if (NCE_PUBLISH(ncec)) {
2636 				mutex_exit(&ncec->ncec_lock);
2637 				/*
2638 				 * send out a probe; note that src_ill
2639 				 * is ignored by nce_dad() for all
2640 				 * DAD message types other than IPv6
2641 				 * unicast probes
2642 				 */
2643 				nce_dad(ncec, src_ill, B_TRUE);
2644 			} else {
2645 				ASSERT(src_ill != NULL);
2646 				if (isv6) {
2647 					mutex_exit(&ncec->ncec_lock);
2648 					dropped = ndp_xmit(src_ill,
2649 					    ND_NEIGHBOR_SOLICIT,
2650 					    src_ill->ill_phys_addr,
2651 					    src_ill->ill_phys_addr_length,
2652 					    &sender6, &ncec->ncec_addr,
2653 					    NDP_UNICAST);
2654 				} else {
2655 					/*
2656 					 * since the nce is REACHABLE,
2657 					 * the ARP request will be sent out
2658 					 * as a link-layer unicast.
2659 					 */
2660 					dropped = (arp_request(ncec, sender4,
2661 					    src_ill) == 0);
2662 					mutex_exit(&ncec->ncec_lock);
2663 				}
2664 				if (!dropped) {
2665 					mutex_enter(&ncec->ncec_lock);
2666 					ncec->ncec_pcnt--;
2667 					mutex_exit(&ncec->ncec_lock);
2668 				}
2669 				nce_restart_timer(ncec,
2670 				    ill->ill_reachable_retrans_time);
2671 			}
2672 		} else if (ncec->ncec_pcnt < 0) {
2673 			/* No hope, delete the ncec */
2674 			/* Tell datapath it went bad */
2675 			ncec->ncec_state = ND_UNREACHABLE;
2676 			mutex_exit(&ncec->ncec_lock);
2677 			if (ip_debug > 2) {
2678 				/* ip1dbg */
2679 				pr_addr_dbg("nce_timer: Delete NCE for"
2680 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
2681 				    &ncec->ncec_addr);
2682 			}
2683 			/* if static ARP can't delete. */
2684 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2685 				ncec_delete(ncec);
2686 
2687 		} else if (!NCE_PUBLISH(ncec)) {
2688 			/*
2689 			 * Probe count is 0 for a dynamic entry (one that we
2690 			 * ourselves are not publishing). We should never get
2691 			 * here if NONUD was requested, hence the ASSERT below.
2692 			 */
2693 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2694 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2695 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
2696 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2697 			ncec->ncec_pcnt--;
2698 			mutex_exit(&ncec->ncec_lock);
2699 			/* Wait one interval before killing */
2700 			nce_restart_timer(ncec,
2701 			    ill->ill_reachable_retrans_time);
2702 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2703 			ipif_t *ipif;
2704 			ipaddr_t ncec_addr;
2705 
2706 			/*
2707 			 * We're done probing, and we can now declare this
2708 			 * address to be usable.  Let IP know that it's ok to
2709 			 * use.
2710 			 */
2711 			ncec->ncec_state = ND_REACHABLE;
2712 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2713 			mutex_exit(&ncec->ncec_lock);
2714 			if (isv6) {
2715 				ipif = ipif_lookup_addr_exact_v6(
2716 				    &ncec->ncec_addr, ill, ipst);
2717 			} else {
2718 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2719 				    ncec_addr);
2720 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2721 				    ipst);
2722 			}
2723 			if (ipif != NULL) {
2724 				if (ipif->ipif_was_dup) {
2725 					char ibuf[LIFNAMSIZ];
2726 					char sbuf[INET6_ADDRSTRLEN];
2727 
2728 					ipif->ipif_was_dup = B_FALSE;
2729 					(void) inet_ntop(AF_INET6,
2730 					    &ipif->ipif_v6lcl_addr,
2731 					    sbuf, sizeof (sbuf));
2732 					ipif_get_name(ipif, ibuf,
2733 					    sizeof (ibuf));
2734 					cmn_err(CE_NOTE, "recovered address "
2735 					    "%s on %s", sbuf, ibuf);
2736 				}
2737 				if ((ipif->ipif_flags & IPIF_UP) &&
2738 				    !ipif->ipif_addr_ready)
2739 					ipif_up_notify(ipif);
2740 				ipif->ipif_addr_ready = 1;
2741 				ipif_refrele(ipif);
2742 			}
2743 			if (!isv6 && arp_no_defense)
2744 				break;
2745 			/* Begin defending our new address */
2746 			if (ncec->ncec_unsolicit_count > 0) {
2747 				ncec->ncec_unsolicit_count--;
2748 				if (isv6) {
2749 					dropped = ndp_announce(ncec);
2750 				} else {
2751 					dropped = arp_announce(ncec);
2752 				}
2753 
2754 				if (dropped)
2755 					ncec->ncec_unsolicit_count++;
2756 				else
2757 					ncec->ncec_last_time_defended =
2758 					    ddi_get_lbolt();
2759 			}
2760 			if (ncec->ncec_unsolicit_count > 0) {
2761 				nce_restart_timer(ncec,
2762 				    ANNOUNCE_INTERVAL(isv6));
2763 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
2764 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2765 			}
2766 		} else {
2767 			/*
2768 			 * This is an address we're probing to be our own, but
2769 			 * the ill is down.  Wait until it comes back before
2770 			 * doing anything, but switch to reachable state so
2771 			 * that the restart will work.
2772 			 */
2773 			ncec->ncec_state = ND_REACHABLE;
2774 			mutex_exit(&ncec->ncec_lock);
2775 		}
2776 		break;
2777 	case ND_INCOMPLETE: {
2778 		mblk_t	*mp, *nextmp;
2779 		mblk_t	**prevmpp;
2780 
2781 		/*
2782 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2783 		 * for any IPMP probe packets, and toss them.  IPMP probe
2784 		 * packets will always be at the head of ncec_qd_mp, so that
2785 		 * we can stop at the first queued ND packet that is
2786 		 * not a probe packet.
2787 		 */
2788 		prevmpp = &ncec->ncec_qd_mp;
2789 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2790 			nextmp = mp->b_next;
2791 
2792 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2793 				inet_freemsg(mp);
2794 				ncec->ncec_nprobes--;
2795 				*prevmpp = nextmp;
2796 			} else {
2797 				prevmpp = &mp->b_next;
2798 			}
2799 		}
2800 
2801 		/*
2802 		 * Must be resolver's retransmit timer.
2803 		 */
2804 		mutex_exit(&ncec->ncec_lock);
2805 		ip_ndp_resolve(ncec);
2806 		break;
2807 	}
2808 	case ND_REACHABLE:
2809 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2810 		    ncec->ncec_unsolicit_count != 0) ||
2811 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2812 			if (ncec->ncec_unsolicit_count > 0) {
2813 				ncec->ncec_unsolicit_count--;
2814 				mutex_exit(&ncec->ncec_lock);
2815 				/*
2816 				 * When we get to zero announcements left,
2817 				 * switch to address defense
2818 				 */
2819 			} else {
2820 				boolean_t rate_limit;
2821 
2822 				mutex_exit(&ncec->ncec_lock);
2823 				rate_limit = ill_defend_rate_limit(ill, ncec);
2824 				if (rate_limit) {
2825 					nce_restart_timer(ncec,
2826 					    DEFENSE_INTERVAL(isv6));
2827 					break;
2828 				}
2829 			}
2830 			if (isv6) {
2831 				dropped = ndp_announce(ncec);
2832 			} else {
2833 				dropped = arp_announce(ncec);
2834 			}
2835 			mutex_enter(&ncec->ncec_lock);
2836 			if (dropped) {
2837 				ncec->ncec_unsolicit_count++;
2838 			} else {
2839 				ncec->ncec_last_time_defended =
2840 				    ddi_get_lbolt();
2841 			}
2842 			mutex_exit(&ncec->ncec_lock);
2843 			if (ncec->ncec_unsolicit_count != 0) {
2844 				nce_restart_timer(ncec,
2845 				    ANNOUNCE_INTERVAL(isv6));
2846 			} else {
2847 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2848 			}
2849 		} else {
2850 			mutex_exit(&ncec->ncec_lock);
2851 		}
2852 		break;
2853 	default:
2854 		mutex_exit(&ncec->ncec_lock);
2855 		break;
2856 	}
2857 done:
2858 	ncec_refrele(ncec);
2859 	ill_refrele(src_ill);
2860 }
2861 
2862 /*
2863  * Set a link layer address from the ll_addr passed in.
2864  * Copy SAP from ill.
2865  */
2866 static void
2867 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2868 {
2869 	ill_t	*ill = ncec->ncec_ill;
2870 
2871 	ASSERT(ll_addr != NULL);
2872 	if (ill->ill_phys_addr_length > 0) {
2873 		/*
2874 		 * The bcopy() below used to be called for the physical address
2875 		 * length rather than the link layer address length. For
2876 		 * ethernet and many other media, the phys_addr and lla are
2877 		 * identical.
2878 		 *
2879 		 * The phys_addr and lla may not be the same for devices that
2880 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2881 		 * no known instances of these.
2882 		 *
2883 		 * For PPP or other interfaces with a zero length
2884 		 * physical address, don't do anything here.
2885 		 * The bcopy() with a zero phys_addr length was previously
2886 		 * a no-op for interfaces with a zero-length physical address.
2887 		 * Using the lla for them would change the way they operate.
2888 		 * Doing nothing in such cases preserves expected behavior.
2889 		 */
2890 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2891 	}
2892 }
2893 
2894 boolean_t
2895 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2896     uint32_t ll_addr_len)
2897 {
2898 	ASSERT(ncec->ncec_lladdr != NULL);
2899 	if (ll_addr == NULL)
2900 		return (B_FALSE);
2901 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2902 		return (B_TRUE);
2903 	return (B_FALSE);
2904 }
2905 
2906 /*
2907  * Updates the link layer address or the reachability state of
2908  * a cache entry.  Reset probe counter if needed.
2909  */
2910 void
2911 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2912 {
2913 	ill_t	*ill = ncec->ncec_ill;
2914 	boolean_t need_stop_timer = B_FALSE;
2915 	boolean_t need_fastpath_update = B_FALSE;
2916 	nce_t	*nce = NULL;
2917 	timeout_id_t tid;
2918 
2919 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2920 	/*
2921 	 * If this interface does not do NUD, there is no point
2922 	 * in allowing an update to the cache entry.  Although
2923 	 * we will respond to NS.
2924 	 * The only time we accept an update for a resolver when
2925 	 * NUD is turned off is when it has just been created.
2926 	 * Non-Resolvers will always be created as REACHABLE.
2927 	 */
2928 	if (new_state != ND_UNCHANGED) {
2929 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2930 		    (ncec->ncec_state != ND_INCOMPLETE))
2931 			return;
2932 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2933 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2934 		need_stop_timer = B_TRUE;
2935 		if (new_state == ND_REACHABLE)
2936 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2937 		else {
2938 			/* We force NUD in this case */
2939 			ncec->ncec_last = 0;
2940 		}
2941 		ncec->ncec_state = new_state;
2942 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2943 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2944 		    new_state == ND_INCOMPLETE);
2945 	}
2946 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2947 		tid = ncec->ncec_timeout_id;
2948 		ncec->ncec_timeout_id = 0;
2949 	}
2950 	/*
2951 	 * Re-trigger fastpath probe and
2952 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2953 	 * whatever packets that happens to be transmitting at the time.
2954 	 */
2955 	if (new_ll_addr != NULL) {
2956 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2957 		    ill->ill_phys_addr_length);
2958 		need_fastpath_update = B_TRUE;
2959 	}
2960 	mutex_exit(&ncec->ncec_lock);
2961 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2962 		if (tid != 0)
2963 			(void) untimeout(tid);
2964 	}
2965 	if (need_fastpath_update) {
2966 		/*
2967 		 * Delete any existing existing dlur_mp and fp_mp information.
2968 		 * For IPMP interfaces, all underlying ill's must be checked
2969 		 * and purged.
2970 		 */
2971 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2972 		/*
2973 		 * add the new dlur_mp and fp_mp
2974 		 */
2975 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2976 		if (nce != NULL)
2977 			nce_refrele(nce);
2978 	}
2979 	mutex_enter(&ncec->ncec_lock);
2980 }
2981 
2982 static void
2983 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2984 {
2985 	uint_t	count = 0;
2986 	mblk_t  **mpp, *tmp;
2987 
2988 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2989 
2990 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2991 		if (++count > ncec->ncec_ill->ill_max_buf) {
2992 			tmp = ncec->ncec_qd_mp->b_next;
2993 			ncec->ncec_qd_mp->b_next = NULL;
2994 			/*
2995 			 * if we never create data addrs on the under_ill
2996 			 * does this matter?
2997 			 */
2998 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2999 			    ipIfStatsOutDiscards);
3000 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
3001 			    ncec->ncec_ill);
3002 			freemsg(ncec->ncec_qd_mp);
3003 			ncec->ncec_qd_mp = tmp;
3004 		}
3005 	}
3006 
3007 	if (head_insert) {
3008 		ncec->ncec_nprobes++;
3009 		mp->b_next = ncec->ncec_qd_mp;
3010 		ncec->ncec_qd_mp = mp;
3011 	} else {
3012 		*mpp = mp;
3013 	}
3014 }
3015 
3016 /*
3017  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
3018  * queued at the head or tail of the queue based on the input argument
3019  * 'head_insert'. The caller should specify this argument as B_TRUE if this
3020  * packet is an IPMP probe packet, in which case the following happens:
3021  *
3022  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
3023  *	(non-ipmp_probe) load-speading case where the source address of the ND
3024  *	packet is not tied to ncec_ill. If the ill bound to the source address
3025  *	cannot receive, the response to the ND packet will not be received.
3026  *	However, if ND packets for ncec_ill's probes are queued	behind that ND
3027  *	packet, those probes will also fail to be sent, and thus in.mpathd will
3028  *	 erroneously conclude that ncec_ill has also failed.
3029  *
3030  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did	not succeed on
3031  *	the first attempt.  This ensures that ND problems do not manifest as
3032  *	probe RTT spikes.
3033  *
3034  * We achieve this by inserting ipmp_probe() packets at the head of the
3035  * nce_queue.
3036  *
3037  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
3038  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
3039  */
3040 void
3041 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
3042 {
3043 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3044 	nce_queue_mp_common(ncec, mp, head_insert);
3045 }
3046 
3047 /*
3048  * Called when address resolution failed due to a timeout.
3049  * Send an ICMP unreachable in response to all queued packets.
3050  */
3051 void
3052 ndp_resolv_failed(ncec_t *ncec)
3053 {
3054 	mblk_t	*mp, *nxt_mp;
3055 	char	buf[INET6_ADDRSTRLEN];
3056 	ill_t *ill = ncec->ncec_ill;
3057 	ip_recv_attr_t	iras;
3058 
3059 	bzero(&iras, sizeof (iras));
3060 	iras.ira_flags = 0;
3061 	/*
3062 	 * we are setting the ira_rill to the ipmp_ill (instead of
3063 	 * the actual ill on which the packet was received), but this
3064 	 * is ok because we don't actually need the real ira_rill.
3065 	 * to send the icmp unreachable to the sender.
3066 	 */
3067 	iras.ira_ill = iras.ira_rill = ill;
3068 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3069 	iras.ira_rifindex = iras.ira_ruifindex;
3070 
3071 	ip1dbg(("ndp_resolv_failed: dst %s\n",
3072 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
3073 	mutex_enter(&ncec->ncec_lock);
3074 	mp = ncec->ncec_qd_mp;
3075 	ncec->ncec_qd_mp = NULL;
3076 	ncec->ncec_nprobes = 0;
3077 	mutex_exit(&ncec->ncec_lock);
3078 	while (mp != NULL) {
3079 		nxt_mp = mp->b_next;
3080 		mp->b_next = NULL;
3081 
3082 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3083 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3084 		    mp, ill);
3085 		icmp_unreachable_v6(mp,
3086 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
3087 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3088 		mp = nxt_mp;
3089 	}
3090 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3091 }
3092 
3093 /*
3094  * Handle the completion of NDP and ARP resolution.
3095  */
3096 void
3097 nce_resolv_ok(ncec_t *ncec)
3098 {
3099 	mblk_t *mp;
3100 	uint_t pkt_len;
3101 	iaflags_t ixaflags = IXAF_NO_TRACE;
3102 	nce_t *nce;
3103 	ill_t	*ill = ncec->ncec_ill;
3104 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
3105 	ip_stack_t *ipst = ill->ill_ipst;
3106 
3107 	if (IS_IPMP(ncec->ncec_ill)) {
3108 		nce_resolv_ipmp_ok(ncec);
3109 		return;
3110 	}
3111 	/* non IPMP case */
3112 
3113 	mutex_enter(&ncec->ncec_lock);
3114 	ASSERT(ncec->ncec_nprobes == 0);
3115 	mp = ncec->ncec_qd_mp;
3116 	ncec->ncec_qd_mp = NULL;
3117 	mutex_exit(&ncec->ncec_lock);
3118 
3119 	while (mp != NULL) {
3120 		mblk_t *nxt_mp;
3121 
3122 		if (ill->ill_isv6) {
3123 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
3124 
3125 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
3126 		} else {
3127 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
3128 
3129 			ixaflags |= IXAF_IS_IPV4;
3130 			pkt_len = ntohs(ipha->ipha_length);
3131 		}
3132 		nxt_mp = mp->b_next;
3133 		mp->b_next = NULL;
3134 		/*
3135 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
3136 		 * longer available, but it's ok to drop this flag because TCP
3137 		 * has its own flow-control in effect, so TCP packets
3138 		 * are not likely to get here when flow-control is in effect.
3139 		 */
3140 		mutex_enter(&ill->ill_lock);
3141 		nce = nce_lookup(ill, &ncec->ncec_addr);
3142 		mutex_exit(&ill->ill_lock);
3143 
3144 		if (nce == NULL) {
3145 			if (isv6) {
3146 				BUMP_MIB(&ipst->ips_ip6_mib,
3147 				    ipIfStatsOutDiscards);
3148 			} else {
3149 				BUMP_MIB(&ipst->ips_ip_mib,
3150 				    ipIfStatsOutDiscards);
3151 			}
3152 			ip_drop_output("ipIfStatsOutDiscards - no nce",
3153 			    mp, NULL);
3154 			freemsg(mp);
3155 		} else {
3156 			/*
3157 			 * We don't know the zoneid, but
3158 			 * ip_xmit does not care since IXAF_NO_TRACE
3159 			 * is set. (We traced the packet the first
3160 			 * time through ip_xmit.)
3161 			 */
3162 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
3163 			    ALL_ZONES, 0, NULL);
3164 			nce_refrele(nce);
3165 		}
3166 		mp = nxt_mp;
3167 	}
3168 
3169 	ncec_cb_dispatch(ncec); /* complete callbacks */
3170 }
3171 
3172 /*
3173  * Called by SIOCSNDP* ioctl to add/change an ncec entry
3174  * and the corresponding attributes.
3175  * Disallow states other than ND_REACHABLE or ND_STALE.
3176  */
3177 int
3178 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3179 {
3180 	sin6_t		*sin6;
3181 	in6_addr_t	*addr;
3182 	ncec_t		*ncec;
3183 	nce_t		*nce;
3184 	int		err = 0;
3185 	uint16_t	new_flags = 0;
3186 	uint16_t	old_flags = 0;
3187 	int		inflags = lnr->lnr_flags;
3188 	ip_stack_t	*ipst = ill->ill_ipst;
3189 	boolean_t	do_postprocess = B_FALSE;
3190 
3191 	ASSERT(ill->ill_isv6);
3192 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3193 	    (lnr->lnr_state_create != ND_STALE))
3194 		return (EINVAL);
3195 
3196 	sin6 = (sin6_t *)&lnr->lnr_addr;
3197 	addr = &sin6->sin6_addr;
3198 
3199 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
3200 	ASSERT(!IS_UNDER_IPMP(ill));
3201 	nce = nce_lookup_addr(ill, addr);
3202 	if (nce != NULL)
3203 		new_flags = nce->nce_common->ncec_flags;
3204 
3205 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3206 	case NDF_ISROUTER_ON:
3207 		new_flags |= NCE_F_ISROUTER;
3208 		break;
3209 	case NDF_ISROUTER_OFF:
3210 		new_flags &= ~NCE_F_ISROUTER;
3211 		break;
3212 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3213 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3214 		if (nce != NULL)
3215 			nce_refrele(nce);
3216 		return (EINVAL);
3217 	}
3218 	if (inflags & NDF_STATIC)
3219 		new_flags |= NCE_F_STATIC;
3220 
3221 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3222 	case NDF_ANYCAST_ON:
3223 		new_flags |= NCE_F_ANYCAST;
3224 		break;
3225 	case NDF_ANYCAST_OFF:
3226 		new_flags &= ~NCE_F_ANYCAST;
3227 		break;
3228 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3229 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3230 		if (nce != NULL)
3231 			nce_refrele(nce);
3232 		return (EINVAL);
3233 	}
3234 
3235 	if (nce == NULL) {
3236 		err = nce_add_v6(ill,
3237 		    (uchar_t *)lnr->lnr_hdw_addr,
3238 		    ill->ill_phys_addr_length,
3239 		    addr,
3240 		    new_flags,
3241 		    lnr->lnr_state_create,
3242 		    &nce);
3243 		if (err != 0) {
3244 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3245 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3246 			return (err);
3247 		} else {
3248 			do_postprocess = B_TRUE;
3249 		}
3250 	}
3251 	ncec = nce->nce_common;
3252 	old_flags = ncec->ncec_flags;
3253 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3254 		ncec_router_to_host(ncec);
3255 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3256 		if (do_postprocess)
3257 			err = nce_add_v6_postprocess(nce);
3258 		nce_refrele(nce);
3259 		return (0);
3260 	}
3261 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3262 
3263 	if (do_postprocess)
3264 		err = nce_add_v6_postprocess(nce);
3265 	/*
3266 	 * err cannot be anything other than 0 because we don't support
3267 	 * proxy arp of static addresses.
3268 	 */
3269 	ASSERT(err == 0);
3270 
3271 	mutex_enter(&ncec->ncec_lock);
3272 	ncec->ncec_flags = new_flags;
3273 	mutex_exit(&ncec->ncec_lock);
3274 	/*
3275 	 * Note that we ignore the state at this point, which
3276 	 * should be either STALE or REACHABLE.  Instead we let
3277 	 * the link layer address passed in to determine the state
3278 	 * much like incoming packets.
3279 	 */
3280 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3281 	nce_refrele(nce);
3282 	return (0);
3283 }
3284 
3285 /*
3286  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3287  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3288  * be held to ensure that they are in the same group.
3289  */
3290 static nce_t *
3291 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3292 {
3293 
3294 	nce_t *nce;
3295 
3296 	nce = nce_ill_lookup_then_add(ill, ncec);
3297 
3298 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3299 		return (nce);
3300 
3301 	/*
3302 	 * hold the ncec_lock to synchronize with nce_update() so that,
3303 	 * at the end of this function, the contents of nce_dlur_mp are
3304 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3305 	 * packet may have been sent out with a mangled address, which would
3306 	 * only be a transient condition.
3307 	 */
3308 	mutex_enter(&ncec->ncec_lock);
3309 	if (ncec->ncec_lladdr != NULL) {
3310 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3311 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3312 	} else {
3313 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3314 		    ill->ill_sap_length);
3315 	}
3316 	mutex_exit(&ncec->ncec_lock);
3317 	return (nce);
3318 }
3319 
3320 /*
3321  * we make nce_fp_mp to have an M_DATA prepend.
3322  * The caller ensures there is hold on ncec for this function.
3323  * Note that since ill_fastpath_probe() copies the mblk there is
3324  * no need to hold the nce or ncec beyond this function.
3325  *
3326  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3327  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3328  * and will be returned back by this function, so that no extra nce_refrele
3329  * is required for the caller. The calls from nce_add_common() use this
3330  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3331  * nce_refrele of the returned nce (when it is non-null).
3332  */
3333 static nce_t *
3334 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3335 {
3336 	nce_t *nce;
3337 	ill_t *ill = ncec->ncec_ill;
3338 
3339 	ASSERT(ill != NULL);
3340 
3341 	if (IS_IPMP(ill) && trigger_fp_req) {
3342 		trigger_fp_req = B_FALSE;
3343 		ipmp_ncec_refresh_nce(ncec);
3344 	}
3345 
3346 	/*
3347 	 * If the caller already has the nce corresponding to the ill, use
3348 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3349 	 * nce_add_common() fall in the former category, and have just done
3350 	 * the nce lookup/add that can be reused.
3351 	 */
3352 	if (ncec_nce == NULL)
3353 		nce = nce_fastpath_create(ill, ncec);
3354 	else
3355 		nce = ncec_nce;
3356 
3357 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3358 		return (nce);
3359 
3360 	if (trigger_fp_req)
3361 		nce_fastpath_trigger(nce);
3362 	return (nce);
3363 }
3364 
3365 /*
3366  * Trigger fastpath on nce. No locks may be held.
3367  */
3368 static void
3369 nce_fastpath_trigger(nce_t *nce)
3370 {
3371 	int res;
3372 	ill_t *ill = nce->nce_ill;
3373 	ncec_t *ncec = nce->nce_common;
3374 
3375 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3376 	/*
3377 	 * EAGAIN is an indication of a transient error
3378 	 * i.e. allocation failure etc. leave the ncec in the list it
3379 	 * will be updated when another probe happens for another ire
3380 	 * if not it will be taken out of the list when the ire is
3381 	 * deleted.
3382 	 */
3383 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3384 		nce_fastpath_list_delete(ill, ncec, NULL);
3385 }
3386 
3387 /*
3388  * Add ncec to the nce fastpath list on ill.
3389  */
3390 static nce_t *
3391 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3392 {
3393 	nce_t *nce = NULL;
3394 
3395 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3396 	/*
3397 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3398 	 * down, before adding the NCE.
3399 	 */
3400 	if (ill->ill_state_flags & ILL_CONDEMNED)
3401 		return (NULL);
3402 	mutex_enter(&ncec->ncec_lock);
3403 	/*
3404 	 * if ncec has not been deleted and
3405 	 * is not already in the list add it.
3406 	 */
3407 	if (!NCE_ISCONDEMNED(ncec)) {
3408 		nce = nce_lookup(ill, &ncec->ncec_addr);
3409 		if (nce != NULL)
3410 			goto done;
3411 		nce = nce_add(ill, ncec, graveyard);
3412 	}
3413 done:
3414 	mutex_exit(&ncec->ncec_lock);
3415 	return (nce);
3416 }
3417 
3418 static nce_t *
3419 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3420 {
3421 	nce_t *nce;
3422 	list_t graveyard;
3423 
3424 	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3425 	mutex_enter(&ill->ill_lock);
3426 	nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3427 	mutex_exit(&ill->ill_lock);
3428 	nce_graveyard_free(&graveyard);
3429 	return (nce);
3430 }
3431 
3432 
3433 /*
3434  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3435  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3436  * entry after all locks have been dropped.
3437  */
3438 void
3439 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3440 {
3441 	nce_t *nce;
3442 
3443 	ASSERT(ill != NULL);
3444 
3445 	/* delete any nces referencing the ncec from underlying ills */
3446 	if (IS_IPMP(ill))
3447 		ipmp_ncec_delete_nce(ncec);
3448 
3449 	/* now the ill itself */
3450 	mutex_enter(&ill->ill_lock);
3451 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3452 	    nce = list_next(&ill->ill_nce, nce)) {
3453 		if (nce->nce_common == ncec) {
3454 			nce_refhold(nce);
3455 			nce_delete(nce);
3456 			break;
3457 		}
3458 	}
3459 	mutex_exit(&ill->ill_lock);
3460 	if (nce != NULL) {
3461 		if (dead == NULL)
3462 			nce_refrele(nce);
3463 		else
3464 			list_insert_tail(dead, nce);
3465 	}
3466 }
3467 
3468 /*
3469  * when the fastpath response does not fit in the datab
3470  * associated with the existing nce_fp_mp, we delete and
3471  * add the nce to retrigger fastpath based on the information
3472  * in the ncec_t.
3473  */
3474 static nce_t *
3475 nce_delete_then_add(nce_t *nce)
3476 {
3477 	ill_t		*ill = nce->nce_ill;
3478 	nce_t		*newnce = NULL;
3479 	list_t		graveyard;
3480 
3481 	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3482 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3483 	    (void *)nce, ill->ill_name));
3484 	mutex_enter(&ill->ill_lock);
3485 	mutex_enter(&nce->nce_common->ncec_lock);
3486 	nce_delete(nce);
3487 	/*
3488 	 * Make sure that ncec is not condemned before adding. We hold the
3489 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3490 	 * ipmp_ncec_delete_nce()
3491 	 */
3492 	if (!NCE_ISCONDEMNED(nce->nce_common))
3493 		newnce = nce_add(ill, nce->nce_common, &graveyard);
3494 	mutex_exit(&nce->nce_common->ncec_lock);
3495 	mutex_exit(&ill->ill_lock);
3496 	nce_graveyard_free(&graveyard);
3497 	nce_refrele(nce);
3498 	return (newnce); /* could be null if nomem */
3499 }
3500 
3501 typedef struct nce_fp_match_s {
3502 	nce_t	*nce_fp_match_res;
3503 	mblk_t	*nce_fp_match_ack_mp;
3504 } nce_fp_match_t;
3505 
3506 /* ARGSUSED */
3507 static int
3508 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3509 {
3510 	nce_fp_match_t	*nce_fp_marg = arg;
3511 	ncec_t		*ncec = nce->nce_common;
3512 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3513 	uchar_t	*mp_rptr, *ud_mp_rptr;
3514 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3515 	ptrdiff_t	cmplen;
3516 
3517 	/*
3518 	 * mp is the mp associated with the fastpath ack.
3519 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3520 	 * under consideration. If the contents match, then the
3521 	 * fastpath ack is used to update the nce.
3522 	 */
3523 	if (ud_mp == NULL)
3524 		return (0);
3525 	mp_rptr = mp->b_rptr;
3526 	cmplen = mp->b_wptr - mp_rptr;
3527 	ASSERT(cmplen >= 0);
3528 
3529 	ud_mp_rptr = ud_mp->b_rptr;
3530 	/*
3531 	 * The ncec is locked here to prevent any other threads from accessing
3532 	 * and changing nce_dlur_mp when the address becomes resolved to an
3533 	 * lla while we're in the middle of looking at and comparing the
3534 	 * hardware address (lla). It is also locked to prevent multiple
3535 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3536 	 * time.
3537 	 */
3538 	mutex_enter(&ncec->ncec_lock);
3539 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3540 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3541 		nce_fp_marg->nce_fp_match_res = nce;
3542 		mutex_exit(&ncec->ncec_lock);
3543 		nce_refhold(nce);
3544 		return (1);
3545 	}
3546 	mutex_exit(&ncec->ncec_lock);
3547 	return (0);
3548 }
3549 
3550 /*
3551  * Update all NCE's that are not in fastpath mode and
3552  * have an nce_fp_mp that matches mp. mp->b_cont contains
3553  * the fastpath header.
3554  *
3555  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3556  */
3557 void
3558 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3559 {
3560 	nce_fp_match_t nce_fp_marg;
3561 	nce_t *nce;
3562 	mblk_t *nce_fp_mp, *fp_mp;
3563 
3564 	nce_fp_marg.nce_fp_match_res = NULL;
3565 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3566 
3567 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3568 
3569 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3570 		return;
3571 
3572 	mutex_enter(&nce->nce_lock);
3573 	nce_fp_mp = nce->nce_fp_mp;
3574 
3575 	if (nce_fp_mp != NULL) {
3576 		fp_mp = mp->b_cont;
3577 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3578 		    nce_fp_mp->b_datap->db_lim) {
3579 			mutex_exit(&nce->nce_lock);
3580 			nce = nce_delete_then_add(nce);
3581 			if (nce == NULL) {
3582 				return;
3583 			}
3584 			mutex_enter(&nce->nce_lock);
3585 			nce_fp_mp = nce->nce_fp_mp;
3586 		}
3587 	}
3588 
3589 	/* Matched - install mp as the fastpath mp */
3590 	if (nce_fp_mp == NULL) {
3591 		fp_mp = dupb(mp->b_cont);
3592 		nce->nce_fp_mp = fp_mp;
3593 	} else {
3594 		fp_mp = mp->b_cont;
3595 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3596 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3597 		    + MBLKL(fp_mp);
3598 	}
3599 	mutex_exit(&nce->nce_lock);
3600 	nce_refrele(nce);
3601 }
3602 
3603 /*
3604  * Return a pointer to a given option in the packet.
3605  * Assumes that option part of the packet have already been validated.
3606  */
3607 nd_opt_hdr_t *
3608 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3609 {
3610 	while (optlen > 0) {
3611 		if (opt->nd_opt_type == opt_type)
3612 			return (opt);
3613 		optlen -= 8 * opt->nd_opt_len;
3614 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3615 	}
3616 	return (NULL);
3617 }
3618 
3619 /*
3620  * Verify all option lengths present are > 0, also check to see
3621  * if the option lengths and packet length are consistent.
3622  */
3623 boolean_t
3624 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3625 {
3626 	ASSERT(opt != NULL);
3627 	while (optlen > 0) {
3628 		if (opt->nd_opt_len == 0)
3629 			return (B_FALSE);
3630 		optlen -= 8 * opt->nd_opt_len;
3631 		if (optlen < 0)
3632 			return (B_FALSE);
3633 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3634 	}
3635 	return (B_TRUE);
3636 }
3637 
3638 /*
3639  * ncec_walk function.
3640  * Free a fraction of the NCE cache entries.
3641  *
3642  * A possible optimization here would be to use ncec_last where possible, and
3643  * delete the least-frequently used entry, which would require more complex
3644  * computation as we walk through the ncec's (e.g., track ncec entries by
3645  * order of ncec_last and/or maintain state)
3646  */
3647 static void
3648 ncec_cache_reclaim(ncec_t *ncec, void *arg)
3649 {
3650 	ip_stack_t	*ipst = ncec->ncec_ipst;
3651 	uint_t		fraction = *(uint_t *)arg;
3652 	uint_t		rand;
3653 
3654 	if ((ncec->ncec_flags &
3655 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3656 		return;
3657 	}
3658 
3659 	rand = (uint_t)ddi_get_lbolt() +
3660 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3661 	if ((rand/fraction)*fraction == rand) {
3662 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3663 		ncec_delete(ncec);
3664 	}
3665 }
3666 
3667 /*
3668  * kmem_cache callback to free up memory.
3669  *
3670  * For now we just delete a fixed fraction.
3671  */
3672 static void
3673 ip_nce_reclaim_stack(ip_stack_t *ipst)
3674 {
3675 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3676 
3677 	IP_STAT(ipst, ip_nce_reclaim_calls);
3678 
3679 	ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);
3680 
3681 	/*
3682 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3683 	 * Get them to update any stale references to drop any refholds they
3684 	 * have.
3685 	 */
3686 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3687 }
3688 
3689 /*
3690  * Called by the memory allocator subsystem directly, when the system
3691  * is running low on memory.
3692  */
3693 /* ARGSUSED */
3694 void
3695 ip_nce_reclaim(void *args)
3696 {
3697 	netstack_handle_t nh;
3698 	netstack_t *ns;
3699 	ip_stack_t *ipst;
3700 
3701 	netstack_next_init(&nh);
3702 	while ((ns = netstack_next(&nh)) != NULL) {
3703 		/*
3704 		 * netstack_next() can return a netstack_t with a NULL
3705 		 * netstack_ip at boot time.
3706 		 */
3707 		if ((ipst = ns->netstack_ip) == NULL) {
3708 			netstack_rele(ns);
3709 			continue;
3710 		}
3711 		ip_nce_reclaim_stack(ipst);
3712 		netstack_rele(ns);
3713 	}
3714 	netstack_next_fini(&nh);
3715 }
3716 
3717 #ifdef DEBUG
3718 void
3719 ncec_trace_ref(ncec_t *ncec)
3720 {
3721 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3722 
3723 	if (ncec->ncec_trace_disable)
3724 		return;
3725 
3726 	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3727 		ncec->ncec_trace_disable = B_TRUE;
3728 		ncec_trace_cleanup(ncec);
3729 	}
3730 }
3731 
3732 void
3733 ncec_untrace_ref(ncec_t *ncec)
3734 {
3735 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3736 
3737 	if (!ncec->ncec_trace_disable)
3738 		th_trace_unref(ncec);
3739 }
3740 
3741 static void
3742 ncec_trace_cleanup(const ncec_t *ncec)
3743 {
3744 	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3745 }
3746 #endif
3747 
3748 /*
3749  * Called when address resolution fails due to a timeout.
3750  * Send an ICMP unreachable in response to all queued packets.
3751  */
3752 void
3753 arp_resolv_failed(ncec_t *ncec)
3754 {
3755 	mblk_t	*mp, *nxt_mp;
3756 	char	buf[INET6_ADDRSTRLEN];
3757 	struct in_addr ipv4addr;
3758 	ill_t *ill = ncec->ncec_ill;
3759 	ip_stack_t *ipst = ncec->ncec_ipst;
3760 	ip_recv_attr_t	iras;
3761 
3762 	bzero(&iras, sizeof (iras));
3763 	iras.ira_flags = IRAF_IS_IPV4;
3764 	/*
3765 	 * we are setting the ira_rill to the ipmp_ill (instead of
3766 	 * the actual ill on which the packet was received), but this
3767 	 * is ok because we don't actually need the real ira_rill.
3768 	 * to send the icmp unreachable to the sender.
3769 	 */
3770 	iras.ira_ill = iras.ira_rill = ill;
3771 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3772 	iras.ira_rifindex = iras.ira_ruifindex;
3773 
3774 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3775 	ip3dbg(("arp_resolv_failed: dst %s\n",
3776 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3777 	mutex_enter(&ncec->ncec_lock);
3778 	mp = ncec->ncec_qd_mp;
3779 	ncec->ncec_qd_mp = NULL;
3780 	ncec->ncec_nprobes = 0;
3781 	mutex_exit(&ncec->ncec_lock);
3782 	while (mp != NULL) {
3783 		nxt_mp = mp->b_next;
3784 		mp->b_next = NULL;
3785 
3786 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3787 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3788 		    mp, ill);
3789 		if (ipst->ips_ip_arp_icmp_error) {
3790 			ip3dbg(("arp_resolv_failed: "
3791 			    "Calling icmp_unreachable\n"));
3792 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3793 		} else {
3794 			freemsg(mp);
3795 		}
3796 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3797 		mp = nxt_mp;
3798 	}
3799 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3800 }
3801 
3802 /*
3803  * if ill is an under_ill, translate it to the ipmp_ill and add the
3804  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3805  * one on the underlying in_ill) will be created for the
3806  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3807  */
3808 int
3809 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3810     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3811 {
3812 	int	err;
3813 	in6_addr_t addr6;
3814 	ip_stack_t *ipst = ill->ill_ipst;
3815 	nce_t	*nce, *upper_nce = NULL;
3816 	ill_t	*in_ill = ill, *under = NULL;
3817 	boolean_t need_ill_refrele = B_FALSE;
3818 
3819 	if (flags & NCE_F_MCAST) {
3820 		/*
3821 		 * hw_addr will be figured out in nce_set_multicast_v4;
3822 		 * caller needs to pass in the cast_ill for ipmp
3823 		 */
3824 		ASSERT(hw_addr == NULL);
3825 		ASSERT(!IS_IPMP(ill));
3826 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3827 		return (err);
3828 	}
3829 
3830 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3831 		ill = ipmp_ill_hold_ipmp_ill(ill);
3832 		if (ill == NULL)
3833 			return (ENXIO);
3834 		need_ill_refrele = B_TRUE;
3835 	}
3836 	if ((flags & NCE_F_BCAST) != 0) {
3837 		/*
3838 		 * IPv4 broadcast ncec: compute the hwaddr.
3839 		 */
3840 		if (IS_IPMP(ill)) {
3841 			under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3842 			if (under == NULL)  {
3843 				if (need_ill_refrele)
3844 					ill_refrele(ill);
3845 				return (ENETDOWN);
3846 			}
3847 			hw_addr = under->ill_bcast_mp->b_rptr +
3848 			    NCE_LL_ADDR_OFFSET(under);
3849 			hw_addr_len = under->ill_phys_addr_length;
3850 		} else {
3851 			hw_addr = ill->ill_bcast_mp->b_rptr +
3852 			    NCE_LL_ADDR_OFFSET(ill),
3853 			    hw_addr_len = ill->ill_phys_addr_length;
3854 		}
3855 	}
3856 
3857 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3858 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3859 	nce = nce_lookup_addr(ill, &addr6);
3860 	if (nce == NULL) {
3861 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3862 		    state, &nce);
3863 	} else {
3864 		err = EEXIST;
3865 	}
3866 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3867 	if (err == 0)
3868 		err = nce_add_v4_postprocess(nce);
3869 
3870 	if (in_ill != ill && nce != NULL) {
3871 		nce_t *under_nce = NULL;
3872 
3873 		/*
3874 		 * in_ill was the under_ill. Try to create the under_nce.
3875 		 * Hold the ill_g_lock to prevent changes to group membership
3876 		 * until we are done.
3877 		 */
3878 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3879 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3880 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3881 			    ill_t *, ill);
3882 			rw_exit(&ipst->ips_ill_g_lock);
3883 			err = ENXIO;
3884 			nce_refrele(nce);
3885 			nce = NULL;
3886 			goto bail;
3887 		}
3888 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3889 		if (under_nce == NULL) {
3890 			rw_exit(&ipst->ips_ill_g_lock);
3891 			err = EINVAL;
3892 			nce_refrele(nce);
3893 			nce = NULL;
3894 			goto bail;
3895 		}
3896 		rw_exit(&ipst->ips_ill_g_lock);
3897 		upper_nce = nce;
3898 		nce = under_nce; /* will be returned to caller */
3899 		if (NCE_ISREACHABLE(nce->nce_common))
3900 			nce_fastpath_trigger(under_nce);
3901 	}
3902 	if (nce != NULL) {
3903 		if (newnce != NULL)
3904 			*newnce = nce;
3905 		else
3906 			nce_refrele(nce);
3907 	}
3908 bail:
3909 	if (under != NULL)
3910 		ill_refrele(under);
3911 	if (upper_nce != NULL)
3912 		nce_refrele(upper_nce);
3913 	if (need_ill_refrele)
3914 		ill_refrele(ill);
3915 
3916 	return (err);
3917 }
3918 
3919 /*
3920  * NDP Cache Entry creation routine for IPv4.
3921  * This routine must always be called with ndp4->ndp_g_lock held.
3922  * Prior to return, ncec_refcnt is incremented.
3923  *
3924  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3925  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3926  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3927  * entries will be created, both pointing at the same ncec_t. The nce_t
3928  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3929  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3930  * Local addresses are always created on the ill passed to nce_add_v4.
3931  */
3932 int
3933 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3934     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3935 {
3936 	int		err;
3937 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3938 	struct in6_addr	addr6;
3939 	nce_t		*nce;
3940 
3941 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3942 	ASSERT(!ill->ill_isv6);
3943 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3944 
3945 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3946 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3947 	    &nce);
3948 	ASSERT(newnce != NULL);
3949 	*newnce = nce;
3950 	return (err);
3951 }
3952 
3953 /*
3954  * Post-processing routine to be executed after nce_add_v4(). This function
3955  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3956  * and must be called without any locks held.
3957  *
3958  * Always returns 0, but we return an int to keep this symmetric with the
3959  * IPv6 counter-part.
3960  */
3961 int
3962 nce_add_v4_postprocess(nce_t *nce)
3963 {
3964 	ncec_t		*ncec = nce->nce_common;
3965 	uint16_t	flags = ncec->ncec_flags;
3966 	boolean_t	ndp_need_dad = B_FALSE;
3967 	boolean_t	dropped;
3968 	clock_t		delay;
3969 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3970 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3971 	boolean_t	trigger_fastpath = B_TRUE;
3972 
3973 	/*
3974 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3975 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3976 	 * We call nce_fastpath from nce_update if the link layer address of
3977 	 * the peer changes from nce_update
3978 	 */
3979 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3980 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3981 		trigger_fastpath = B_FALSE;
3982 
3983 	if (trigger_fastpath)
3984 		nce_fastpath_trigger(nce);
3985 
3986 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3987 		/*
3988 		 * Either the caller (by passing in ND_PROBE)
3989 		 * or nce_add_common() (by the internally computed state
3990 		 * based on ncec_addr and ill_net_type) has determined
3991 		 * that this unicast entry needs DAD. Trigger DAD.
3992 		 */
3993 		ndp_need_dad = B_TRUE;
3994 	} else if (flags & NCE_F_UNSOL_ADV) {
3995 		/*
3996 		 * We account for the transmit below by assigning one
3997 		 * less than the ndd variable. Subsequent decrements
3998 		 * are done in nce_timer.
3999 		 */
4000 		mutex_enter(&ncec->ncec_lock);
4001 		ncec->ncec_unsolicit_count =
4002 		    ipst->ips_ip_arp_publish_count - 1;
4003 		mutex_exit(&ncec->ncec_lock);
4004 		dropped = arp_announce(ncec);
4005 		mutex_enter(&ncec->ncec_lock);
4006 		if (dropped)
4007 			ncec->ncec_unsolicit_count++;
4008 		else
4009 			ncec->ncec_last_time_defended = ddi_get_lbolt();
4010 		if (ncec->ncec_unsolicit_count != 0) {
4011 			nce_start_timer(ncec,
4012 			    ipst->ips_ip_arp_publish_interval);
4013 		}
4014 		mutex_exit(&ncec->ncec_lock);
4015 	}
4016 
4017 	/*
4018 	 * If ncec_xmit_interval is 0, user has configured us to send the first
4019 	 * probe right away.  Do so, and set up for the subsequent probes.
4020 	 */
4021 	if (ndp_need_dad) {
4022 		mutex_enter(&ncec->ncec_lock);
4023 		if (ncec->ncec_pcnt == 0) {
4024 			/*
4025 			 * DAD probes and announce can be
4026 			 * administratively disabled by setting the
4027 			 * probe_count to zero. Restart the timer in
4028 			 * this case to mark the ipif as ready.
4029 			 */
4030 			ncec->ncec_unsolicit_count = 0;
4031 			mutex_exit(&ncec->ncec_lock);
4032 			nce_restart_timer(ncec, 0);
4033 		} else {
4034 			mutex_exit(&ncec->ncec_lock);
4035 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
4036 			    ipst->ips_arp_probe_delay :
4037 			    ipst->ips_arp_fastprobe_delay);
4038 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
4039 		}
4040 	}
4041 	return (0);
4042 }
4043 
4044 /*
4045  * ncec_walk routine to update all entries that have a given destination or
4046  * gateway address and cached link layer (MAC) address.  This is used when ARP
4047  * informs us that a network-to-link-layer mapping may have changed.
4048  */
4049 void
4050 nce_update_hw_changed(ncec_t *ncec, void *arg)
4051 {
4052 	nce_hw_map_t *hwm = arg;
4053 	ipaddr_t ncec_addr;
4054 
4055 	if (ncec->ncec_state != ND_REACHABLE)
4056 		return;
4057 
4058 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
4059 	if (ncec_addr != hwm->hwm_addr)
4060 		return;
4061 
4062 	mutex_enter(&ncec->ncec_lock);
4063 	if (hwm->hwm_flags != 0)
4064 		ncec->ncec_flags = hwm->hwm_flags;
4065 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
4066 	mutex_exit(&ncec->ncec_lock);
4067 }
4068 
4069 void
4070 ncec_refhold(ncec_t *ncec)
4071 {
4072 	mutex_enter(&(ncec)->ncec_lock);
4073 	(ncec)->ncec_refcnt++;
4074 	ASSERT((ncec)->ncec_refcnt != 0);
4075 #ifdef DEBUG
4076 	ncec_trace_ref(ncec);
4077 #endif
4078 	mutex_exit(&(ncec)->ncec_lock);
4079 }
4080 
4081 void
4082 ncec_refhold_notr(ncec_t *ncec)
4083 {
4084 	mutex_enter(&(ncec)->ncec_lock);
4085 	(ncec)->ncec_refcnt++;
4086 	ASSERT((ncec)->ncec_refcnt != 0);
4087 	mutex_exit(&(ncec)->ncec_lock);
4088 }
4089 
4090 static void
4091 ncec_refhold_locked(ncec_t *ncec)
4092 {
4093 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
4094 	(ncec)->ncec_refcnt++;
4095 #ifdef DEBUG
4096 	ncec_trace_ref(ncec);
4097 #endif
4098 }
4099 
4100 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
4101 void
4102 ncec_refrele(ncec_t *ncec)
4103 {
4104 	mutex_enter(&(ncec)->ncec_lock);
4105 #ifdef DEBUG
4106 	ncec_untrace_ref(ncec);
4107 #endif
4108 	ASSERT((ncec)->ncec_refcnt != 0);
4109 	if (--(ncec)->ncec_refcnt == 0) {
4110 		ncec_inactive(ncec);
4111 	} else {
4112 		mutex_exit(&(ncec)->ncec_lock);
4113 	}
4114 }
4115 
4116 void
4117 ncec_refrele_notr(ncec_t *ncec)
4118 {
4119 	mutex_enter(&(ncec)->ncec_lock);
4120 	ASSERT((ncec)->ncec_refcnt != 0);
4121 	if (--(ncec)->ncec_refcnt == 0) {
4122 		ncec_inactive(ncec);
4123 	} else {
4124 		mutex_exit(&(ncec)->ncec_lock);
4125 	}
4126 }
4127 
4128 /*
4129  * Common to IPv4 and IPv6.
4130  */
4131 void
4132 nce_restart_timer(ncec_t *ncec, uint_t ms)
4133 {
4134 	timeout_id_t tid;
4135 
4136 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
4137 
4138 	/* First cancel any running timer */
4139 	mutex_enter(&ncec->ncec_lock);
4140 	tid = ncec->ncec_timeout_id;
4141 	ncec->ncec_timeout_id = 0;
4142 	if (tid != 0) {
4143 		mutex_exit(&ncec->ncec_lock);
4144 		(void) untimeout(tid);
4145 		mutex_enter(&ncec->ncec_lock);
4146 	}
4147 
4148 	/* Restart timer */
4149 	nce_start_timer(ncec, ms);
4150 	mutex_exit(&ncec->ncec_lock);
4151 }
4152 
4153 static void
4154 nce_start_timer(ncec_t *ncec, uint_t ms)
4155 {
4156 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4157 	/*
4158 	 * Don't start the timer if the ncec has been deleted, or if the timer
4159 	 * is already running
4160 	 */
4161 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
4162 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
4163 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
4164 	}
4165 }
4166 
4167 int
4168 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
4169     uint16_t flags, nce_t **newnce)
4170 {
4171 	uchar_t		*hw_addr;
4172 	int		err = 0;
4173 	ip_stack_t	*ipst = ill->ill_ipst;
4174 	in6_addr_t	dst6;
4175 	nce_t		*nce;
4176 
4177 	ASSERT(!ill->ill_isv6);
4178 
4179 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
4180 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
4181 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
4182 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4183 		goto done;
4184 	}
4185 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
4186 		/*
4187 		 * For IRE_IF_RESOLVER a hardware mapping can be
4188 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
4189 		 * in the ill is copied in nce_add_v4().
4190 		 */
4191 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
4192 		if (hw_addr == NULL) {
4193 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4194 			return (ENOMEM);
4195 		}
4196 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
4197 	} else {
4198 		/*
4199 		 * IRE_IF_NORESOLVER type simply copies the resolution
4200 		 * cookie passed in.  So no hw_addr is needed.
4201 		 */
4202 		hw_addr = NULL;
4203 	}
4204 	ASSERT(flags & NCE_F_MCAST);
4205 	ASSERT(flags & NCE_F_NONUD);
4206 	/* nce_state will be computed by nce_add_common() */
4207 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
4208 	    ND_UNCHANGED, &nce);
4209 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
4210 	if (err == 0)
4211 		err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
4212 	if (hw_addr != NULL)
4213 		kmem_free(hw_addr, ill->ill_phys_addr_length);
4214 	if (err != 0) {
4215 		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
4216 		return (err);
4217 	}
4218 done:
4219 	if (newnce != NULL)
4220 		*newnce = nce;
4221 	else
4222 		nce_refrele(nce);
4223 	return (0);
4224 }
4225 
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
/* Maximum number of ncecs gathered in one batch. */
#define	NCE_RESCHED_LIST_LEN	8

/*
 * Batch state shared between nce_ill_reschedule() and its walker callback
 * ncec_reschedule().  Each ncec stored in ncert_nces[] has had a reference
 * taken on it by ncec_reschedule().
 */
typedef struct {
	ill_t	*ncert_ill;	/* ill whose ncecs are being scanned */
	uint_t	ncert_num;	/* number of filled slots in ncert_nces[] */
	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];	/* collected ncecs */
} nce_resched_t;
4238 
4239 /*
4240  * Pick the longest waiting NCEs for defense.
4241  */
4242 /* ARGSUSED */
4243 static int
4244 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4245 {
4246 	nce_resched_t *ncert = arg;
4247 	ncec_t **ncecs;
4248 	ncec_t **ncec_max;
4249 	ncec_t *ncec_temp;
4250 	ncec_t *ncec = nce->nce_common;
4251 
4252 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4253 	/*
4254 	 * Only reachable entries that are ready for announcement are eligible.
4255 	 */
4256 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4257 		return (0);
4258 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4259 		ncec_refhold(ncec);
4260 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
4261 	} else {
4262 		ncecs = ncert->ncert_nces;
4263 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4264 		ncec_refhold(ncec);
4265 		for (; ncecs < ncec_max; ncecs++) {
4266 			ASSERT(ncec != NULL);
4267 			if ((*ncecs)->ncec_last_time_defended >
4268 			    ncec->ncec_last_time_defended) {
4269 				ncec_temp = *ncecs;
4270 				*ncecs = ncec;
4271 				ncec = ncec_temp;
4272 			}
4273 		}
4274 		ncec_refrele(ncec);
4275 	}
4276 	return (0);
4277 }
4278 
4279 /*
4280  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4281  * doesn't happen very often (if at all), and thus it needn't be highly
4282  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4283  * outer loop is bounded by a constant rather than by the length of the list.)
4284  */
4285 static void
4286 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4287 {
4288 	ncec_t		*ncec;
4289 	ip_stack_t	*ipst = ill->ill_ipst;
4290 	uint_t		i, defend_rate;
4291 
4292 	i = ill->ill_defend_count;
4293 	ill->ill_defend_count = 0;
4294 	if (ill->ill_isv6)
4295 		defend_rate = ipst->ips_ndp_defend_rate;
4296 	else
4297 		defend_rate = ipst->ips_arp_defend_rate;
4298 	/* If none could be sitting around, then don't reschedule */
4299 	if (i < defend_rate) {
4300 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4301 		return;
4302 	}
4303 	ncert->ncert_ill = ill;
4304 	while (ill->ill_defend_count < defend_rate) {
4305 		nce_walk_common(ill, ncec_reschedule, ncert);
4306 		for (i = 0; i < ncert->ncert_num; i++) {
4307 
4308 			ncec = ncert->ncert_nces[i];
4309 			mutex_enter(&ncec->ncec_lock);
4310 			ncec->ncec_flags |= NCE_F_DELAYED;
4311 			mutex_exit(&ncec->ncec_lock);
4312 			/*
4313 			 * we plan to schedule this ncec, so incr the
4314 			 * defend_count in anticipation.
4315 			 */
4316 			if (++ill->ill_defend_count >= defend_rate)
4317 				break;
4318 		}
4319 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4320 			break;
4321 	}
4322 }
4323 
4324 /*
4325  * Check if the current rate-limiting parameters permit the sending
4326  * of another address defense announcement for both IPv4 and IPv6.
4327  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4328  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4329  * determines how many address defense announcements are permitted
4330  * in any `defense_perio' interval.
4331  */
4332 static boolean_t
4333 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4334 {
4335 	clock_t		now = ddi_get_lbolt();
4336 	ip_stack_t	*ipst = ill->ill_ipst;
4337 	clock_t		start = ill->ill_defend_start;
4338 	uint32_t	elapsed, defend_period, defend_rate;
4339 	nce_resched_t	ncert;
4340 	boolean_t	ret;
4341 	int		i;
4342 
4343 	if (ill->ill_isv6) {
4344 		defend_period = ipst->ips_ndp_defend_period;
4345 		defend_rate = ipst->ips_ndp_defend_rate;
4346 	} else {
4347 		defend_period = ipst->ips_arp_defend_period;
4348 		defend_rate = ipst->ips_arp_defend_rate;
4349 	}
4350 	if (defend_rate == 0)
4351 		return (B_TRUE);
4352 	bzero(&ncert, sizeof (ncert));
4353 	mutex_enter(&ill->ill_lock);
4354 	if (start > 0) {
4355 		elapsed = now - start;
4356 		if (elapsed > SEC_TO_TICK(defend_period)) {
4357 			ill->ill_defend_start = now;
4358 			/*
4359 			 * nce_ill_reschedule will attempt to
4360 			 * prevent starvation by reschduling the
4361 			 * oldest entries, which are marked with
4362 			 * the NCE_F_DELAYED flag.
4363 			 */
4364 			nce_ill_reschedule(ill, &ncert);
4365 		}
4366 	} else {
4367 		ill->ill_defend_start = now;
4368 	}
4369 	ASSERT(ill->ill_defend_count <= defend_rate);
4370 	mutex_enter(&ncec->ncec_lock);
4371 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4372 		/*
4373 		 * This ncec was rescheduled as one of the really old
4374 		 * entries needing on-going defense. The
4375 		 * ill_defend_count was already incremented in
4376 		 * nce_ill_reschedule. Go ahead and send the announce.
4377 		 */
4378 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4379 		mutex_exit(&ncec->ncec_lock);
4380 		ret = B_FALSE;
4381 		goto done;
4382 	}
4383 	mutex_exit(&ncec->ncec_lock);
4384 	if (ill->ill_defend_count < defend_rate)
4385 		ill->ill_defend_count++;
4386 	if (ill->ill_defend_count == defend_rate) {
4387 		/*
4388 		 * we are no longer allowed to send unbidden defense
4389 		 * messages. Wait for rescheduling.
4390 		 */
4391 		ret = B_TRUE;
4392 	} else {
4393 		ret = B_FALSE;
4394 	}
4395 done:
4396 	mutex_exit(&ill->ill_lock);
4397 	/*
4398 	 * After all the locks have been dropped we can restart nce timer,
4399 	 * and refrele the delayed ncecs
4400 	 */
4401 	for (i = 0; i < ncert.ncert_num; i++) {
4402 		clock_t	xmit_interval;
4403 		ncec_t	*tmp;
4404 
4405 		tmp = ncert.ncert_nces[i];
4406 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4407 		    B_FALSE);
4408 		nce_restart_timer(tmp, xmit_interval);
4409 		ncec_refrele(tmp);
4410 	}
4411 	return (ret);
4412 }
4413 
4414 boolean_t
4415 ndp_announce(ncec_t *ncec)
4416 {
4417 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4418 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4419 	    nce_advert_flags(ncec)));
4420 }
4421 
/*
 * Choose the source address, and the ill to send from, for a solicitation,
 * probe or announcement concerning `ncec'.
 *
 * `src' must point at an unspecified (all-zeros) address on entry.  On a
 * normal return it is overwritten with the chosen source address (IPv4
 * addresses are in IPv4-mapped form); it is left untouched on the early
 * NULL returns below.
 *
 * Returns the ill to transmit on with a reference held for the caller
 * (taken via ill_refhold(), or obtained from ipmp_ipif_hold_bound_ill()
 * for IPMP), or NULL when:
 *  - the address taken from the queued packet belongs to an ipif that has
 *    not finished DAD, and we are not being invoked for DAD ourselves;
 *  - no source ipif could be selected; or
 *  - the ncec is one of our own addresses but no matching ipif exists
 *    (in this case *src is set to all zeros).
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t		*mp;
	in6_addr_t	src6;
	ipaddr_t	src4;
	ill_t		*ill = ncec->ncec_ill;
	ill_t		*src_ill = NULL;
	ipif_t		*ipif = NULL;
	boolean_t	is_myaddr = NCE_MYADDR(ncec);
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* For our own address, the ncec address is the candidate. */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t  *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6]  below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface.  We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/*
		 * Nothing usable on the IPMP ill itself: retry the selection
		 * on the ill returned by ipmp_ill_hold_xmit_ill(), if any.
		 */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					/* src4 is reused here as the dst */
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/*
		 * Convert the ipif reference into an ill reference for the
		 * caller: the bound ill for IPMP, the ipif's own ill
		 * otherwise.
		 */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}
4564 
4565 void
4566 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4567     uchar_t *hwaddr, int hwaddr_len, int flags)
4568 {
4569 	ill_t	*ill;
4570 	ncec_t	*ncec;
4571 	nce_t	*nce;
4572 	uint16_t new_state;
4573 
4574 	ill = (ipif ? ipif->ipif_ill : NULL);
4575 	if (ill != NULL) {
4576 		/*
4577 		 * only one ncec is possible
4578 		 */
4579 		nce = nce_lookup_v4(ill, addr);
4580 		if (nce != NULL) {
4581 			ncec = nce->nce_common;
4582 			mutex_enter(&ncec->ncec_lock);
4583 			if (NCE_ISREACHABLE(ncec))
4584 				new_state = ND_UNCHANGED;
4585 			else
4586 				new_state = ND_STALE;
4587 			ncec->ncec_flags = flags;
4588 			nce_update(ncec, new_state, hwaddr);
4589 			mutex_exit(&ncec->ncec_lock);
4590 			nce_refrele(nce);
4591 			return;
4592 		}
4593 	} else {
4594 		/*
4595 		 * ill is wildcard; clean up all ncec's and ire's
4596 		 * that match on addr.
4597 		 */
4598 		nce_hw_map_t hwm;
4599 
4600 		hwm.hwm_addr = *addr;
4601 		hwm.hwm_hwlen = hwaddr_len;
4602 		hwm.hwm_hwaddr = hwaddr;
4603 		hwm.hwm_flags = flags;
4604 
4605 		ncec_walk_common(ipst->ips_ndp4, NULL,
4606 		    nce_update_hw_changed, &hwm, B_TRUE);
4607 	}
4608 }
4609 
4610 /*
4611  * Common function to add ncec entries.
4612  * we always add the ncec with ncec_ill == ill, and always create
4613  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4614  * ncec is !reachable.
4615  *
4616  * When the caller passes in an nce_state of ND_UNCHANGED,
4617  * nce_add_common() will determine the state of the created nce based
4618  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4619  * be created with state set to the passed in nce_state.
4620  */
4621 static int
4622 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4623     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4624 {
4625 	static	ncec_t		nce_nil;
4626 	uchar_t			*template = NULL;
4627 	int			err;
4628 	ncec_t			*ncec;
4629 	ncec_t			**ncep;
4630 	ip_stack_t		*ipst = ill->ill_ipst;
4631 	uint16_t		state;
4632 	boolean_t		fastprobe = B_FALSE;
4633 	struct ndp_g_s		*ndp;
4634 	nce_t			*nce = NULL;
4635 	list_t			graveyard;
4636 	mblk_t			*dlur_mp = NULL;
4637 
4638 	if (ill->ill_isv6)
4639 		ndp = ill->ill_ipst->ips_ndp6;
4640 	else
4641 		ndp = ill->ill_ipst->ips_ndp4;
4642 
4643 	*retnce = NULL;
4644 
4645 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4646 
4647 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4648 		ip0dbg(("nce_add_common: no addr\n"));
4649 		return (EINVAL);
4650 	}
4651 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4652 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4653 		return (EINVAL);
4654 	}
4655 
4656 	if (ill->ill_isv6) {
4657 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4658 	} else {
4659 		ipaddr_t v4addr;
4660 
4661 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4662 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4663 	}
4664 
4665 	/*
4666 	 * The caller has ensured that there is no nce on ill, but there could
4667 	 * still be an nce_common_t for the address, so that we find exisiting
4668 	 * ncec_t strucutures first, and atomically add a new nce_t if
4669 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4670 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4671 	 * compare for matches across the illgrp because this function is
4672 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4673 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4674 	 * appropriate.
4675 	 */
4676 	ncec = *ncep;
4677 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4678 		if (ncec->ncec_ill == ill) {
4679 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4680 				/*
4681 				 * We should never find *retnce to be
4682 				 * MYADDR, since the caller may then
4683 				 * incorrectly restart a DAD timer that's
4684 				 * already running.  However, if we are in
4685 				 * forwarding mode, and the interface is
4686 				 * moving in/out of groups, the data
4687 				 * path ire lookup (e.g., ire_revalidate_nce)
4688 				 * may  have determined that some destination
4689 				 * is offlink while the control path is adding
4690 				 * that address as a local address.
4691 				 * Recover from  this case by failing the
4692 				 * lookup
4693 				 */
4694 				if (NCE_MYADDR(ncec))
4695 					return (ENXIO);
4696 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4697 				if (*retnce != NULL)
4698 					break;
4699 			}
4700 		}
4701 	}
4702 	if (*retnce != NULL) /* caller must trigger fastpath on nce */
4703 		return (0);
4704 
4705 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4706 	if (ncec == NULL)
4707 		return (ENOMEM);
4708 	*ncec = nce_nil;
4709 	ncec->ncec_ill = ill;
4710 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4711 	ncec->ncec_flags = flags;
4712 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4713 
4714 	if (!ill->ill_isv6) {
4715 		ipaddr_t addr4;
4716 
4717 		/*
4718 		 * DAD probe interval and probe count are set based on
4719 		 * fast/slow probe settings. If the underlying link doesn't
4720 		 * have reliably up/down notifications or if we're working
4721 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
4722 		 * don't use the fast timers.  Otherwise, use them.
4723 		 */
4724 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4725 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4726 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4727 			fastprobe = B_TRUE;
4728 		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4729 		    !IS_IPV4_LL_SPACE(&addr4)) {
4730 			ill_t *hwaddr_ill;
4731 
4732 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4733 			    hw_addr_len);
4734 			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4735 				fastprobe = B_TRUE;
4736 		}
4737 		if (fastprobe) {
4738 			ncec->ncec_xmit_interval =
4739 			    ipst->ips_arp_fastprobe_interval;
4740 			ncec->ncec_pcnt =
4741 			    ipst->ips_arp_fastprobe_count;
4742 			ncec->ncec_flags |= NCE_F_FAST;
4743 		} else {
4744 			ncec->ncec_xmit_interval =
4745 			    ipst->ips_arp_probe_interval;
4746 			ncec->ncec_pcnt =
4747 			    ipst->ips_arp_probe_count;
4748 		}
4749 		if (NCE_PUBLISH(ncec)) {
4750 			ncec->ncec_unsolicit_count =
4751 			    ipst->ips_ip_arp_publish_count;
4752 		}
4753 	} else {
4754 		/*
4755 		 * probe interval is constant: ILL_PROBE_INTERVAL
4756 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4757 		 */
4758 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4759 		if (NCE_PUBLISH(ncec)) {
4760 			ncec->ncec_unsolicit_count =
4761 			    ipst->ips_ip_ndp_unsolicit_count;
4762 		}
4763 	}
4764 	ncec->ncec_rcnt = ill->ill_xmit_count;
4765 	ncec->ncec_addr = *addr;
4766 	ncec->ncec_qd_mp = NULL;
4767 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4768 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4769 	ncec->ncec_trace_disable = B_FALSE;
4770 
4771 	/*
4772 	 * ncec_lladdr holds link layer address
4773 	 */
4774 	if (hw_addr_len > 0) {
4775 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4776 		if (template == NULL) {
4777 			err = ENOMEM;
4778 			goto err_ret;
4779 		}
4780 		ncec->ncec_lladdr = template;
4781 		ncec->ncec_lladdr_length = hw_addr_len;
4782 		bzero(ncec->ncec_lladdr, hw_addr_len);
4783 	}
4784 	if ((flags & NCE_F_BCAST) != 0) {
4785 		state = ND_REACHABLE;
4786 		ASSERT(hw_addr_len > 0);
4787 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4788 		state = ND_INITIAL;
4789 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4790 		/*
4791 		 * NORESOLVER entries are always created in the REACHABLE
4792 		 * state.
4793 		 */
4794 		state = ND_REACHABLE;
4795 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4796 		    ill->ill_mactype != DL_IPV4 &&
4797 		    ill->ill_mactype != DL_6TO4) {
4798 			/*
4799 			 * We create a nce_res_mp with the IP nexthop address
4800 			 * as the destination address if the physical length
4801 			 * is exactly 4 bytes for point-to-multipoint links
4802 			 * that do their own resolution from IP to link-layer
4803 			 * address (e.g. IP over X.25).
4804 			 */
4805 			bcopy((uchar_t *)addr,
4806 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4807 		}
4808 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4809 		    ill->ill_mactype != DL_IPV6) {
4810 			/*
4811 			 * We create a nce_res_mp with the IP nexthop address
4812 			 * as the destination address if the physical legnth
4813 			 * is exactly 16 bytes for point-to-multipoint links
4814 			 * that do their own resolution from IP to link-layer
4815 			 * address.
4816 			 */
4817 			bcopy((uchar_t *)addr,
4818 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4819 		}
4820 		/*
4821 		 * Since NUD is not part of the base IPv4 protocol definition,
4822 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4823 		 * age, and are marked NCE_F_NONUD.
4824 		 */
4825 		if (!ill->ill_isv6)
4826 			ncec->ncec_flags |= NCE_F_NONUD;
4827 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4828 		state = ND_REACHABLE;
4829 	}
4830 
4831 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4832 		/*
4833 		 * We are adding an ncec with a deterministic hw_addr,
4834 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4835 		 *
4836 		 * if we are adding a unicast ncec for the local address
4837 		 * it would be REACHABLE; we would be adding a ND_STALE entry
4838 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4839 		 * addresses are added in PROBE to trigger DAD.
4840 		 */
4841 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4842 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4843 			state = ND_REACHABLE;
4844 		else if (!NCE_PUBLISH(ncec))
4845 			state = ND_STALE;
4846 		else
4847 			state = ND_PROBE;
4848 		if (hw_addr != NULL)
4849 			nce_set_ll(ncec, hw_addr);
4850 	}
4851 	/* caller overrides internally computed state */
4852 	if (nce_state != ND_UNCHANGED)
4853 		state = nce_state;
4854 
4855 	if (state == ND_PROBE)
4856 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4857 
4858 	ncec->ncec_state = state;
4859 
4860 	if (state == ND_REACHABLE) {
4861 		ncec->ncec_last = ncec->ncec_init_time =
4862 		    TICK_TO_MSEC(ddi_get_lbolt64());
4863 	} else {
4864 		ncec->ncec_last = 0;
4865 		if (state == ND_INITIAL)
4866 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4867 	}
4868 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4869 	    offsetof(ncec_cb_t, ncec_cb_node));
4870 	/*
4871 	 * have all the memory allocations out of the way before taking locks
4872 	 * and adding the nce.
4873 	 */
4874 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4875 	if (nce == NULL) {
4876 		err = ENOMEM;
4877 		goto err_ret;
4878 	}
4879 	if (ncec->ncec_lladdr != NULL ||
4880 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4881 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4882 		    ill->ill_phys_addr_length, ill->ill_sap,
4883 		    ill->ill_sap_length);
4884 		if (dlur_mp == NULL) {
4885 			err = ENOMEM;
4886 			goto err_ret;
4887 		}
4888 	}
4889 
4890 	/*
4891 	 * Atomically ensure that the ill is not CONDEMNED, before
4892 	 * adding the NCE.
4893 	 */
4894 	mutex_enter(&ill->ill_lock);
4895 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4896 		mutex_exit(&ill->ill_lock);
4897 		err = EINVAL;
4898 		goto err_ret;
4899 	}
4900 	if (!NCE_MYADDR(ncec) &&
4901 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4902 		mutex_exit(&ill->ill_lock);
4903 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4904 		err = EINVAL;
4905 		goto err_ret;
4906 	}
4907 	/*
4908 	 * Acquire the ncec_lock even before adding the ncec to the list
4909 	 * so that it cannot get deleted after the ncec is added, but
4910 	 * before we add the nce.
4911 	 */
4912 	mutex_enter(&ncec->ncec_lock);
4913 	if ((ncec->ncec_next = *ncep) != NULL)
4914 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4915 	*ncep = ncec;
4916 	ncec->ncec_ptpn = ncep;
4917 
4918 	/* Bump up the number of ncec's referencing this ill */
4919 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4920 	    (char *), "ncec", (void *), ncec);
4921 	ill->ill_ncec_cnt++;
4922 	/*
4923 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4924 	 * condemned, and we can safely add the nce.
4925 	 */
4926 	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
4927 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
4928 	mutex_exit(&ncec->ncec_lock);
4929 	mutex_exit(&ill->ill_lock);
4930 	nce_graveyard_free(&graveyard);
4931 
4932 	/* caller must trigger fastpath on *retnce */
4933 	return (0);
4934 
4935 err_ret:
4936 	if (ncec != NULL)
4937 		kmem_cache_free(ncec_cache, ncec);
4938 	if (nce != NULL)
4939 		kmem_cache_free(nce_cache, nce);
4940 	freemsg(dlur_mp);
4941 	if (template != NULL)
4942 		kmem_free(template, ill->ill_phys_addr_length);
4943 	return (err);
4944 }
4945 
/*
 * Take a ref on the nce.  The reference count is modified while holding
 * nce_lock.
 */
void
nce_refhold(nce_t *nce)
{
	mutex_enter(&nce->nce_lock);
	nce->nce_refcnt++;
	/* a zero count right after the increment would mean it wrapped */
	ASSERT((nce)->nce_refcnt != 0);
	mutex_exit(&nce->nce_lock);
}
4957 
4958 /*
4959  * release a ref on the nce; In general, this
4960  * cannot be called with locks held because nce_inactive
4961  * may result in nce_inactive which will take the ill_lock,
4962  * do ipif_ill_refrele_tail etc. Thus the one exception
4963  * where this can be called with locks held is when the caller
4964  * is certain that the nce_refcnt is sufficient to prevent
4965  * the invocation of nce_inactive.
4966  */
4967 void
4968 nce_refrele(nce_t *nce)
4969 {
4970 	ASSERT((nce)->nce_refcnt != 0);
4971 	mutex_enter(&nce->nce_lock);
4972 	if (--nce->nce_refcnt == 0)
4973 		nce_inactive(nce); /* destroys the mutex */
4974 	else
4975 		mutex_exit(&nce->nce_lock);
4976 }
4977 
/*
 * Free the nce after all refs have gone away.
 *
 * Drops the hold on the associated ncec, frees the nce_fp_mp and
 * nce_dlur_mp messages, decrements the ill's nce count and finally
 * destroys nce_lock and returns the nce to nce_cache.
 */
static void
nce_inactive(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;

	ASSERT(nce->nce_refcnt == 0);

	ncec_refrele_notr(nce->nce_common);
	nce->nce_common = NULL;
	freemsg(nce->nce_fp_mp);
	freemsg(nce->nce_dlur_mp);

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt--;
	nce->nce_ill = NULL;
	/*
	 * Now that this nce no longer counts against the ill, check (via
	 * ILL_DOWN_OK) whether we need to restart any operation that is
	 * waiting for the references on the ill to go away.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&nce->nce_lock);
	kmem_cache_free(nce_cache, nce);
}
5013 
5014 /*
5015  * Add an nce to the ill_nce list.
5016  *
5017  * Adding multicast NCEs is subject to a per-ill limit. This function returns
5018  * NULL if that's the case, and it may reap a number of multicast nces.
5019  * Callers (and upstack) must be able to cope with NULL returns.
5020  */
5021 static nce_t *
5022 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
5023     list_t *graveyard)
5024 {
5025 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5026 
5027 	if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
5028 		if (nce_too_many_mcast(ill, graveyard)) {
5029 			kmem_cache_free(nce_cache, nce);
5030 			return (NULL);
5031 		}
5032 		ill->ill_mcast_nces++;
5033 	}
5034 
5035 	bzero(nce, sizeof (*nce));
5036 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
5037 	nce->nce_common = ncec;
5038 	nce->nce_addr = ncec->ncec_addr;
5039 	nce->nce_ill = ill;
5040 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
5041 	    (char *), "nce", (void *), nce);
5042 	ill->ill_nce_cnt++;
5043 
5044 	nce->nce_refcnt = 1; /* for the thread */
5045 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
5046 	nce->nce_dlur_mp = dlur_mp;
5047 
5048 	/* add nce to the ill's fastpath list.  */
5049 	nce->nce_refcnt++; /* for the list */
5050 	list_insert_head(&ill->ill_nce, nce);
5051 	return (nce);
5052 }
5053 
5054 static nce_t *
5055 nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
5056 {
5057 	nce_t	*nce;
5058 	mblk_t	*dlur_mp = NULL;
5059 
5060 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5061 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
5062 
5063 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
5064 	if (nce == NULL)
5065 		return (NULL);
5066 	if (ncec->ncec_lladdr != NULL ||
5067 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
5068 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
5069 		    ill->ill_phys_addr_length, ill->ill_sap,
5070 		    ill->ill_sap_length);
5071 		if (dlur_mp == NULL) {
5072 			kmem_cache_free(nce_cache, nce);
5073 			return (NULL);
5074 		}
5075 	}
5076 	/*
5077 	 * If nce_add_impl() returns NULL due to on multicast limiting, caller
5078 	 * will (correctly) assume ENOMEM.
5079 	 */
5080 	return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
5081 }
5082 
/*
 * Remove the nce from the ill's ill_nce (fastpath) list and drop the
 * reference that the list held on it.
 *
 * The caller must hold ill_lock.  The nce is marked condemned under
 * nce_lock so that only one thread performs the removal; if it is already
 * condemned this function returns without doing anything.
 */
void
nce_delete(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	mutex_enter(&nce->nce_lock);
	if (nce->nce_is_condemned) {
		/*
		 * some other thread has removed this nce from the ill_nce list
		 */
		mutex_exit(&nce->nce_lock);
		return;
	}
	nce->nce_is_condemned = B_TRUE;
	mutex_exit(&nce->nce_lock);

	/* Update the count of multicast NCEs. */
	if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
		ill->ill_mcast_nces--;

	list_remove(&ill->ill_nce, nce);
	/*
	 * even though we are holding the ill_lock, it is ok to
	 * call nce_refrele here because we know that we should have
	 * at least 2 refs on the nce: one for the thread, and one
	 * for the list. The refrele below will release the one for
	 * the list.
	 */
	nce_refrele(nce);
}
5118 
5119 nce_t *
5120 nce_lookup(ill_t *ill, const in6_addr_t *addr)
5121 {
5122 	nce_t *nce = NULL;
5123 
5124 	ASSERT(ill != NULL);
5125 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5126 
5127 	for (nce = list_head(&ill->ill_nce); nce != NULL;
5128 	    nce = list_next(&ill->ill_nce, nce)) {
5129 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
5130 			break;
5131 	}
5132 
5133 	/*
5134 	 * if we found the nce on the ill_nce list while holding
5135 	 * the ill_lock, then it cannot be condemned yet.
5136 	 */
5137 	if (nce != NULL) {
5138 		ASSERT(!nce->nce_is_condemned);
5139 		nce_refhold(nce);
5140 	}
5141 	return (nce);
5142 }
5143 
5144 /*
5145  * Walk the ill_nce list on ill. The callback function func() cannot perform
5146  * any destructive actions.
5147  */
5148 static void
5149 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
5150 {
5151 	nce_t *nce = NULL, *nce_next;
5152 
5153 	ASSERT(MUTEX_HELD(&ill->ill_lock));
5154 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
5155 		nce_next = list_next(&ill->ill_nce, nce);
5156 		if (func(ill, nce, arg) != 0)
5157 			break;
5158 		nce = nce_next;
5159 	}
5160 }
5161 
/*
 * Walk the ill_nce list on ill, calling func() for each nce.  This is
 * nce_walk_common() wrapped in an acquire/release of ill_lock, so the
 * callback runs with ill_lock held.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}
5169 
5170 void
5171 nce_flush(ill_t *ill, boolean_t flushall)
5172 {
5173 	nce_t *nce, *nce_next;
5174 	list_t dead;
5175 
5176 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
5177 	mutex_enter(&ill->ill_lock);
5178 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
5179 		nce_next = list_next(&ill->ill_nce, nce);
5180 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
5181 			nce = nce_next;
5182 			continue;
5183 		}
5184 		/*
5185 		 * nce_delete requires that the caller should either not
5186 		 * be holding locks, or should hold a ref to ensure that
5187 		 * we wont hit ncec_inactive. So take a ref and clean up
5188 		 * after the list is flushed.
5189 		 */
5190 		nce_refhold(nce);
5191 		nce_delete(nce);
5192 		list_insert_tail(&dead, nce);
5193 		nce = nce_next;
5194 	}
5195 	mutex_exit(&ill->ill_lock);
5196 	while ((nce = list_head(&dead)) != NULL) {
5197 		list_remove(&dead, nce);
5198 		nce_refrele(nce);
5199 	}
5200 	ASSERT(list_is_empty(&dead));
5201 	list_destroy(&dead);
5202 }
5203 
5204 /* Return an interval that is anywhere in the [1 .. intv] range */
5205 static clock_t
5206 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
5207 {
5208 	clock_t rnd, frac;
5209 
5210 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
5211 	/* Note that clock_t is signed; must chop off bits */
5212 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
5213 	if (initial_time) {
5214 		if (intv <= 0)
5215 			intv = 1;
5216 		else
5217 			intv = (rnd % intv) + 1;
5218 	} else {
5219 		/* Compute 'frac' as 20% of the configured interval */
5220 		if ((frac = intv / 5) <= 1)
5221 			frac = 2;
5222 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
5223 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
5224 			intv = 1;
5225 	}
5226 	return (intv);
5227 }
5228 
/*
 * Send out the packets that were queued on an IPMP ncec (ncec_qd_mp).
 *
 * The queue and the probe count are detached from the ncec under
 * ncec_lock.  For each packet a send_ill is chosen, an nce on that ill is
 * obtained with nce_fastpath_create() and the packet is passed to
 * ip_xmit().  Packets for which no suitable nce can be obtained are
 * dropped and counted as ipIfStatsOutDiscards.  Once the queue has been
 * drained, the callbacks on the ncec are dispatched.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t *mp;
	uint_t pkt_len;
	iaflags_t ixaflags = IXAF_NO_TRACE;
	nce_t *under_nce;
	ill_t	*ill = ncec->ncec_ill;
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t *src_ipif = NULL;
	ip_stack_t *ipst = ill->ill_ipst;
	ill_t *send_ill;
	uint_t nprobes;

	ASSERT(IS_IPMP(ill));

	/* take over the queued packets and reset the queue state */
	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		/* unlink mp from the b_next chain before processing it */
		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		/* look up the ipif matching the packet's source address */
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * The first ncec_nprobes packets on the queue are
			 * probes, so while nprobes is still non-zero a
			 * packet without a src_ipif is treated as a probe
			 * whose src got deleted, and is dropped.
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			/* no ill to send on; drop the ipif ref taken above */
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		/* done with send_ill and src_ipif; release the refs on them */
		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		/* count the discard against the matching IP version's MIB */
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec); /* complete callbacks */
}
5345