xref: /titanic_44/usr/src/uts/common/inet/ip/ip_ndp.c (revision 0db3240d392634cfff2f95fb6da34b56b8dc574f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 #include <sys/mac.h>
44 
45 #include <net/if.h>
46 #include <net/if_types.h>
47 #include <net/if_dl.h>
48 #include <net/route.h>
49 #include <netinet/in.h>
50 #include <netinet/ip6.h>
51 #include <netinet/icmp6.h>
52 
53 #include <inet/common.h>
54 #include <inet/mi.h>
55 #include <inet/mib2.h>
56 #include <inet/nd.h>
57 #include <inet/ip.h>
58 #include <inet/ip_impl.h>
59 #include <inet/ipclassifier.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_rts.h>
63 #include <inet/ip6.h>
64 #include <inet/ip_ndp.h>
65 #include <inet/sctp_ip.h>
66 #include <inet/ip_arp.h>
67 #include <inet/ip2mac_impl.h>
68 
69 #define	ANNOUNCE_INTERVAL(isv6) \
70 	(isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
71 	ipst->ips_ip_arp_publish_interval)
72 
73 #define	DEFENSE_INTERVAL(isv6) \
74 	(isv6 ? ipst->ips_ndp_defend_interval : \
75 	ipst->ips_arp_defend_interval)
76 
77 /* Non-tunable probe interval, based on link capabilities */
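/*
 * The interval is in milliseconds: 150 when the link provides link
 * up/down notifications (ill_note_link), 1500 otherwise.
 */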
78 #define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
79 
80 /*
81  * The IPv4 Link Local address space is special; we do extra duplicate checking
82  * there, as the entire assignment mechanism rests on random numbers.
83  */
84 #define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)ptr)[0] == 169 && \
85 				((uchar_t *)ptr)[1] == 254)
86 
87 /*
88  * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
89  * in to the ncec*add* functions.
90  *
91  * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
92  * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
93  * that we will respond to requests for the protocol address.
94  */
95 #define	NCE_EXTERNAL_FLAGS_MASK \
96 	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
97 	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
98 	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
99 
100 /*
101  * Lock ordering:
102  *
103  *	ndp_g_lock -> ill_lock -> ncec_lock
104  *
105  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
106  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
107  * ncec_refcnt).
108  */
109 
110 static	void	nce_cleanup_list(ncec_t *ncec);
111 static	void 	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
112 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
113     ncec_t *);
114 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
115 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
116     uint16_t ncec_flags, nce_t **newnce);
117 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
118     uint16_t ncec_flags, nce_t **newnce);
119 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
120     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
121     const in6_addr_t *target, int flag);
122 static void	ncec_refhold_locked(ncec_t *);
123 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
124 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
125 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
126     uint16_t, uint16_t, nce_t **);
127 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
128 static nce_t *nce_add(ill_t *, ncec_t *);
129 static void nce_inactive(nce_t *);
130 extern nce_t 	*nce_lookup(ill_t *, const in6_addr_t *);
131 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
132 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
133     uint16_t, uint16_t, nce_t **);
134 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
135     uint16_t, uint16_t, nce_t **);
136 static int  nce_add_v6_postprocess(nce_t *);
137 static int  nce_add_v4_postprocess(nce_t *);
138 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
139 static clock_t nce_fuzz_interval(clock_t, boolean_t);
140 static void nce_resolv_ipmp_ok(ncec_t *);
141 static void nce_walk_common(ill_t *, pfi_t, void *);
142 static void nce_start_timer(ncec_t *, uint_t);
143 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
144 static void nce_fastpath_trigger(nce_t *);
145 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
146 
147 #ifdef DEBUG
148 static void	ncec_trace_cleanup(const ncec_t *);
149 #endif
150 
151 #define	NCE_HASH_PTR_V4(ipst, addr)					\
152 	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
153 
154 #define	NCE_HASH_PTR_V6(ipst, addr)				 \
155 	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
156 		NCE_TABLE_SIZE)]))
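/*
 * Both macros evaluate to the address of the hash bucket head for the
 * given address in the per-stack v4/v6 NCE hash tables.
 */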
157 
158 extern kmem_cache_t *ncec_cache;
159 extern kmem_cache_t *nce_cache;
160 
161 /*
162  * Send out an IPv6 (unicast) or IPv4 (broadcast) DAD probe.
163  * If src_ill is not null, the ncec_addr is bound to src_ill. The
164  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
165  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
166  * IPMP cast_ill (in the IPMP case).
167  *
168  * Note that the probe interval is based on the src_ill for IPv6, and
169  * the ncec_xmit_interval for IPv4.
170  */
171 static void
172 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
173 {
174 	boolean_t dropped;
175 	uint32_t probe_interval;
176 
177 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
178 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
179 	if (ncec->ncec_ipversion == IPV6_VERSION) {
180 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
181 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
182 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
183 		probe_interval = ILL_PROBE_INTERVAL(src_ill);
184 	} else {
185 		/* For IPv4 DAD, the initial probe is delayed. */
186 		if (send_probe)
187 			dropped = arp_probe(ncec);
188 		else
189 			dropped = B_TRUE;
190 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
191 		    !send_probe);
192 	}
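	/*
	 * Only charge the probe against ncec_pcnt if it was actually
	 * transmitted; the timer is restarted either way.
	 */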
193 	if (!dropped) {
194 		mutex_enter(&ncec->ncec_lock);
195 		ncec->ncec_pcnt--;
196 		mutex_exit(&ncec->ncec_lock);
197 	}
198 	nce_restart_timer(ncec, probe_interval);
199 }
200 
201 /*
202  * Compute default flags to use for an advertisement of this ncec's address.
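 * The Override flag is not set for anycast entries, per RFC 2461.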
203  */
204 static int
205 nce_advert_flags(const ncec_t *ncec)
206 {
207 	int flag = 0;
208 
209 	if (ncec->ncec_flags & NCE_F_ISROUTER)
210 		flag |= NDP_ISROUTER;
211 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
212 		flag |= NDP_ORIDE;
213 
214 	return (flag);
215 }
216 
217 /*
218  * NDP Cache Entry creation routine.
219  * This routine must always be called with ndp6->ndp_g_lock held.
220  */
221 int
222 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
223     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
224 {
225 	int		err;
226 	nce_t		*nce;
227 
228 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
229 	ASSERT(ill != NULL && ill->ill_isv6);
230 
231 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
232 	    &nce);
233 	if (err != 0)
234 		return (err);
235 	ASSERT(newnce != NULL);
236 	*newnce = nce;
237 	return (err);
238 }
239 
240 /*
241  * Post-processing routine to be executed after nce_add_v6(). This function
242  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
243  * and must be called without any locks held.
244  */
245 int
246 nce_add_v6_postprocess(nce_t *nce)
247 {
248 	ncec_t		*ncec = nce->nce_common;
249 	boolean_t	dropped = B_FALSE;
250 	uchar_t		*hw_addr = ncec->ncec_lladdr;
251 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
252 	ill_t		*ill = ncec->ncec_ill;
253 	int		err = 0;
254 	uint16_t	flags = ncec->ncec_flags;
255 	ip_stack_t	*ipst = ill->ill_ipst;
256 	boolean_t	trigger_fastpath = B_TRUE;
257 
258 	/*
259 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
260 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
261 	 * We also call nce_fastpath from nce_update if the link layer
262 	 * address of the peer changes.
263 	 */
264 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
265 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
266 		trigger_fastpath = B_FALSE;
267 
268 	if (trigger_fastpath)
269 		nce_fastpath_trigger(nce);
270 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
271 		ill_t *hwaddr_ill;
272 		/*
273 		 * Unicast entry that needs DAD.
274 		 */
275 		if (IS_IPMP(ill)) {
276 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
277 			    hw_addr, hw_addr_len);
278 		} else {
279 			hwaddr_ill = ill;
280 		}
281 		nce_dad(ncec, hwaddr_ill, B_TRUE);
282 		err = EINPROGRESS;
283 	} else if (flags & NCE_F_UNSOL_ADV) {
284 		/*
285 		 * We account for the transmit below by assigning one
286 		 * less than the ndd variable. Subsequent decrements
287 		 * are done in nce_timer.
288 		 */
289 		mutex_enter(&ncec->ncec_lock);
290 		ncec->ncec_unsolicit_count =
291 		    ipst->ips_ip_ndp_unsolicit_count - 1;
292 		mutex_exit(&ncec->ncec_lock);
293 		dropped = ndp_xmit(ill,
294 		    ND_NEIGHBOR_ADVERT,
295 		    hw_addr,
296 		    hw_addr_len,
297 		    &ncec->ncec_addr,	/* Source and target of the adv */
298 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
299 		    nce_advert_flags(ncec));
300 		mutex_enter(&ncec->ncec_lock);
301 		if (dropped)
302 			ncec->ncec_unsolicit_count++;
303 		else
304 			ncec->ncec_last_time_defended = ddi_get_lbolt();
305 		if (ncec->ncec_unsolicit_count != 0) {
306 			nce_start_timer(ncec,
307 			    ipst->ips_ip_ndp_unsolicit_interval);
308 		}
309 		mutex_exit(&ncec->ncec_lock);
310 	}
311 	return (err);
312 }
313 
314 /*
315  * Atomically lookup and add (if needed) Neighbor Cache information for
316  * an address.
317  *
318  * IPMP notes: ncecs for non-local (i.e., !NCE_MYADDR(ncec)) addresses
319  * are always added pointing at the ipmp_ill. Thus, when the ill passed
320  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
321  * entries will be created, both pointing at the same ncec_t. The nce_t
322  * entries will have their nce_ill set to the ipmp_ill and the under_ill
323  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
324  * Local addresses are always created on the ill passed to nce_add_v6.
325  */
326 int
327 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
328     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
329 {
330 	int		err = 0;
331 	ip_stack_t	*ipst = ill->ill_ipst;
332 	nce_t		*nce, *upper_nce = NULL;
333 	ill_t		*in_ill = ill;
334 	boolean_t	need_ill_refrele = B_FALSE;
335 
336 	if (flags & NCE_F_MCAST) {
337 		/*
338 		 * hw_addr will be figured out in nce_set_multicast_v6;
339 		 * caller has to select the cast_ill
340 		 */
341 		ASSERT(hw_addr == NULL);
342 		ASSERT(!IS_IPMP(ill));
343 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
344 		return (err);
345 	}
346 	ASSERT(ill->ill_isv6);
347 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
348 		ill = ipmp_ill_hold_ipmp_ill(ill);
349 		if (ill == NULL)
350 			return (ENXIO);
351 		need_ill_refrele = B_TRUE;
352 	}
353 
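	/*
	 * At this point ill is the ill on which the ncec will be created:
	 * the ipmp_ill when an under ill was passed in for a non-local
	 * address, otherwise the ill that was passed in. in_ill still
	 * refers to the original ill.
	 */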
354 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
355 	nce = nce_lookup_addr(ill, addr);
356 	if (nce == NULL) {
357 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
358 		    &nce);
359 	} else {
360 		err = EEXIST;
361 	}
362 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
363 	if (err == 0)
364 		err = nce_add_v6_postprocess(nce);
365 	if (in_ill != ill && nce != NULL) {
366 		nce_t *under_nce = NULL;
367 
368 		/*
369 		 * in_ill was the under_ill. Try to create the under_nce.
370 		 * Hold the ill_g_lock to prevent changes to group membership
371 		 * until we are done.
372 		 */
373 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
374 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
375 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
376 			    ill_t *, ill);
377 			rw_exit(&ipst->ips_ill_g_lock);
378 			err = ENXIO;
379 			nce_refrele(nce);
380 			nce = NULL;
381 			goto bail;
382 		}
383 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
384 		if (under_nce == NULL) {
385 			rw_exit(&ipst->ips_ill_g_lock);
386 			err = EINVAL;
387 			nce_refrele(nce);
388 			nce = NULL;
389 			goto bail;
390 		}
391 		rw_exit(&ipst->ips_ill_g_lock);
392 		upper_nce = nce;
393 		nce = under_nce; /* will be returned to caller */
394 		if (NCE_ISREACHABLE(nce->nce_common))
395 			nce_fastpath_trigger(under_nce);
396 	}
397 	/* nce_refrele is deferred until the lock is dropped  */
398 	if (nce != NULL) {
399 		if (newnce != NULL)
400 			*newnce = nce;
401 		else
402 			nce_refrele(nce);
403 	}
404 bail:
405 	if (upper_nce != NULL)
406 		nce_refrele(upper_nce);
407 	if (need_ill_refrele)
408 		ill_refrele(ill);
409 	return (err);
410 }
411 
412 /*
413  * Remove all the CONDEMNED nces from the appropriate hash table.
414  * We create a private list of NCEs; these may have ires pointing
415  * to them, so the list is passed back to clean up the dependent ires,
416  * and only then can we do ncec_refrele(), which can make the NCE inactive.
417  */
418 static void
419 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
420 {
421 	ncec_t *ncec1;
422 	ncec_t **ptpn;
423 
424 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
425 	ASSERT(ndp->ndp_g_walker == 0);
426 	for (; ncec; ncec = ncec1) {
427 		ncec1 = ncec->ncec_next;
428 		mutex_enter(&ncec->ncec_lock);
429 		if (NCE_ISCONDEMNED(ncec)) {
430 			ptpn = ncec->ncec_ptpn;
431 			ncec1 = ncec->ncec_next;
432 			if (ncec1 != NULL)
433 				ncec1->ncec_ptpn = ptpn;
434 			*ptpn = ncec1;
435 			ncec->ncec_ptpn = NULL;
436 			ncec->ncec_next = NULL;
437 			ncec->ncec_next = *free_nce_list;
438 			*free_nce_list = ncec;
439 		}
440 		mutex_exit(&ncec->ncec_lock);
441 	}
442 }
443 
444 /*
445  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
446  *    will return this NCE. Also no new timeouts will
447  *    be started (See nce_restart_timer).
448  * 2. Cancel any currently running timeouts.
449  * 3. If there is an ndp walker, return. The walker will do the cleanup.
450  *    This ensures that walkers see a consistent list of NCEs while walking.
451  * 4. Otherwise remove the NCE from the list of NCEs
452  */
453 void
454 ncec_delete(ncec_t *ncec)
455 {
456 	ncec_t	**ptpn;
457 	ncec_t	*ncec1;
458 	int	ipversion = ncec->ncec_ipversion;
459 	ndp_g_t *ndp;
460 	ip_stack_t	*ipst = ncec->ncec_ipst;
461 
462 	if (ipversion == IPV4_VERSION)
463 		ndp = ipst->ips_ndp4;
464 	else
465 		ndp = ipst->ips_ndp6;
466 
467 	/* Serialize deletes */
468 	mutex_enter(&ncec->ncec_lock);
469 	if (NCE_ISCONDEMNED(ncec)) {
470 		/* Some other thread is doing the delete */
471 		mutex_exit(&ncec->ncec_lock);
472 		return;
473 	}
474 	/*
475 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
476 	 * refcnt has to be >= 2
477 	 */
478 	ASSERT(ncec->ncec_refcnt >= 2);
479 	ncec->ncec_flags |= NCE_F_CONDEMNED;
480 	mutex_exit(&ncec->ncec_lock);
481 
482 	/* Count how many condemned ncecs for the kmem_cache callback */
483 	atomic_add_32(&ipst->ips_num_nce_condemned, 1);
484 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
485 
486 	/* Complete any waiting callbacks */
487 	ncec_cb_dispatch(ncec);
488 
489 	/*
490 	 * Cancel any running timer. Timeout can't be restarted
491 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
492 	 * Passing invalid timeout id is fine.
493 	 */
494 	if (ncec->ncec_timeout_id != 0) {
495 		(void) untimeout(ncec->ncec_timeout_id);
496 		ncec->ncec_timeout_id = 0;
497 	}
498 
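	/*
	 * Unlink the ncec from its hash bucket, unless a walker is active;
	 * in that case the last walker does the unlink (see nce_remove).
	 */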
499 	mutex_enter(&ndp->ndp_g_lock);
500 	if (ncec->ncec_ptpn == NULL) {
501 		/*
502 		 * The last ndp walker has already removed this ncec from
503 		 * the list after we marked the ncec CONDEMNED and before
504 		 * we grabbed the global lock.
505 		 */
506 		mutex_exit(&ndp->ndp_g_lock);
507 		return;
508 	}
509 	if (ndp->ndp_g_walker > 0) {
510 		/*
511 		 * Can't unlink. The walker will clean up
512 		 */
513 		ndp->ndp_g_walker_cleanup = B_TRUE;
514 		mutex_exit(&ndp->ndp_g_lock);
515 		return;
516 	}
517 
518 	/*
519 	 * Now remove the ncec from the list. nce_restart_timer won't restart
520 	 * the timer since it is marked CONDEMNED.
521 	 */
522 	ptpn = ncec->ncec_ptpn;
523 	ncec1 = ncec->ncec_next;
524 	if (ncec1 != NULL)
525 		ncec1->ncec_ptpn = ptpn;
526 	*ptpn = ncec1;
527 	ncec->ncec_ptpn = NULL;
528 	ncec->ncec_next = NULL;
529 	mutex_exit(&ndp->ndp_g_lock);
530 
531 	/* Removed from ncec_ptpn/ncec_next list */
532 	ncec_refrele_notr(ncec);
533 }
534 
535 void
536 ncec_inactive(ncec_t *ncec)
537 {
538 	mblk_t		**mpp;
539 	ill_t		*ill = ncec->ncec_ill;
540 	ip_stack_t	*ipst = ncec->ncec_ipst;
541 
542 	ASSERT(ncec->ncec_refcnt == 0);
543 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
544 
545 	/* Count how many condemned nces for kmem_cache callback */
546 	if (NCE_ISCONDEMNED(ncec))
547 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
548 
549 	/* Free all allocated messages */
550 	mpp = &ncec->ncec_qd_mp;
551 	while (*mpp != NULL) {
552 		mblk_t  *mp;
553 
554 		mp = *mpp;
555 		*mpp = mp->b_next;
556 
557 		inet_freemsg(mp);
558 	}
559 	/*
560 	 * must have been cleaned up in ncec_delete
561 	 */
562 	ASSERT(list_is_empty(&ncec->ncec_cb));
563 	list_destroy(&ncec->ncec_cb);
564 	/*
565 	 * free the ncec_lladdr if one was allocated in nce_add_common()
566 	 */
567 	if (ncec->ncec_lladdr_length > 0)
568 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
569 
570 #ifdef DEBUG
571 	ncec_trace_cleanup(ncec);
572 #endif
573 
574 	mutex_enter(&ill->ill_lock);
575 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
576 	    (char *), "ncec", (void *), ncec);
577 	ill->ill_ncec_cnt--;
578 	ncec->ncec_ill = NULL;
579 	/*
580 	 * If the number of ncec's associated with this ill has dropped
581 	 * to zero, check whether we need to restart any operation that
582 	 * is waiting for this to happen.
583 	 */
584 	if (ILL_DOWN_OK(ill)) {
585 		/* ipif_ill_refrele_tail drops the ill_lock */
586 		ipif_ill_refrele_tail(ill);
587 	} else {
588 		mutex_exit(&ill->ill_lock);
589 	}
590 
591 	mutex_destroy(&ncec->ncec_lock);
592 	kmem_cache_free(ncec_cache, ncec);
593 }
594 
595 /*
596  * ncec_walk routine.  Delete the ncec if it is associated with the ill
597  * that is going away.  Always called as a writer.
598  */
599 void
600 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
601 {
602 	if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
603 		ncec_delete(ncec);
604 	}
605 }
606 
607 /*
608  * Neighbor Cache cleanup logic for a list of ncec_t entries.
609  */
610 static void
611 nce_cleanup_list(ncec_t *ncec)
612 {
613 	ncec_t *ncec_next;
614 
615 	ASSERT(ncec != NULL);
616 	while (ncec != NULL) {
617 		ncec_next = ncec->ncec_next;
618 		ncec->ncec_next = NULL;
619 
620 		/*
621 		 * It is possible for the last ndp walker (this thread)
622 		 * to come here after ncec_delete has marked the ncec CONDEMNED
623 		 * and before it has removed the ncec from the fastpath list
624 		 * or called untimeout. So we need to do it here. It is safe
625 		 * for both ncec_delete and this thread to do it twice or
626 		 * even simultaneously since each of the threads has a
627 		 * reference on the ncec.
628 		 */
629 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
630 		/*
631 		 * Cancel any running timer. Timeout can't be restarted
632 		 * since CONDEMNED is set. The ncec_lock can't be
633 		 * held across untimeout though passing invalid timeout
634 		 * id is fine.
635 		 */
636 		if (ncec->ncec_timeout_id != 0) {
637 			(void) untimeout(ncec->ncec_timeout_id);
638 			ncec->ncec_timeout_id = 0;
639 		}
640 		/* Removed from ncec_ptpn/ncec_next list */
641 		ncec_refrele_notr(ncec);
642 		ncec = ncec_next;
643 	}
644 }
645 
646 /*
647  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
648  */
649 boolean_t
650 nce_restart_dad(ncec_t *ncec)
651 {
652 	boolean_t started;
653 	ill_t *ill, *hwaddr_ill;
654 
655 	if (ncec == NULL)
656 		return (B_FALSE);
657 	ill = ncec->ncec_ill;
658 	mutex_enter(&ncec->ncec_lock);
659 	if (ncec->ncec_state == ND_PROBE) {
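		/* DAD is already in progress; report it as started. */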
660 		mutex_exit(&ncec->ncec_lock);
661 		started = B_TRUE;
662 	} else if (ncec->ncec_state == ND_REACHABLE) {
663 		ASSERT(ncec->ncec_lladdr != NULL);
664 		ncec->ncec_state = ND_PROBE;
665 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
666 		/*
667 		 * Slight cheat here: we don't use the initial probe delay
668 		 * for IPv4 in this obscure case.
669 		 */
670 		mutex_exit(&ncec->ncec_lock);
671 		if (IS_IPMP(ill)) {
672 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
673 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
674 		} else {
675 			hwaddr_ill = ill;
676 		}
677 		nce_dad(ncec, hwaddr_ill, B_TRUE);
678 		started = B_TRUE;
679 	} else {
680 		mutex_exit(&ncec->ncec_lock);
681 		started = B_FALSE;
682 	}
683 	return (started);
684 }
685 
686 /*
687  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
688  * If one is found, the refcnt on the ncec will be incremented.
689  */
690 ncec_t *
691 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
692 {
693 	ncec_t		*ncec;
694 	ip_stack_t	*ipst = ill->ill_ipst;
695 
696 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
697 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
698 
699 	/* Get head of v6 hash table */
700 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
701 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
702 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
703 	rw_exit(&ipst->ips_ill_g_lock);
704 	return (ncec);
705 }
706 /*
707  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
708  * If one is found, the refcnt on the ncec will be incremented.
709  */
710 ncec_t *
711 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
712 {
713 	ncec_t	*ncec = NULL;
714 	in6_addr_t addr6;
715 	ip_stack_t *ipst = ill->ill_ipst;
716 
717 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
718 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
719 
720 	/* Get head of v4 hash table */
721 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
722 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
723 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
724 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
725 	rw_exit(&ipst->ips_ill_g_lock);
726 	return (ncec);
727 }
728 
729 /*
730  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
731  * If an ncec is found, increment the hold count on that ncec.
732  * The caller passes in the start of the appropriate hash table, and must
733  * be holding the appropriate global lock (ndp_g_lock). In addition, since
734  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
735  * must be held as reader.
736  *
737  * This function always matches across the ipmp group.
738  */
739 ncec_t *
740 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
741 {
742 	ndp_g_t		*ndp;
743 	ip_stack_t	*ipst = ill->ill_ipst;
744 
745 	if (ill->ill_isv6)
746 		ndp = ipst->ips_ndp6;
747 	else
748 		ndp = ipst->ips_ndp4;
749 
750 	ASSERT(ill != NULL);
751 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
752 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
753 		return (NULL);
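	/*
	 * Walk the hash chain, matching entries on this ill or any ill in
	 * the same IPMP group, and skip entries that are CONDEMNED.
	 */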
754 	for (; ncec != NULL; ncec = ncec->ncec_next) {
755 		if (ncec->ncec_ill == ill ||
756 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
757 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
758 				mutex_enter(&ncec->ncec_lock);
759 				if (!NCE_ISCONDEMNED(ncec)) {
760 					ncec_refhold_locked(ncec);
761 					mutex_exit(&ncec->ncec_lock);
762 					break;
763 				}
764 				mutex_exit(&ncec->ncec_lock);
765 			}
766 		}
767 	}
768 	return (ncec);
769 }
770 
771 /*
772  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
773  * entries for ill only, i.e., when ill is part of an ipmp group,
774  * nce_lookup_v4 will never try to match across the group.
775  */
776 nce_t *
777 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
778 {
779 	nce_t *nce;
780 	in6_addr_t addr6;
781 	ip_stack_t *ipst = ill->ill_ipst;
782 
783 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
784 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
785 	nce = nce_lookup_addr(ill, &addr6);
786 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
787 	return (nce);
788 }
789 
790 /*
791  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
792  * entries for ill only, i.e., when ill is part of an ipmp group,
793  * nce_lookup_v6 will never try to match across the group.
794  */
795 nce_t *
796 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
797 {
798 	nce_t *nce;
799 	ip_stack_t *ipst = ill->ill_ipst;
800 
801 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
802 	nce = nce_lookup_addr(ill, addr6);
803 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
804 	return (nce);
805 }
806 
807 static nce_t *
808 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
809 {
810 	nce_t *nce;
811 
812 	ASSERT(ill != NULL);
813 #ifdef DEBUG
814 	if (ill->ill_isv6)
815 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
816 	else
817 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
818 #endif
819 	mutex_enter(&ill->ill_lock);
820 	nce = nce_lookup(ill, addr);
821 	mutex_exit(&ill->ill_lock);
822 	return (nce);
823 }
824 
825 
826 /*
827  * Router turned to host.  We need to make sure that cached copies of the ncec
828  * are not used for forwarding packets if they were derived from the default
829  * route, and that the default route itself is removed, as required by
830  * section 7.2.5 of RFC 2461.
831  *
832  * Note that the ncec itself probably has valid link-layer information for the
833  * nexthop, so that there is no reason to delete the ncec, as long as the
834  * ISROUTER flag is turned off.
835  */
836 static void
837 ncec_router_to_host(ncec_t *ncec)
838 {
839 	ire_t		*ire;
840 	ip_stack_t	*ipst = ncec->ncec_ipst;
841 
842 	mutex_enter(&ncec->ncec_lock);
843 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
844 	mutex_exit(&ncec->ncec_lock);
845 
846 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
847 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
848 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
849 	if (ire != NULL) {
850 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
851 		ire_delete(ire);
852 		ire_refrele(ire);
853 	}
854 }
855 
856 /*
857  * Process passed in parameters either from an incoming packet or via
858  * user ioctl.
859  */
860 void
861 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
862 {
863 	ill_t	*ill = ncec->ncec_ill;
864 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
865 	boolean_t ll_updated = B_FALSE;
866 	boolean_t ll_changed;
867 	nce_t	*nce;
868 
869 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
870 	/*
871 	 * No updates of the link layer address or the neighbor state are
872 	 * allowed when the cache is in NONUD state.  This still
873 	 * allows for responding to reachability solicitations.
874 	 */
875 	mutex_enter(&ncec->ncec_lock);
876 	if (ncec->ncec_state == ND_INCOMPLETE) {
877 		if (hw_addr == NULL) {
878 			mutex_exit(&ncec->ncec_lock);
879 			return;
880 		}
881 		nce_set_ll(ncec, hw_addr);
882 		/*
883 		 * Update the ncec state and send the queued packets
884 		 * back to ip; this time the ire will be added.
885 		 */
886 		if (flag & ND_NA_FLAG_SOLICITED) {
887 			nce_update(ncec, ND_REACHABLE, NULL);
888 		} else {
889 			nce_update(ncec, ND_STALE, NULL);
890 		}
891 		mutex_exit(&ncec->ncec_lock);
892 		nce = nce_fastpath(ncec, B_TRUE, NULL);
893 		nce_resolv_ok(ncec);
894 		if (nce != NULL)
895 			nce_refrele(nce);
896 		return;
897 	}
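	/* ll_changed is B_TRUE if hw_addr differs from the cached lladdr */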
898 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
899 	if (!is_adv) {
900 		/* If this is a SOLICITATION request only */
901 		if (ll_changed)
902 			nce_update(ncec, ND_STALE, hw_addr);
903 		mutex_exit(&ncec->ncec_lock);
904 		ncec_cb_dispatch(ncec);
905 		return;
906 	}
907 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
908 		/* If in any other state than REACHABLE, ignore */
909 		if (ncec->ncec_state == ND_REACHABLE) {
910 			nce_update(ncec, ND_STALE, NULL);
911 		}
912 		mutex_exit(&ncec->ncec_lock);
913 		ncec_cb_dispatch(ncec);
914 		return;
915 	} else {
916 		if (ll_changed) {
917 			nce_update(ncec, ND_UNCHANGED, hw_addr);
918 			ll_updated = B_TRUE;
919 		}
920 		if (flag & ND_NA_FLAG_SOLICITED) {
921 			nce_update(ncec, ND_REACHABLE, NULL);
922 		} else {
923 			if (ll_updated) {
924 				nce_update(ncec, ND_STALE, NULL);
925 			}
926 		}
927 		mutex_exit(&ncec->ncec_lock);
928 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
929 		    NCE_F_ISROUTER)) {
930 			ncec_router_to_host(ncec);
931 		} else {
932 			ncec_cb_dispatch(ncec);
933 		}
934 	}
935 }
936 
937 /*
938  * Pass arg1 to the pfi supplied, along with each ncec in existence.
939  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
940  * walking the hash list.
941  */
942 void
943 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
944     boolean_t trace)
945 {
946 	ncec_t	*ncec;
947 	ncec_t	*ncec1;
948 	ncec_t	**ncep;
949 	ncec_t	*free_nce_list = NULL;
950 
951 	mutex_enter(&ndp->ndp_g_lock);
952 	/* Prevent ncec_delete from unlink and free of NCE */
953 	ndp->ndp_g_walker++;
954 	mutex_exit(&ndp->ndp_g_lock);
955 	for (ncep = ndp->nce_hash_tbl;
956 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
957 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
958 			ncec1 = ncec->ncec_next;
959 			if (ill == NULL || ncec->ncec_ill == ill) {
960 				if (trace) {
961 					ncec_refhold(ncec);
962 					(*pfi)(ncec, arg1);
963 					ncec_refrele(ncec);
964 				} else {
965 					ncec_refhold_notr(ncec);
966 					(*pfi)(ncec, arg1);
967 					ncec_refrele_notr(ncec);
968 				}
969 			}
970 		}
971 	}
972 	mutex_enter(&ndp->ndp_g_lock);
973 	ndp->ndp_g_walker--;
974 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
975 		/* Time to delete condemned entries */
976 		for (ncep = ndp->nce_hash_tbl;
977 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
978 			ncec = *ncep;
979 			if (ncec != NULL) {
980 				nce_remove(ndp, ncec, &free_nce_list);
981 			}
982 		}
983 		ndp->ndp_g_walker_cleanup = B_FALSE;
984 	}
985 
986 	mutex_exit(&ndp->ndp_g_lock);
987 
988 	if (free_nce_list != NULL) {
989 		nce_cleanup_list(free_nce_list);
990 	}
991 }
992 
993 /*
994  * Walk everything.
995  * Note that ill can be NULL hence can't derive the ipst from it.
996  */
997 void
998 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
999 {
1000 	ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1001 	ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1002 }
1003 
1004 /*
1005  * For each interface an entry is added for the unspecified multicast group.
1006  * Here that mapping is used to form the multicast cache entry for a particular
1007  * multicast destination.
1008  */
1009 static int
1010 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1011     uint16_t flags, nce_t **newnce)
1012 {
1013 	uchar_t		*hw_addr;
1014 	int		err = 0;
1015 	ip_stack_t	*ipst = ill->ill_ipst;
1016 	nce_t		*nce;
1017 
1018 	ASSERT(ill != NULL);
1019 	ASSERT(ill->ill_isv6);
1020 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1021 
1022 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1023 	nce = nce_lookup_addr(ill, dst);
1024 	if (nce != NULL) {
1025 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1026 		goto done;
1027 	}
1028 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1029 		/*
1030 		 * For IRE_IF_RESOLVER a hardware mapping can be
1031 		 * generated.
1032 		 */
1033 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1034 		if (hw_addr == NULL) {
1035 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1036 			return (ENOMEM);
1037 		}
1038 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1039 	} else {
1040 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
1041 		hw_addr = NULL;
1042 	}
1043 	ASSERT((flags & NCE_F_MCAST) != 0);
1044 	ASSERT((flags & NCE_F_NONUD) != 0);
1045 	/* nce_state will be computed by nce_add_common() */
1046 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1047 	    ND_UNCHANGED, &nce);
1048 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1049 	if (err == 0)
1050 		err = nce_add_v6_postprocess(nce);
1051 	if (hw_addr != NULL)
1052 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1053 	if (err != 0) {
1054 		ip1dbg(("nce_set_multicast_v6: create failed %d\n", err));
1055 		return (err);
1056 	}
1057 done:
1058 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1059 	if (newnce != NULL)
1060 		*newnce = nce;
1061 	else
1062 		nce_refrele(nce);
1063 	return (0);
1064 }
1065 
1066 /*
1067  * Return the link layer address, and any flags of a ncec.
1068  */
1069 int
1070 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1071 {
1072 	ncec_t		*ncec;
1073 	in6_addr_t	*addr;
1074 	sin6_t		*sin6;
1075 
1076 	ASSERT(ill != NULL && ill->ill_isv6);
1077 	sin6 = (sin6_t *)&lnr->lnr_addr;
1078 	addr =  &sin6->sin6_addr;
1079 
1080 	/*
1081 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1082 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1083 	 * addresses for the data addresses on an IPMP interface even though
1084 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1085 	 */
1086 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1087 	if (ncec == NULL)
1088 		return (ESRCH);
1089 	/* If no link layer address is available yet, return ESRCH */
1090 	if (!NCE_ISREACHABLE(ncec)) {
1091 		ncec_refrele(ncec);
1092 		return (ESRCH);
1093 	}
1094 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1095 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1096 	    lnr->lnr_hdw_len);
1097 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1098 		lnr->lnr_flags = NDF_ISROUTER_ON;
1099 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1100 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1101 	ncec_refrele(ncec);
1102 	return (0);
1103 }
1104 
1105 /*
1106  * Finish setting up the Enable/Disable multicast for the driver.
1107  */
1108 mblk_t *
1109 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1110     uint32_t hw_addr_offset, mblk_t *mp)
1111 {
1112 	uchar_t		*hw_addr;
1113 	ipaddr_t	v4group;
1114 	uchar_t		*addr;
1115 
1116 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1117 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1118 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1119 
1120 		ASSERT(CLASSD(v4group));
1121 		ASSERT(!(ill->ill_isv6));
1122 
1123 		addr = (uchar_t *)&v4group;
1124 	} else {
1125 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1126 		ASSERT(ill->ill_isv6);
1127 
1128 		addr = (uchar_t *)v6group;
1129 	}
1130 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1131 	if (hw_addr == NULL) {
1132 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1133 		freemsg(mp);
1134 		return (NULL);
1135 	}
1136 
1137 	ip_mcast_mapping(ill, addr, hw_addr);
1138 	return (mp);
1139 }
1140 
1141 void
1142 ip_ndp_resolve(ncec_t *ncec)
1143 {
1144 	in_addr_t	sender4 = INADDR_ANY;
1145 	in6_addr_t	sender6 = ipv6_all_zeros;
1146 	ill_t		*src_ill;
1147 	uint32_t	ms;
1148 
1149 	src_ill = nce_resolve_src(ncec, &sender6);
1150 	if (src_ill == NULL) {
1151 		/* Make sure we try again later */
1152 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1153 		nce_restart_timer(ncec, (clock_t)ms);
1154 		return;
1155 	}
1156 	if (ncec->ncec_ipversion == IPV4_VERSION)
1157 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1158 	mutex_enter(&ncec->ncec_lock);
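	/*
	 * ndp_solicit/arp_request return the retransmit delay in
	 * milliseconds; zero means we should give up on resolution.
	 */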
1159 	if (ncec->ncec_ipversion == IPV6_VERSION)
1160 		ms = ndp_solicit(ncec, sender6, src_ill);
1161 	else
1162 		ms = arp_request(ncec, sender4, src_ill);
1163 	mutex_exit(&ncec->ncec_lock);
1164 	if (ms == 0) {
1165 		if (ncec->ncec_state != ND_REACHABLE) {
1166 			if (ncec->ncec_ipversion == IPV6_VERSION)
1167 				ndp_resolv_failed(ncec);
1168 			else
1169 				arp_resolv_failed(ncec);
1170 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1171 			nce_make_unreachable(ncec);
1172 			ncec_delete(ncec);
1173 		}
1174 	} else {
1175 		nce_restart_timer(ncec, (clock_t)ms);
1176 	}
1177 done:
1178 	ill_refrele(src_ill);
1179 }
1180 
1181 /*
1182  * Send an IPv6 neighbor solicitation.
1183  * Returns number of milliseconds after which we should either rexmit or abort.
1184  * Return of zero means we should abort.
1185  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1186  * The optional source address is used as a hint to ndp_solicit for
1187  * which source to use in the packet.
1188  *
1189  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1190  * the packet.
1191  */
1192 uint32_t
1193 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1194 {
1195 	in6_addr_t	dst;
1196 	boolean_t	dropped = B_FALSE;
1197 
1198 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1199 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1200 
1201 	if (ncec->ncec_rcnt == 0)
1202 		return (0);
1203 
1204 	dst = ncec->ncec_addr;
1205 	ncec->ncec_rcnt--;
1206 	mutex_exit(&ncec->ncec_lock);
1207 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1208 	    ill->ill_phys_addr_length, &src, &dst, 0);
1209 	mutex_enter(&ncec->ncec_lock);
1210 	if (dropped)
1211 		ncec->ncec_rcnt++;
1212 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1213 }
1214 
1215 /*
1216  * Attempt to recover an address on an interface that's been marked as a
1217  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1218  * no easy way to just probe the address and have the right thing happen if
1219  * it's no longer in use.  Instead, we just bring it up normally and allow the
1220  * regular interface start-up logic to probe for a remaining duplicate and take
1221  * us back down if necessary.
1222  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1223  * ip_ndp_excl.
1224  */
1225 /* ARGSUSED */
1226 void
1227 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1228 {
1229 	ill_t	*ill = rq->q_ptr;
1230 	ipif_t	*ipif;
1231 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1232 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1233 	boolean_t addr_equal;
1234 
1235 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1236 		/*
1237 		 * We do not support recovery of proxy ARP'd interfaces,
1238 		 * because the system lacks a complete proxy ARP mechanism.
1239 		 */
1240 		if (ill->ill_isv6) {
1241 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1242 			    addr6);
1243 		} else {
1244 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1245 		}
1246 
1247 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1248 			continue;
1249 
1250 		/*
1251 		 * If we have already recovered or if the interface is going
1252 		 * away, then ignore.
1253 		 */
1254 		mutex_enter(&ill->ill_lock);
1255 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1256 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1257 			mutex_exit(&ill->ill_lock);
1258 			continue;
1259 		}
1260 
1261 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1262 		ill->ill_ipif_dup_count--;
1263 		mutex_exit(&ill->ill_lock);
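		/*
		 * Record that this ipif was once a duplicate so that a
		 * conflict hit during recovery does not trigger another
		 * console warning (see ip_ndp_excl).
		 */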
1264 		ipif->ipif_was_dup = B_TRUE;
1265 
1266 		if (ill->ill_isv6) {
1267 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1268 			(void) ipif_up_done_v6(ipif);
1269 		} else {
1270 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1271 			    EINPROGRESS);
1272 			(void) ipif_up_done(ipif);
1273 		}
1274 	}
1275 	freeb(mp);
1276 }
1277 
1278 /*
1279  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1280  * As long as someone else holds the address, the interface will stay down.
1281  * When that conflict goes away, the interface is brought back up.  This is
1282  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1283  * server will recover from a failure.
1284  *
1285  * For DHCP and temporary addresses, recovery is not done in the kernel.
1286  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1287  *
1288  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1289  */
1290 void
1291 ipif_dup_recovery(void *arg)
1292 {
1293 	ipif_t *ipif = arg;
1294 
1295 	ipif->ipif_recovery_id = 0;
1296 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1297 		return;
1298 
1299 	/*
1300 	 * No lock, because this is just an optimization.
1301 	 */
1302 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1303 		return;
1304 
1305 	/* If the link is down, we'll retry this later */
1306 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1307 		return;
1308 
1309 	ipif_do_recovery(ipif);
1310 }
1311 
1312 /*
1313  * Perform interface recovery by forcing the duplicate interfaces up and
1314  * allowing the system to determine which ones should stay up.
1315  *
1316  * Called both by recovery timer expiry and link-up notification.
1317  */
1318 void
1319 ipif_do_recovery(ipif_t *ipif)
1320 {
1321 	ill_t *ill = ipif->ipif_ill;
1322 	mblk_t *mp;
1323 	ip_stack_t *ipst = ill->ill_ipst;
1324 	size_t mp_size;
1325 
1326 	if (ipif->ipif_isv6)
1327 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1328 	else
1329 		mp_size = sizeof (ipif->ipif_lcl_addr);
1330 	mp = allocb(mp_size, BPRI_MED);
1331 	if (mp == NULL) {
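		/*
		 * Allocation failed; re-arm the recovery timer so that we
		 * retry later, unless recovery is disabled or the ipif is
		 * going away.
		 */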
1332 		mutex_enter(&ill->ill_lock);
1333 		if (ipst->ips_ip_dup_recovery > 0 &&
1334 		    ipif->ipif_recovery_id == 0 &&
1335 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1336 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1337 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1338 		}
1339 		mutex_exit(&ill->ill_lock);
1340 	} else {
1341 		/*
1342 		 * A recovery timer may still be running if we got here from
1343 		 * ill_restart_dad(); cancel that timer.
1344 		 */
1345 		if (ipif->ipif_recovery_id != 0)
1346 			(void) untimeout(ipif->ipif_recovery_id);
1347 		ipif->ipif_recovery_id = 0;
1348 
1349 		if (ipif->ipif_isv6) {
1350 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1351 			    sizeof (ipif->ipif_v6lcl_addr));
1352 		} else  {
1353 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1354 			    sizeof (ipif->ipif_lcl_addr));
1355 		}
1356 		ill_refhold(ill);
1357 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1358 		    B_FALSE);
1359 	}
1360 }
1361 
1362 /*
1363  * Find the MAC and IP addresses in an NA/NS message.
1364  */
1365 static void
1366 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1367     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1368 {
1369 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1370 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1371 	uchar_t *addr;
1372 	int alen;
1373 
1374 	/* icmp_inbound_v6 ensures this */
1375 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1376 
1377 	addr = ira->ira_l2src;
1378 	alen = ill->ill_phys_addr_length;
1379 	if (alen > 0) {
1380 		*haddr = addr;
1381 		*haddrlenp = alen;
1382 	} else {
1383 		*haddr = NULL;
1384 		*haddrlenp = 0;
1385 	}
1386 
1387 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1388 	*targp = ns->nd_ns_target;
1389 }
1390 
1391 /*
1392  * This is for exclusive changes due to NDP duplicate address detection
1393  * failure.
1394  */
1395 /* ARGSUSED */
1396 static void
1397 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1398 {
1399 	ill_t	*ill = rq->q_ptr;
1400 	ipif_t	*ipif;
1401 	uchar_t	*haddr;
1402 	uint_t	haddrlen;
1403 	ip_stack_t *ipst = ill->ill_ipst;
1404 	in6_addr_t targ;
1405 	ip_recv_attr_t iras;
1406 	mblk_t	*attrmp;
1407 
1408 	attrmp = mp;
1409 	mp = mp->b_cont;
1410 	attrmp->b_cont = NULL;
1411 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1412 		/* The ill or ip_stack_t disappeared on us */
1413 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1414 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1415 		freemsg(mp);
1416 		ira_cleanup(&iras, B_TRUE);
1417 		return;
1418 	}
1419 
1420 	ASSERT(ill == iras.ira_rill);
1421 
1422 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1423 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1424 		/*
1425 		 * Ignore conflicts generated by misbehaving switches that
1426 		 * just reflect our own messages back to us.  For IPMP, we may
1427 		 * see reflections across any ill in the illgrp.
1428 		 *
1429 		 * RFC2462 and revisions tried to detect both the case
1430 		 * when a statically configured IPv6 address is a duplicate,
1431 		 * and the case when the L2 address itself is a duplicate. The
1432 		 * latter is important because, with stateless address autoconf,
1433 		 * if the L2 address is a duplicate, the resulting IPv6
1434 		 * address(es) would also be duplicates. We rely on DAD of the
1435 		 * IPv6 address itself to detect the latter case.
1436 		 */
1437 		/* For an under ill, ill_grp can change; check under the lock */
1438 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1439 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1440 		    (IS_UNDER_IPMP(ill) &&
1441 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1442 		    haddrlen) != NULL)) {
1443 			rw_exit(&ipst->ips_ill_g_lock);
1444 			goto ignore_conflict;
1445 		}
1446 		rw_exit(&ipst->ips_ill_g_lock);
1447 	}
1448 
1449 	/*
1450 	 * Look up the appropriate ipif.
1451 	 */
1452 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1453 	if (ipif == NULL)
1454 		goto ignore_conflict;
1455 
1456 	/* Reload the ill to match the ipif */
1457 	ill = ipif->ipif_ill;
1458 
1459 	/* If it's already duplicate or ineligible, then don't do anything. */
1460 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1461 		ipif_refrele(ipif);
1462 		goto ignore_conflict;
1463 	}
1464 
1465 	/*
1466 	 * If this is a failure during duplicate recovery, then don't
1467 	 * complain.  It may take a long time to recover.
1468 	 */
1469 	if (!ipif->ipif_was_dup) {
1470 		char ibuf[LIFNAMSIZ];
1471 		char hbuf[MAC_STR_LEN];
1472 		char sbuf[INET6_ADDRSTRLEN];
1473 
1474 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1475 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1476 		    " disabled", ibuf,
1477 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1478 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1479 	}
1480 	mutex_enter(&ill->ill_lock);
1481 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1482 	ipif->ipif_flags |= IPIF_DUPLICATE;
1483 	ill->ill_ipif_dup_count++;
1484 	mutex_exit(&ill->ill_lock);
1485 	(void) ipif_down(ipif, NULL, NULL);
1486 	(void) ipif_down_tail(ipif);
1487 	mutex_enter(&ill->ill_lock);
1488 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1489 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1490 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1491 	    ipst->ips_ip_dup_recovery > 0) {
1492 		ASSERT(ipif->ipif_recovery_id == 0);
1493 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1494 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1495 	}
1496 	mutex_exit(&ill->ill_lock);
1497 	ipif_refrele(ipif);
1498 
1499 ignore_conflict:
1500 	freemsg(mp);
1501 	ira_cleanup(&iras, B_TRUE);
1502 }
1503 
1504 /*
1505  * Handle failure by tearing down the ipifs with the specified address.  Note
1506  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1507  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1508  * we start a timer on the ipif.
1509  * Caller has to free mp.
1510  */
1511 static void
1512 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1513 {
1514 	const uchar_t	*haddr;
1515 	ill_t		*ill = ira->ira_rill;
1516 
1517 	/*
1518 	 * Ignore conflicts generated by misbehaving switches that just
1519 	 * reflect our own messages back to us.
1520 	 */
1521 
1522 	/* icmp_inbound_v6 ensures this */
1523 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1524 	haddr = ira->ira_l2src;
1525 	if (haddr != NULL &&
1526 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1527 		return;
1528 	}
1529 
1530 	if ((mp = copymsg(mp)) != NULL) {
1531 		mblk_t	*attrmp;
1532 
1533 		attrmp = ip_recv_attr_to_mblk(ira);
1534 		if (attrmp == NULL) {
1535 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1536 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1537 			freemsg(mp);
1538 		} else {
1539 			ASSERT(attrmp->b_cont == NULL);
1540 			attrmp->b_cont = mp;
1541 			mp = attrmp;
1542 			ill_refhold(ill);
1543 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1544 			    B_FALSE);
1545 		}
1546 	}
1547 }
1548 
1549 /*
1550  * Handle a discovered conflict: some other system is advertising that it owns
1551  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1552  * interface.
1553  *
1554  * Handles both IPv4 and IPv6
1555  */
1556 boolean_t
1557 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1558 {
1559 	ipif_t		*ipif;
1560 	clock_t		now;
1561 	uint_t		maxdefense;
1562 	uint_t		defs;
1563 	ill_t		*ill = ira->ira_ill;
1564 	ip_stack_t	*ipst = ill->ill_ipst;
1565 	uint32_t	elapsed;
1566 	boolean_t	isv6 = ill->ill_isv6;
1567 	ipaddr_t	ncec_addr;
1568 
1569 	if (isv6) {
1570 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1571 		    ipst);
1572 	} else {
1573 		if (arp_no_defense) {
1574 			/*
1575 			 * Yes, there is a conflict, but no, we do not
1576 			 * defend ourselves.
1577 			 */
1578 			return (B_TRUE);
1579 		}
1580 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1581 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1582 		    ipst);
1583 	}
1584 	if (ipif == NULL)
1585 		return (B_FALSE);
1586 
1587 	/*
1588 	 * First, figure out if this address is disposable.
1589 	 */
1590 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1591 		maxdefense = ipst->ips_ip_max_temp_defend;
1592 	else
1593 		maxdefense = ipst->ips_ip_max_defend;
1594 
1595 	/*
1596 	 * Now figure out how many times we've defended ourselves.  Ignore
1597 	 * defenses that happened long in the past.
1598 	 */
1599 	now = ddi_get_lbolt();
1600 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1601 	mutex_enter(&ncec->ncec_lock);
1602 	if ((defs = ncec->ncec_defense_count) > 0 &&
1603 	    elapsed > ipst->ips_ip_defend_interval) {
1604 		/*
1605 		 * ip_defend_interval has elapsed.
1606 		 * reset the defense count.
1607 		 */
1608 		ncec->ncec_defense_count = defs = 0;
1609 	}
1610 	ncec->ncec_defense_count++;
1611 	ncec->ncec_last_time_defended = now;
1612 	mutex_exit(&ncec->ncec_lock);
1613 	ipif_refrele(ipif);
1614 
1615 	/*
1616 	 * If we've defended ourselves too many times already, then give up and
1617 	 * tear down the interface(s) using this address.
1618 	 * Otherwise, caller has to defend by sending out an announce.
1619 	 */
1620 	if (defs >= maxdefense) {
1621 		if (isv6)
1622 			ndp_failure(mp, ira);
1623 		else
1624 			arp_failure(mp, ira);
1625 	} else {
1626 		return (B_TRUE); /* caller must defend this address */
1627 	}
1628 	return (B_FALSE);
1629 }
1630 
1631 /*
1632  * Handle reception of Neighbor Solicitation messages.
1633  */
1634 static void
1635 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1636 {
1637 	ill_t		*ill = ira->ira_ill, *under_ill;
1638 	nd_neighbor_solicit_t *ns;
1639 	uint32_t	hlen = ill->ill_phys_addr_length;
1640 	uchar_t		*haddr = NULL;
1641 	icmp6_t		*icmp_nd;
1642 	ip6_t		*ip6h;
1643 	ncec_t		*our_ncec = NULL;
1644 	in6_addr_t	target;
1645 	in6_addr_t	src;
1646 	int		len;
1647 	int		flag = 0;
1648 	nd_opt_hdr_t	*opt = NULL;
1649 	boolean_t	bad_solicit = B_FALSE;
1650 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1651 	boolean_t	need_ill_refrele = B_FALSE;
1652 
1653 	ip6h = (ip6_t *)mp->b_rptr;
1654 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1655 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1656 	src = ip6h->ip6_src;
1657 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1658 	target = ns->nd_ns_target;
1659 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1660 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1661 		if (ip_debug > 2) {
1662 			/* ip1dbg */
1663 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1664 			    AF_INET6, &target);
1665 		}
1666 		bad_solicit = B_TRUE;
1667 		goto done;
1668 	}
1669 	if (len > sizeof (nd_neighbor_solicit_t)) {
1670 		/* Options present */
1671 		opt = (nd_opt_hdr_t *)&ns[1];
1672 		len -= sizeof (nd_neighbor_solicit_t);
1673 		if (!ndp_verify_optlen(opt, len)) {
1674 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1675 			bad_solicit = B_TRUE;
1676 			goto done;
1677 		}
1678 	}
1679 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1680 		/* Check to see if this is a valid DAD solicitation */
1681 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1682 			if (ip_debug > 2) {
1683 				/* ip1dbg */
1684 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1685 				    "Destination is not solicited node "
1686 				    "multicast %s\n", AF_INET6,
1687 				    &ip6h->ip6_dst);
1688 			}
1689 			bad_solicit = B_TRUE;
1690 			goto done;
1691 		}
1692 	}
1693 
1694 	/*
1695 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1696 	 * received this packet if it's multicast) is not the ill tied to
1697 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1698 	 * to ensure we find the associated NCE.
1699 	 */
1700 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1701 	/*
1702 	 * If this is a valid Solicitation for an address we are publishing,
1703 	 * then a PUBLISH entry should exist in the cache
1704 	 */
1705 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1706 		ip1dbg(("ndp_input_solicit: Wrong target in NS?! "
1707 		    "ifname=%s ", ill->ill_name));
1708 		if (ip_debug > 2) {
1709 			/* ip1dbg */
1710 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1711 		}
1712 		if (our_ncec == NULL)
1713 			bad_solicit = B_TRUE;
1714 		goto done;
1715 	}
1716 
1717 	/* At this point we should have a verified NS per spec */
1718 	if (opt != NULL) {
1719 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1720 		if (opt != NULL) {
1721 			haddr = (uchar_t *)&opt[1];
1722 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1723 			    hlen == 0) {
1724 				ip1dbg(("ndp_input_solicit: bad SLLA\n"));
1725 				bad_solicit = B_TRUE;
1726 				goto done;
1727 			}
1728 		}
1729 	}
1730 
1731 	/* If sending directly to peer, set the unicast flag */
1732 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1733 		flag |= NDP_UNICAST;
1734 
1735 	/*
1736 	 * Create/update the entry for the soliciting node on the ipmp_ill,
1737 	 * or respond to outstanding queries; do neither if the source is
1738 	 * the unspecified address.
1739 	 */
1740 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1741 		int	err;
1742 		nce_t	*nnce;
1743 
1744 		ASSERT(ill->ill_isv6);
1745 		/*
1746 		 * Regular solicitations *must* include the Source Link-Layer
1747 		 * Address option.  Ignore messages that do not.
1748 		 */
1749 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1750 			ip1dbg(("ndp_input_solicit: source link-layer address "
1751 			    "option missing with a specified source.\n"));
1752 			bad_solicit = B_TRUE;
1753 			goto done;
1754 		}
1755 
1756 		/*
1757 		 * This is a regular solicitation.  If we're still in the
1758 		 * process of verifying the address, then don't respond at all
1759 		 * and don't keep track of the sender.
1760 		 */
1761 		if (our_ncec->ncec_state == ND_PROBE)
1762 			goto done;
1763 
1764 		/*
1765 		 * If the solicitation doesn't have sender hardware address
1766 		 * (legal for unicast solicitation), then process without
1767 		 * installing the return NCE.  Either we already know it, or
1768 		 * we'll be forced to look it up when (and if) we reply to the
1769 		 * packet.
1770 		 */
1771 		if (haddr == NULL)
1772 			goto no_source;
1773 
1774 		under_ill = ill;
1775 		if (IS_UNDER_IPMP(under_ill)) {
1776 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
1777 			if (ill == NULL)
1778 				ill = under_ill;
1779 			else
1780 				need_ill_refrele = B_TRUE;
1781 		}
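		/*
		 * Create or refresh the cache entry for the soliciting node,
		 * seeding it in ND_STALE state with the link-layer address
		 * taken from the SLLA option.
		 */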
1782 		err = nce_lookup_then_add_v6(ill,
1783 		    haddr, hlen,
1784 		    &src,	/* Soliciting nodes address */
1785 		    0,
1786 		    ND_STALE,
1787 		    &nnce);
1788 
1789 		if (need_ill_refrele) {
1790 			ill_refrele(ill);
1791 			ill = under_ill;
1792 			need_ill_refrele =  B_FALSE;
1793 		}
1794 		switch (err) {
1795 		case 0:
1796 			/* done with this entry */
1797 			nce_refrele(nnce);
1798 			break;
1799 		case EEXIST:
1800 			/*
1801 			 * B_FALSE indicates this is not an advertisement.
1802 			 */
1803 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1804 			nce_refrele(nnce);
1805 			break;
1806 		default:
1807 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1808 			    err));
1809 			goto done;
1810 		}
1811 no_source:
1812 		flag |= NDP_SOLICITED;
1813 	} else {
1814 		/*
1815 		 * No source link layer address option should be present in a
1816 		 * valid DAD request.
1817 		 */
1818 		if (haddr != NULL) {
1819 			ip1dbg(("ndp_input_solicit: source link-layer address "
1820 			    "option present with an unspecified source.\n"));
1821 			bad_solicit = B_TRUE;
1822 			goto done;
1823 		}
1824 		if (our_ncec->ncec_state == ND_PROBE) {
1825 			/*
1826 			 * Internally looped-back probes will have
1827 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1828 			 * transmissions.
1829 			 */
1830 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1831 				/*
1832 				 * If someone else is probing our address, then
1833 				 * we've crossed wires.  Declare failure.
1834 				 */
1835 				ndp_failure(mp, ira);
1836 			}
1837 			goto done;
1838 		}
1839 		/*
1840 		 * This is a DAD probe.  Multicast the advertisement to the
1841 		 * all-nodes address.
1842 		 */
1843 		src = ipv6_all_hosts_mcast;
1844 	}
1845 	flag |= nce_advert_flags(our_ncec);
1846 	(void) ndp_xmit(ill,
1847 	    ND_NEIGHBOR_ADVERT,
1848 	    our_ncec->ncec_lladdr,
1849 	    our_ncec->ncec_lladdr_length,
1850 	    &target,	/* Source and target of the advertisement pkt */
1851 	    &src,	/* IP Destination (source of original pkt) */
1852 	    flag);
1853 done:
1854 	if (bad_solicit)
1855 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1856 	if (our_ncec != NULL)
1857 		ncec_refrele(our_ncec);
1858 }
1859 
1860 /*
1861  * Handle reception of Neighbor Advertisement messages.
1862  */
1863 void
1864 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1865 {
1866 	ill_t		*ill = ira->ira_ill;
1867 	nd_neighbor_advert_t *na;
1868 	uint32_t	hlen = ill->ill_phys_addr_length;
1869 	uchar_t		*haddr = NULL;
1870 	icmp6_t		*icmp_nd;
1871 	ip6_t		*ip6h;
1872 	ncec_t		*dst_ncec = NULL;
1873 	in6_addr_t	target;
1874 	nd_opt_hdr_t	*opt = NULL;
1875 	int		len;
1876 	ip_stack_t	*ipst = ill->ill_ipst;
1877 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1878 
1879 	ip6h = (ip6_t *)mp->b_rptr;
1880 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1881 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1882 	na = (nd_neighbor_advert_t *)icmp_nd;
1883 
1884 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1885 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1886 		ip1dbg(("ndp_input_advert: Destination is multicast but the "
1887 		    "solicited flag is not zero\n"));
1888 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1889 		return;
1890 	}
1891 	target = na->nd_na_target;
1892 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1893 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1894 		if (ip_debug > 2) {
1895 			/* ip1dbg */
1896 			pr_addr_dbg("ndp_input_advert: Martian Target %s\n",
1897 			    AF_INET6, &target);
1898 		}
1899 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1900 		return;
1901 	}
1902 	if (len > sizeof (nd_neighbor_advert_t)) {
1903 		opt = (nd_opt_hdr_t *)&na[1];
1904 		if (!ndp_verify_optlen(opt,
1905 		    len - sizeof (nd_neighbor_advert_t))) {
1906 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1907 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1908 			return;
1909 		}
1910 		/* At this point we have a verified NA per spec */
1911 		len -= sizeof (nd_neighbor_advert_t);
1912 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1913 		if (opt != NULL) {
1914 			haddr = (uchar_t *)&opt[1];
1915 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1916 			    hlen == 0) {
1917 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1918 				BUMP_MIB(mib,
1919 				    ipv6IfIcmpInBadNeighborAdvertisements);
1920 				return;
1921 			}
1922 		}
1923 	}
1924 
1925 	/*
1926 	 * NOTE: we match across the illgrp since we need to do DAD for all of
1927 	 * our local addresses, and those are spread across all the active
1928 	 * ills in the group.
1929 	 */
1930 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1931 		return;
1932 
1933 	if (NCE_PUBLISH(dst_ncec)) {
1934 		/*
1935 		 * Someone just advertised an address that we publish. First,
1936 		 * check if it was us -- if so, we can safely ignore it.
1937 		 * We don't get the haddr from the ira_l2src because, in the
1938 		 * case that the packet originated from us on an IPMP group,
1939 		 * the ira_l2src may be the link-layer address of the
1940 		 * cast_ill used to send the packet, which may not be the same
1941 		 * as the dst_ncec->ncec_lladdr of the address.
1942 		 */
1943 		if (haddr != NULL) {
1944 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1945 				goto out;
1946 
1947 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1948 				goto out;   /* from us -- no conflict */
1949 
1950 			/*
1951 			 * If we're in an IPMP group, check if this is an echo
1952 			 * from another ill in the group.  Use the double-
1953 			 * checked locking pattern to avoid grabbing
1954 			 * ill_g_lock in the non-IPMP case.
1955 			 */
1956 			if (IS_UNDER_IPMP(ill)) {
1957 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1958 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1959 				    ill->ill_grp, haddr, hlen) != NULL) {
1960 					rw_exit(&ipst->ips_ill_g_lock);
1961 					goto out;
1962 				}
1963 				rw_exit(&ipst->ips_ill_g_lock);
1964 			}
1965 		}
1966 
1967 		/*
1968 		 * This appears to be a real conflict.  If we're trying to
1969 		 * configure this NCE (ND_PROBE), then shut it down.
1970 		 * Otherwise, handle the discovered conflict.
1971 		 */
1972 		if (dst_ncec->ncec_state == ND_PROBE) {
1973 			ndp_failure(mp, ira);
1974 		} else {
1975 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
1976 				char hbuf[MAC_STR_LEN];
1977 				char sbuf[INET6_ADDRSTRLEN];
1978 
1979 				cmn_err(CE_WARN,
1980 				    "node '%s' is using %s on %s",
1981 				    inet_ntop(AF_INET6, &target, sbuf,
1982 				    sizeof (sbuf)),
1983 				    haddr == NULL ? "<none>" :
1984 				    mac_colon_addr(haddr, hlen, hbuf,
1985 				    sizeof (hbuf)), ill->ill_name);
1986 				/*
1987 				 * RFC 4862, Section 5.4.4 does not mandate
1988 				 * any specific behavior when an NA matches
1989 				 * a non-tentative address assigned to the
1990 				 * receiver. We make the choice of defending
1991 				 * our address, based on the assumption that
1992 				 * the sender has not detected the Duplicate.
1993 				 *
1994 				 * ncec_last_time_defended has been adjusted
1995 				 * in ip_nce_conflict()
1996 				 */
1997 				(void) ndp_announce(dst_ncec);
1998 			}
1999 		}
2000 	} else {
2001 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2002 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2003 
2004 		/* B_TRUE indicates this is an advertisement */
2005 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2006 	}
2007 out:
2008 	ncec_refrele(dst_ncec);
2009 }
2010 
2011 /*
2012  * Process NDP neighbor solicitation/advertisement messages.
2013  * The checksum has already been verified before reaching here.
2014  * Information about the datalink header is contained in ira_l2src, but
2015  * that should be ignored for loopback packets.
2016  */
2017 void
2018 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2019 {
2020 	ill_t		*ill = ira->ira_rill;
2021 	icmp6_t		*icmp_nd;
2022 	ip6_t		*ip6h;
2023 	int		len;
2024 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2025 	ill_t		*orig_ill = NULL;
2026 
2027 	/*
2028 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2029 	 * and make it be the IPMP upper to avoid being confused by a packet
2030 	 * addressed to a unicast address on a different ill.
2031 	 */
2032 	if (IS_UNDER_IPMP(ill)) {
2033 		orig_ill = ill;
2034 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2035 		if (ill == NULL) {
2036 			ill = orig_ill;
2037 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2038 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2039 			    mp, ill);
2040 			freemsg(mp);
2041 			return;
2042 		}
2043 		ASSERT(ill != orig_ill);
2044 		orig_ill = ira->ira_ill;
2045 		ira->ira_ill = ill;
2046 		mib = ill->ill_icmp6_mib;
2047 	}
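	/*
	 * Pull the whole packet into a single mblk so that the IPv6 and
	 * ICMPv6 ND headers examined below are contiguous in memory.
	 */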
2048 	if (!pullupmsg(mp, -1)) {
2049 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2050 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2051 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2052 		goto done;
2053 	}
2054 	ip6h = (ip6_t *)mp->b_rptr;
2055 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2056 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2057 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2058 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2059 		goto done;
2060 	}
2061 	/*
2062 	 * NDP does not accept any extension headers between the
2063 	 * IP header and the ICMP header since e.g. a routing
2064 	 * header could be dangerous.
2065 	 * This assumes that any AH or ESP headers are removed
2066 	 * by ip prior to passing the packet to ndp_input.
2067 	 */
2068 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2069 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2070 		    ip6h->ip6_nxt));
2071 		ip_drop_input("Wrong next header", mp, ill);
2072 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2073 		goto done;
2074 	}
2075 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2076 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2077 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2078 	if (icmp_nd->icmp6_code != 0) {
2079 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2080 		ip_drop_input("code non-zero", mp, ill);
2081 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2082 		goto done;
2083 	}
2084 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2085 	/*
2086 	 * Make sure packet length is large enough for either
2087 	 * a NS or a NA icmp packet.
2088 	 */
2089 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2090 		ip1dbg(("ndp_input: packet too short\n"));
2091 		ip_drop_input("packet too short", mp, ill);
2092 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2093 		goto done;
2094 	}
2095 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2096 		ndp_input_solicit(mp, ira);
2097 	} else {
2098 		ndp_input_advert(mp, ira);
2099 	}
2100 done:
2101 	freemsg(mp);
2102 	if (orig_ill != NULL) {
2103 		ill_refrele(ill);
2104 		ira->ira_ill = orig_ill;
2105 	}
2106 }
2107 
2108 /*
2109  * ndp_xmit is called to form and transmit an ND solicitation or
2110  * advertisement ICMP packet.
2111  *
2112  * If the source address is unspecified and this isn't a probe (used for
2113  * duplicate address detection), an appropriate source address and link layer
2114  * address will be chosen here.  The link layer address option is included if
2115  * the source is specified (i.e., all non-probe packets), and omitted (per the
2116  * specification) otherwise.
2117  *
2118  * It returns B_TRUE if the message could not be allocated and was thus
2119  * dropped; otherwise it transmits the packet and returns B_FALSE.
2120  */
2121 static boolean_t
2122 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2123     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2124 {
2125 	uint32_t	len;
2126 	icmp6_t 	*icmp6;
2127 	mblk_t		*mp;
2128 	ip6_t		*ip6h;
2129 	nd_opt_hdr_t	*opt;
2130 	uint_t		plen;
2131 	zoneid_t	zoneid = GLOBAL_ZONEID;
2132 	ill_t		*hwaddr_ill = ill;
2133 	ip_xmit_attr_t	ixas;
2134 	ip_stack_t	*ipst = ill->ill_ipst;
2135 	boolean_t	need_refrele = B_FALSE;
2136 	boolean_t	probe = B_FALSE;
2137 
2138 	if (IS_UNDER_IPMP(ill)) {
2139 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2140 		/*
2141 		 * We send non-probe packets on the upper IPMP interface.
2142 		 * ip_output_simple() will use cast_ill for sending any
2143 		 * multicast packets. Note that we can't follow the same
2144 		 * logic for probe packets because all interfaces in the ipmp
2145 		 * group may have failed, so that we really want to only try
2146 		 * to send the ND packet on the ill corresponding to the src
2147 		 * address.
2148 		 */
2149 		if (!probe) {
2150 			ill = ipmp_ill_hold_ipmp_ill(ill);
2151 			if (ill != NULL)
2152 				need_refrele = B_TRUE;
2153 			else
2154 				ill = hwaddr_ill;
2155 		}
2156 	}
2157 
2158 	/*
2159 	 * If we have an unspecified source (sender) address, select a
2160 	 * proper source address for the solicitation here so that we
2161 	 * can initialize the h/w address correctly.
2162 	 *
2163 	 * If the sender is specified then we use this address in order
2164 	 * to look up the zoneid before calling ip_output_simple(). This is to
2165 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2166 	 * by IP (we cannot guarantee that the global zone has an interface
2167 	 * route to the destination).
2168 	 *
2169 	 * Note that the NA never comes here with the unspecified source
2170 	 * address.
2171 	 */
2172 
2173 	/*
2174 	 * Probes will have unspec src at this point.
2175 	 */
2176 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2177 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2178 		/*
2179 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2180 		 * ALL_ZONES if it cannot find a matching ipif for the address
2181 		 * we are trying to use. In this case we err on the side of
2182 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2183 		 */
2184 		if (zoneid == ALL_ZONES)
2185 			zoneid = GLOBAL_ZONEID;
2186 	}
2187 
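	/*
	 * Size the message for the IPv6 header, the fixed-size ND message,
	 * and a link-layer address option rounded up to a multiple of
	 * eight bytes; the option space is stripped below if it ends up
	 * unused.
	 */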
2188 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2189 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2190 	mp = allocb(len,  BPRI_LO);
2191 	if (mp == NULL) {
2192 		if (need_refrele)
2193 			ill_refrele(ill);
2194 		return (B_TRUE);
2195 	}
2196 
2197 	bzero((char *)mp->b_rptr, len);
2198 	mp->b_wptr = mp->b_rptr + len;
2199 
2200 	bzero(&ixas, sizeof (ixas));
2201 	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2202 
2203 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2204 	ixas.ixa_ipst = ipst;
2205 	ixas.ixa_cred = kcred;
2206 	ixas.ixa_cpid = NOPID;
2207 	ixas.ixa_tsl = NULL;
2208 	ixas.ixa_zoneid = zoneid;
2209 
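	/*
	 * Build the IPv6 header.  ND packets are sent with a hop limit of
	 * IPV6_MAX_HOPS (255) so that receivers can verify that they were
	 * not forwarded.
	 */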
2210 	ip6h = (ip6_t *)mp->b_rptr;
2211 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2212 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2213 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2214 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2215 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2216 	ip6h->ip6_dst = *target;
2217 	icmp6 = (icmp6_t *)&ip6h[1];
2218 
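	/*
	 * Point opt at the option space that follows the fixed-size ND
	 * message (NS and NA have the same size); it is filled in below
	 * when a link-layer address option is included.
	 */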
2219 	if (hw_addr_len != 0) {
2220 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2221 		    sizeof (nd_neighbor_advert_t));
2222 	} else {
2223 		opt = NULL;
2224 	}
2225 	if (operation == ND_NEIGHBOR_SOLICIT) {
2226 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2227 
2228 		if (opt != NULL && !(flag & NDP_PROBE)) {
2229 			/*
2230 			 * Note that per RFC 4862 we don't send out an SLLA
2231 			 * option for ND probes.  By contrast, IPv4 DAD probes
2232 			 * do carry the source hardware address, even though
2233 			 * both IPv4 and IPv6 probes go out with the
2234 			 * unspecified/INADDR_ANY source IP address.
2235 			 */
2236 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2237 		}
2238 		ip6h->ip6_src = *sender;
2239 		ns->nd_ns_target = *target;
2240 		if (!(flag & NDP_UNICAST)) {
2241 			/* Form multicast address of the target */
2242 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2243 			ip6h->ip6_dst.s6_addr32[3] |=
2244 			    ns->nd_ns_target.s6_addr32[3];
2245 		}
2246 	} else {
2247 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2248 
2249 		ASSERT(!(flag & NDP_PROBE));
2250 		if (opt != NULL)
2251 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2252 		ip6h->ip6_src = *sender;
2253 		na->nd_na_target = *sender;
2254 		if (flag & NDP_ISROUTER)
2255 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2256 		if (flag & NDP_SOLICITED)
2257 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2258 		if (flag & NDP_ORIDE)
2259 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2260 	}
2261 
2262 	if (!(flag & NDP_PROBE)) {
2263 		if (hw_addr != NULL && opt != NULL) {
2264 			/* Fill in link layer address and option len */
2265 			opt->nd_opt_len = (uint8_t)plen;
2266 			bcopy(hw_addr, &opt[1], hw_addr_len);
2267 		}
2268 	}
2269 	if (opt != NULL && opt->nd_opt_type == 0) {
2270 		/* If there's no link layer address option, then strip it. */
2271 		len -= plen * 8;
2272 		mp->b_wptr = mp->b_rptr + len;
2273 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2274 	}
2275 
2276 	icmp6->icmp6_type = (uint8_t)operation;
2277 	icmp6->icmp6_code = 0;
2278 	/*
2279 	 * Prepare for checksum by putting icmp length in the icmp
2280 	 * checksum field. The checksum is calculated in ip_output.c.
2281 	 */
2282 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2283 
2284 	(void) ip_output_simple(mp, &ixas);
2285 	ixa_cleanup(&ixas);
2286 	if (need_refrele)
2287 		ill_refrele(ill);
2288 	return (B_FALSE);
2289 }
2290 
2291 /*
2292  * Used to set ND_UNREACHABLE before ncec_delete sets NCE_F_CONDEMNED.
2293  * The datapath uses this as an indication that there
2294  * is a problem (as opposed to an NCE that was just
2295  * reclaimed due to lack of memory).
2296  * Note that static ARP entries never become unreachable.
2297  */
2298 void
2299 nce_make_unreachable(ncec_t *ncec)
2300 {
2301 	mutex_enter(&ncec->ncec_lock);
2302 	ncec->ncec_state = ND_UNREACHABLE;
2303 	mutex_exit(&ncec->ncec_lock);
2304 }
2305 
2306 /*
2307  * NCE retransmit timer. Common to IPv4 and IPv6.
2308  * This timer goes off when:
2309  * a. It is time to retransmit a resolution for resolver.
2310  * b. It is time to send reachability probes.
2311  */
2312 void
2313 nce_timer(void *arg)
2314 {
2315 	ncec_t		*ncec = arg;
2316 	ill_t		*ill = ncec->ncec_ill, *src_ill;
2317 	char		addrbuf[INET6_ADDRSTRLEN];
2318 	boolean_t	dropped = B_FALSE;
2319 	ip_stack_t	*ipst = ncec->ncec_ipst;
2320 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2321 	in_addr_t	sender4 = INADDR_ANY;
2322 	in6_addr_t	sender6 = ipv6_all_zeros;
2323 
2324 	/*
2325 	 * The timer has to be cancelled by ncec_delete before doing the final
2326 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2327 	 * until it clears the timeout_id. Before clearing the timeout_id
2328 	 * bump up the refcnt so that we can continue to use the ncec
2329 	 */
2330 	ASSERT(ncec != NULL);
2331 	mutex_enter(&ncec->ncec_lock);
2332 	ncec_refhold_locked(ncec);
2333 	ncec->ncec_timeout_id = 0;
2334 	mutex_exit(&ncec->ncec_lock);
2335 
2336 	src_ill = nce_resolve_src(ncec, &sender6);
2337 	/* if we could not find a sender address, return */
2338 	if (src_ill == NULL) {
2339 		if (!isv6) {
2340 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2341 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2342 			    &sender4, addrbuf, sizeof (addrbuf))));
2343 		} else {
2344 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2345 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2346 		}
2347 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2348 		ncec_refrele(ncec);
2349 		return;
2350 	}
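	/* For ARP, extract the IPv4 sender address from its v4-mapped form. */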
2351 	if (!isv6)
2352 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2353 
2354 	mutex_enter(&ncec->ncec_lock);
2355 	/*
2356 	 * Check the reachability state.
2357 	 */
2358 	switch (ncec->ncec_state) {
2359 	case ND_DELAY:
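		/*
		 * The DELAY period expired without reachability confirmation;
		 * switch to PROBE state and begin sending unicast probes.
		 */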
2360 		ASSERT(ncec->ncec_lladdr != NULL);
2361 		ncec->ncec_state = ND_PROBE;
2362 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2363 		if (isv6) {
2364 			mutex_exit(&ncec->ncec_lock);
2365 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2366 			    src_ill->ill_phys_addr,
2367 			    src_ill->ill_phys_addr_length,
2368 			    &sender6, &ncec->ncec_addr,
2369 			    NDP_UNICAST);
2370 		} else {
2371 			dropped = arp_request(ncec, sender4, src_ill);
2372 			mutex_exit(&ncec->ncec_lock);
2373 		}
2374 		if (!dropped) {
2375 			mutex_enter(&ncec->ncec_lock);
2376 			ncec->ncec_pcnt--;
2377 			mutex_exit(&ncec->ncec_lock);
2378 		}
2379 		if (ip_debug > 3) {
2380 			/* ip2dbg */
2381 			pr_addr_dbg("nce_timer: state for %s changed "
2382 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2383 		}
2384 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2385 		break;
2386 	case ND_PROBE:
2387 		/* must be retransmit timer */
2388 		ASSERT(ncec->ncec_pcnt >= -1);
2389 		if (ncec->ncec_pcnt > 0) {
2390 			/*
2391 			 * As per RFC2461, the ncec gets deleted after
2392 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2393 			 * Note that the first unicast solicitation is sent
2394 			 * during the DELAY state.
2395 			 */
2396 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2397 			    ncec->ncec_pcnt,
2398 			    inet_ntop((isv6? AF_INET6 : AF_INET),
2399 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2400 			if (NCE_PUBLISH(ncec)) {
2401 				mutex_exit(&ncec->ncec_lock);
2402 				/*
2403 				 * send out a probe; note that src_ill
2404 				 * is ignored by nce_dad() for all
2405 				 * DAD message types other than IPv6
2406 				 * unicast probes
2407 				 */
2408 				nce_dad(ncec, src_ill, B_TRUE);
2409 			} else {
2410 				ASSERT(src_ill != NULL);
2411 				if (isv6) {
2412 					mutex_exit(&ncec->ncec_lock);
2413 					dropped = ndp_xmit(src_ill,
2414 					    ND_NEIGHBOR_SOLICIT,
2415 					    src_ill->ill_phys_addr,
2416 					    src_ill->ill_phys_addr_length,
2417 					    &sender6, &ncec->ncec_addr,
2418 					    NDP_UNICAST);
2419 				} else {
2420 					/*
2421 					 * since the nce is REACHABLE,
2422 					 * the ARP request will be sent out
2423 					 * as a link-layer unicast.
2424 					 */
2425 					dropped = arp_request(ncec, sender4,
2426 					    src_ill);
2427 					mutex_exit(&ncec->ncec_lock);
2428 				}
2429 				if (!dropped) {
2430 					mutex_enter(&ncec->ncec_lock);
2431 					ncec->ncec_pcnt--;
2432 					mutex_exit(&ncec->ncec_lock);
2433 				}
2434 				nce_restart_timer(ncec,
2435 				    ill->ill_reachable_retrans_time);
2436 			}
2437 		} else if (ncec->ncec_pcnt < 0) {
2438 			/* No hope, delete the ncec */
2439 			/* Tell datapath it went bad */
2440 			ncec->ncec_state = ND_UNREACHABLE;
2441 			mutex_exit(&ncec->ncec_lock);
2442 			if (ip_debug > 2) {
2443 				/* ip1dbg */
2444 				pr_addr_dbg("nce_timer: Delete NCE for"
2445 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
2446 				    &ncec->ncec_addr);
2447 			}
2448 			/* Static ARP entries cannot be deleted. */
2449 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2450 				ncec_delete(ncec);
2451 
2452 		} else if (!NCE_PUBLISH(ncec)) {
2453 			/*
2454 			 * Probe count is 0 for a dynamic entry (one that we
2455 			 * ourselves are not publishing). We should never get
2456 			 * here if NONUD was requested, hence the ASSERT below.
2457 			 */
2458 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2459 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2460 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
2461 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2462 			ncec->ncec_pcnt--;
2463 			mutex_exit(&ncec->ncec_lock);
2464 			/* Wait one interval before killing */
2465 			nce_restart_timer(ncec,
2466 			    ill->ill_reachable_retrans_time);
2467 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2468 			ipif_t *ipif;
2469 			ipaddr_t ncec_addr;
2470 
2471 			/*
2472 			 * We're done probing, and we can now declare this
2473 			 * address to be usable.  Let IP know that it's ok to
2474 			 * use.
2475 			 */
2476 			ncec->ncec_state = ND_REACHABLE;
2477 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2478 			mutex_exit(&ncec->ncec_lock);
2479 			if (isv6) {
2480 				ipif = ipif_lookup_addr_exact_v6(
2481 				    &ncec->ncec_addr, ill, ipst);
2482 			} else {
2483 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2484 				    ncec_addr);
2485 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2486 				    ipst);
2487 			}
2488 			if (ipif != NULL) {
2489 				if (ipif->ipif_was_dup) {
2490 					char ibuf[LIFNAMSIZ];
2491 					char sbuf[INET6_ADDRSTRLEN];
2492 
2493 					ipif->ipif_was_dup = B_FALSE;
2494 					(void) inet_ntop(AF_INET6,
2495 					    &ipif->ipif_v6lcl_addr,
2496 					    sbuf, sizeof (sbuf));
2497 					ipif_get_name(ipif, ibuf,
2498 					    sizeof (ibuf));
2499 					cmn_err(CE_NOTE, "recovered address "
2500 					    "%s on %s", sbuf, ibuf);
2501 				}
2502 				if ((ipif->ipif_flags & IPIF_UP) &&
2503 				    !ipif->ipif_addr_ready)
2504 					ipif_up_notify(ipif);
2505 				ipif->ipif_addr_ready = 1;
2506 				ipif_refrele(ipif);
2507 			}
2508 			if (!isv6 && arp_no_defense)
2509 				break;
2510 			/* Begin defending our new address */
2511 			if (ncec->ncec_unsolicit_count > 0) {
2512 				ncec->ncec_unsolicit_count--;
2513 				if (isv6) {
2514 					dropped = ndp_announce(ncec);
2515 				} else {
2516 					dropped = arp_announce(ncec);
2517 				}
2518 
2519 				if (dropped)
2520 					ncec->ncec_unsolicit_count++;
2521 				else
2522 					ncec->ncec_last_time_defended =
2523 					    ddi_get_lbolt();
2524 			}
2525 			if (ncec->ncec_unsolicit_count > 0) {
2526 				nce_restart_timer(ncec,
2527 				    ANNOUNCE_INTERVAL(isv6));
2528 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
2529 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2530 			}
2531 		} else {
2532 			/*
2533 			 * This is an address we're probing to be our own, but
2534 			 * the ill is down.  Wait until it comes back before
2535 			 * doing anything, but switch to reachable state so
2536 			 * that the restart will work.
2537 			 */
2538 			ncec->ncec_state = ND_REACHABLE;
2539 			mutex_exit(&ncec->ncec_lock);
2540 		}
2541 		break;
2542 	case ND_INCOMPLETE: {
2543 		mblk_t	*mp, *nextmp;
2544 		mblk_t	**prevmpp;
2545 
2546 		/*
2547 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2548 		 * for any IPMP probe packets, and toss them.  IPMP probe
2549 		 * packets will always be at the head of ncec_qd_mp, so that
2550 		 * we can stop at the first queued ND packet that is
2551 		 * not a probe packet.
2552 		 */
2553 		prevmpp = &ncec->ncec_qd_mp;
2554 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2555 			nextmp = mp->b_next;
2556 
2557 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2558 				inet_freemsg(mp);
2559 				ncec->ncec_nprobes--;
2560 				*prevmpp = nextmp;
2561 			} else {
2562 				prevmpp = &mp->b_next;
2563 			}
2564 		}
2565 
2566 		/*
2567 		 * Must be resolver's retransmit timer.
2568 		 */
2569 		mutex_exit(&ncec->ncec_lock);
2570 		ip_ndp_resolve(ncec);
2571 		break;
2572 	}
2573 	case ND_REACHABLE:
2574 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2575 		    ncec->ncec_unsolicit_count != 0) ||
2576 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2577 			if (ncec->ncec_unsolicit_count > 0) {
2578 				ncec->ncec_unsolicit_count--;
2579 				mutex_exit(&ncec->ncec_lock);
2580 				/*
2581 				 * When we get to zero announcements left,
2582 				 * switch to address defense
2583 				 */
2584 			} else {
2585 				boolean_t rate_limit;
2586 
2587 				mutex_exit(&ncec->ncec_lock);
2588 				rate_limit = ill_defend_rate_limit(ill, ncec);
2589 				if (rate_limit) {
2590 					nce_restart_timer(ncec,
2591 					    DEFENSE_INTERVAL(isv6));
2592 					break;
2593 				}
2594 			}
2595 			if (isv6) {
2596 				dropped = ndp_announce(ncec);
2597 			} else {
2598 				dropped = arp_announce(ncec);
2599 			}
2600 			mutex_enter(&ncec->ncec_lock);
2601 			if (dropped) {
2602 				ncec->ncec_unsolicit_count++;
2603 			} else {
2604 				ncec->ncec_last_time_defended =
2605 				    ddi_get_lbolt();
2606 			}
2607 			mutex_exit(&ncec->ncec_lock);
2608 			if (ncec->ncec_unsolicit_count != 0) {
2609 				nce_restart_timer(ncec,
2610 				    ANNOUNCE_INTERVAL(isv6));
2611 			} else {
2612 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2613 			}
2614 		} else {
2615 			mutex_exit(&ncec->ncec_lock);
2616 		}
2617 		break;
2618 	default:
2619 		mutex_exit(&ncec->ncec_lock);
2620 		break;
2621 	}
2622 done:
2623 	ncec_refrele(ncec);
2624 	ill_refrele(src_ill);
2625 }
2626 
2627 /*
2628  * Set a link layer address from the ll_addr passed in.
2629  * Copy SAP from ill.
2630  */
2631 static void
2632 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2633 {
2634 	ill_t	*ill = ncec->ncec_ill;
2635 
2636 	ASSERT(ll_addr != NULL);
2637 	if (ill->ill_phys_addr_length > 0) {
2638 		/*
2639 		 * The bcopy() below used to be called for the physical address
2640 		 * length rather than the link layer address length. For
2641 		 * ethernet and many other media, the phys_addr and lla are
2642 		 * identical.
2643 		 *
2644 		 * The phys_addr and lla may not be the same for devices that
2645 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2646 		 * no known instances of these.
2647 		 *
2648 		 * For PPP or other interfaces with a zero-length physical
2649 		 * address, don't do anything here: the bcopy() has always
2650 		 * been a no-op for such interfaces, and switching them to
2651 		 * use the lla would change the way they operate.
2652 		 * Doing nothing in such cases preserves the expected
2653 		 * behavior.
2654 		 */
2655 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2656 	}
2657 }
2658 
2659 boolean_t
2660 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2661     uint32_t ll_addr_len)
2662 {
2663 	ASSERT(ncec->ncec_lladdr != NULL);
2664 	if (ll_addr == NULL)
2665 		return (B_FALSE);
2666 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2667 		return (B_TRUE);
2668 	return (B_FALSE);
2669 }
2670 
2671 /*
2672  * Updates the link layer address or the reachability state of
2673  * a cache entry.  Reset probe counter if needed.
2674  */
2675 void
2676 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2677 {
2678 	ill_t	*ill = ncec->ncec_ill;
2679 	boolean_t need_stop_timer = B_FALSE;
2680 	boolean_t need_fastpath_update = B_FALSE;
2681 	nce_t	*nce = NULL;
2682 	timeout_id_t tid;
2683 
2684 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2685 	/*
2686 	 * If this interface does not do NUD, there is no point
2687 	 * in allowing an update to the cache entry (although we
2688 	 * will still respond to NS).
2689 	 * The only time we accept an update for a resolver when
2690 	 * NUD is turned off is when it has just been created.
2691 	 * Non-Resolvers will always be created as REACHABLE.
2692 	 */
2693 	if (new_state != ND_UNCHANGED) {
2694 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2695 		    (ncec->ncec_state != ND_INCOMPLETE))
2696 			return;
2697 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2698 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2699 		need_stop_timer = B_TRUE;
2700 		if (new_state == ND_REACHABLE)
2701 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2702 		else {
2703 			/* We force NUD in this case */
2704 			ncec->ncec_last = 0;
2705 		}
2706 		ncec->ncec_state = new_state;
2707 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2708 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2709 		    new_state == ND_INCOMPLETE);
2710 	}
2711 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2712 		tid = ncec->ncec_timeout_id;
2713 		ncec->ncec_timeout_id = 0;
2714 	}
2715 	/*
2716 	 * Re-trigger the fastpath probe and overwrite the
2717 	 * DL_UNITDATA_REQ data, noting that we may lose whatever
2718 	 * packet happens to be in transmission at the time.
2719 	 */
2720 	if (new_ll_addr != NULL) {
2721 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2722 		    ill->ill_phys_addr_length);
2723 		need_fastpath_update = B_TRUE;
2724 	}
2725 	mutex_exit(&ncec->ncec_lock);
2726 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2727 		if (tid != 0)
2728 			(void) untimeout(tid);
2729 	}
2730 	if (need_fastpath_update) {
2731 		/*
2732 		 * Delete any existing dlur_mp and fp_mp information.
2733 		 * For IPMP interfaces, all underlying ill's must be checked
2734 		 * and purged.
2735 		 */
2736 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2737 		/*
2738 		 * add the new dlur_mp and fp_mp
2739 		 */
2740 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2741 		if (nce != NULL)
2742 			nce_refrele(nce);
2743 	}
2744 	mutex_enter(&ncec->ncec_lock);
2745 }
2746 
2747 static void
2748 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2749 {
2750 	uint_t	count = 0;
2751 	mblk_t  **mpp, *tmp;
2752 
2753 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2754 
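	/*
	 * Walk to the tail of the queue; if the queue has grown past
	 * ill_max_buf entries, drop the oldest packet at the head to make
	 * room.
	 */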
2755 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2756 		if (++count > ncec->ncec_ill->ill_max_buf) {
2757 			tmp = ncec->ncec_qd_mp->b_next;
2758 			ncec->ncec_qd_mp->b_next = NULL;
2759 			/*
2760 			 * if we never create data addrs on the under_ill
2761 			 * does this matter?
2762 			 */
2763 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2764 			    ipIfStatsOutDiscards);
2765 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2766 			    ncec->ncec_ill);
2767 			freemsg(ncec->ncec_qd_mp);
2768 			ncec->ncec_qd_mp = tmp;
2769 		}
2770 	}
2771 
2772 	if (head_insert) {
2773 		ncec->ncec_nprobes++;
2774 		mp->b_next = ncec->ncec_qd_mp;
2775 		ncec->ncec_qd_mp = mp;
2776 	} else {
2777 		*mpp = mp;
2778 	}
2779 }
2780 
2781 /*
2782  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2783  * queued at the head or tail of the queue based on the input argument
2784  * 'head_insert'. The caller should specify this argument as B_TRUE if this
2785  * packet is an IPMP probe packet, in which case the following happens:
2786  *
2787  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
2788  *	(non-ipmp_probe) load-spreading case where the source address of the ND
2789  *	packet is not tied to ncec_ill. If the ill bound to the source address
2790  *	cannot receive, the response to the ND packet will not be received.
2791  *	However, if ND packets for ncec_ill's probes are queued behind that ND
2792  *	packet, those probes will also fail to be sent, and thus in.mpathd will
2793  *	erroneously conclude that ncec_ill has also failed.
2794  *
2795  *   2. Drop the ipmp_probe packet in nce_timer() if the ND did not succeed on
2796  *	the first attempt.  This ensures that ND problems do not manifest as
2797  *	probe RTT spikes.
2798  *
2799  * We achieve this by inserting ipmp_probe() packets at the head of the
2800  * nce_queue.
2801  *
2802  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2803  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2804  */
2805 void
2806 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2807 {
2808 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2809 	nce_queue_mp_common(ncec, mp, head_insert);
2810 }
2811 
2812 /*
2813  * Called when address resolution fails due to a timeout.
2814  * Send an ICMP unreachable in response to all queued packets.
2815  */
2816 void
2817 ndp_resolv_failed(ncec_t *ncec)
2818 {
2819 	mblk_t	*mp, *nxt_mp;
2820 	char	buf[INET6_ADDRSTRLEN];
2821 	ill_t *ill = ncec->ncec_ill;
2822 	ip_recv_attr_t	iras;
2823 
2824 	bzero(&iras, sizeof (iras));
2825 	iras.ira_flags = 0;
2826 	/*
2827 	 * we are setting the ira_rill to the ipmp_ill (instead of
2828 	 * the actual ill on which the packet was received), but this
2829 	 * is ok because we don't actually need the real ira_rill
2830 	 * to send the icmp unreachable to the sender.
2831 	 */
2832 	iras.ira_ill = iras.ira_rill = ill;
2833 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2834 	iras.ira_rifindex = iras.ira_ruifindex;
2835 
2836 	ip1dbg(("ndp_resolv_failed: dst %s\n",
2837 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2838 	mutex_enter(&ncec->ncec_lock);
2839 	mp = ncec->ncec_qd_mp;
2840 	ncec->ncec_qd_mp = NULL;
2841 	ncec->ncec_nprobes = 0;
2842 	mutex_exit(&ncec->ncec_lock);
2843 	while (mp != NULL) {
2844 		nxt_mp = mp->b_next;
2845 		mp->b_next = NULL;
2846 
2847 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2848 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2849 		    mp, ill);
2850 		icmp_unreachable_v6(mp,
2851 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2852 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2853 		mp = nxt_mp;
2854 	}
2855 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2856 }
2857 
2858 /*
2859  * Handle the completion of NDP and ARP resolution.
2860  */
2861 void
2862 nce_resolv_ok(ncec_t *ncec)
2863 {
2864 	mblk_t *mp;
2865 	uint_t pkt_len;
2866 	iaflags_t ixaflags = IXAF_NO_TRACE;
2867 	nce_t *nce;
2868 	ill_t	*ill = ncec->ncec_ill;
2869 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2870 	ip_stack_t *ipst = ill->ill_ipst;
2871 
2872 	if (IS_IPMP(ncec->ncec_ill)) {
2873 		nce_resolv_ipmp_ok(ncec);
2874 		return;
2875 	}
2876 	/* non IPMP case */
2877 
2878 	mutex_enter(&ncec->ncec_lock);
2879 	ASSERT(ncec->ncec_nprobes == 0);
2880 	mp = ncec->ncec_qd_mp;
2881 	ncec->ncec_qd_mp = NULL;
2882 	mutex_exit(&ncec->ncec_lock);
2883 
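	/*
	 * Drain the packets that were queued while resolution was in
	 * progress, transmitting each one now that the link-layer address
	 * is known.
	 */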
2884 	while (mp != NULL) {
2885 		mblk_t *nxt_mp;
2886 
2887 		if (ill->ill_isv6) {
2888 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2889 
2890 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2891 		} else {
2892 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
2893 
2894 			ixaflags |= IXAF_IS_IPV4;
2895 			pkt_len = ntohs(ipha->ipha_length);
2896 		}
2897 		nxt_mp = mp->b_next;
2898 		mp->b_next = NULL;
2899 		/*
2900 		 * The IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2901 		 * longer available, but it's ok to drop this flag because TCP
2902 		 * performs its own flow control, so TCP packets are unlikely
2903 		 * to get here while device flow control is in effect.
2904 		 */
2905 		mutex_enter(&ill->ill_lock);
2906 		nce = nce_lookup(ill, &ncec->ncec_addr);
2907 		mutex_exit(&ill->ill_lock);
2908 
2909 		if (nce == NULL) {
2910 			if (isv6) {
2911 				BUMP_MIB(&ipst->ips_ip6_mib,
2912 				    ipIfStatsOutDiscards);
2913 			} else {
2914 				BUMP_MIB(&ipst->ips_ip_mib,
2915 				    ipIfStatsOutDiscards);
2916 			}
2917 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2918 			    mp, NULL);
2919 			freemsg(mp);
2920 		} else {
2921 			/*
2922 			 * We don't know the zoneid, but
2923 			 * ip_xmit does not care since IXAF_NO_TRACE
2924 			 * is set. (We traced the packet the first
2925 			 * time through ip_xmit.)
2926 			 */
2927 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2928 			    ALL_ZONES, 0, NULL);
2929 			nce_refrele(nce);
2930 		}
2931 		mp = nxt_mp;
2932 	}
2933 
2934 	ncec_cb_dispatch(ncec); /* complete callbacks */
2935 }
2936 
2937 /*
2938  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2939  * and the corresponding attributes.
2940  * Disallow states other than ND_REACHABLE or ND_STALE.
2941  */
2942 int
2943 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2944 {
2945 	sin6_t		*sin6;
2946 	in6_addr_t	*addr;
2947 	ncec_t		*ncec;
2948 	nce_t		*nce;
2949 	int		err = 0;
2950 	uint16_t	new_flags = 0;
2951 	uint16_t	old_flags = 0;
2952 	int		inflags = lnr->lnr_flags;
2953 	ip_stack_t	*ipst = ill->ill_ipst;
2954 	boolean_t	do_postprocess = B_FALSE;
2955 
2956 	ASSERT(ill->ill_isv6);
2957 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2958 	    (lnr->lnr_state_create != ND_STALE))
2959 		return (EINVAL);
2960 
2961 	sin6 = (sin6_t *)&lnr->lnr_addr;
2962 	addr = &sin6->sin6_addr;
2963 
2964 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2965 	ASSERT(!IS_UNDER_IPMP(ill));
2966 	nce = nce_lookup_addr(ill, addr);
2967 	if (nce != NULL)
2968 		new_flags = nce->nce_common->ncec_flags;
2969 
2970 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2971 	case NDF_ISROUTER_ON:
2972 		new_flags |= NCE_F_ISROUTER;
2973 		break;
2974 	case NDF_ISROUTER_OFF:
2975 		new_flags &= ~NCE_F_ISROUTER;
2976 		break;
2977 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2978 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2979 		if (nce != NULL)
2980 			nce_refrele(nce);
2981 		return (EINVAL);
2982 	}
2983 	if (inflags & NDF_STATIC)
2984 		new_flags |= NCE_F_STATIC;
2985 
2986 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2987 	case NDF_ANYCAST_ON:
2988 		new_flags |= NCE_F_ANYCAST;
2989 		break;
2990 	case NDF_ANYCAST_OFF:
2991 		new_flags &= ~NCE_F_ANYCAST;
2992 		break;
2993 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2994 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2995 		if (nce != NULL)
2996 			nce_refrele(nce);
2997 		return (EINVAL);
2998 	}
2999 
3000 	if (nce == NULL) {
3001 		err = nce_add_v6(ill,
3002 		    (uchar_t *)lnr->lnr_hdw_addr,
3003 		    ill->ill_phys_addr_length,
3004 		    addr,
3005 		    new_flags,
3006 		    lnr->lnr_state_create,
3007 		    &nce);
3008 		if (err != 0) {
3009 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3010 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3011 			return (err);
3012 		} else {
3013 			do_postprocess = B_TRUE;
3014 		}
3015 	}
3016 	ncec = nce->nce_common;
3017 	old_flags = ncec->ncec_flags;
3018 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3019 		ncec_router_to_host(ncec);
3020 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3021 		if (do_postprocess)
3022 			err = nce_add_v6_postprocess(nce);
3023 		nce_refrele(nce);
3024 		return (0);
3025 	}
3026 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3027 
3028 	if (do_postprocess)
3029 		err = nce_add_v6_postprocess(nce);
3030 	/*
3031 	 * err cannot be anything other than 0 because we don't support
3032 	 * proxy arp of static addresses.
3033 	 */
3034 	ASSERT(err == 0);
3035 
3036 	mutex_enter(&ncec->ncec_lock);
3037 	ncec->ncec_flags = new_flags;
3038 	mutex_exit(&ncec->ncec_lock);
3039 	/*
3040 	 * Note that we ignore the state at this point, which
3041 	 * should be either STALE or REACHABLE.  Instead we let
3042 	 * the link layer address passed in determine the state,
3043 	 * much as we do for incoming packets.
3044 	 */
3045 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3046 	nce_refrele(nce);
3047 	return (0);
3048 }
3049 
3050 /*
3051  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3052  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3053  * be held to ensure that they are in the same group.
3054  */
3055 static nce_t *
3056 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3057 {
3058 
3059 	nce_t *nce;
3060 
3061 	nce = nce_ill_lookup_then_add(ill, ncec);
3062 
3063 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3064 		return (nce);
3065 
3066 	/*
3067 	 * hold the ncec_lock to synchronize with nce_update() so that,
3068 	 * at the end of this function, the contents of nce_dlur_mp are
3069 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3070 	 * packet may have been sent out with a mangled address, which would
3071 	 * only be a transient condition.
3072 	 */
3073 	mutex_enter(&ncec->ncec_lock);
3074 	if (ncec->ncec_lladdr != NULL) {
3075 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3076 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3077 	} else {
3078 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3079 		    ill->ill_sap_length);
3080 	}
3081 	mutex_exit(&ncec->ncec_lock);
3082 	return (nce);
3083 }
3084 
3085 /*
3086  * We make nce_fp_mp have an M_DATA prepend.
3087  * The caller ensures there is hold on ncec for this function.
3088  * Note that since ill_fastpath_probe() copies the mblk there is
3089  * no need to hold the nce or ncec beyond this function.
3090  *
3091  * If the caller has passed in a non-null ncec_nce to nce_fastpath(), that
3092  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3093  * and will be returned back by this function, so that no extra nce_refrele
3094  * is required for the caller. The calls from nce_add_common() use this
3095  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3096  * nce_refrele of the returned nce (when it is non-null).
3097  */
3098 nce_t *
3099 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3100 {
3101 	nce_t *nce;
3102 	ill_t *ill = ncec->ncec_ill;
3103 
3104 	ASSERT(ill != NULL);
3105 
3106 	if (IS_IPMP(ill) && trigger_fp_req) {
3107 		trigger_fp_req = B_FALSE;
3108 		ipmp_ncec_fastpath(ncec, ill);
3109 
3110 	}
3111 	/*
3112 	 * If the caller already has the nce corresponding to the ill, use
3113 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3114 	 * nce_add_common() fall in the former category, and have just done
3115 	 * the nce lookup/add that can be reused.
3116 	 */
3117 	if (ncec_nce == NULL)
3118 		nce = nce_fastpath_create(ill, ncec);
3119 	else
3120 		nce = ncec_nce;
3121 
3122 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3123 		return (nce);
3124 
3125 	if (trigger_fp_req)
3126 		nce_fastpath_trigger(nce);
3127 	return (nce);
3128 }
3129 
3130 /*
3131  * Trigger fastpath on nce. No locks may be held.
3132  */
3133 static void
3134 nce_fastpath_trigger(nce_t *nce)
3135 {
3136 	int res;
3137 	ill_t *ill = nce->nce_ill;
3138 	ncec_t *ncec = nce->nce_common;
3139 
3140 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3141 	/*
3142 	 * EAGAIN is an indication of a transient error, e.g. an
3143 	 * allocation failure.  Leave the ncec in the list; it will be
3144 	 * updated when another probe happens for another ire, and if
3145 	 * not, it will be taken out of the list when the ire is
3146 	 * deleted.
3147 	 */
3148 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3149 		nce_fastpath_list_delete(ill, ncec, NULL);
3150 }
3151 
3152 /*
3153  * Add ncec to the nce fastpath list on ill.
3154  */
3155 static nce_t *
3156 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3157 {
3158 	nce_t *nce = NULL;
3159 
3160 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3161 	/*
3162 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3163 	 * down, before adding the NCE.
3164 	 */
3165 	if (ill->ill_state_flags & ILL_CONDEMNED)
3166 		return (NULL);
3167 	mutex_enter(&ncec->ncec_lock);
3168 	/*
3169 	 * If the ncec has not been deleted and is not already in the
3170 	 * list, add it.
3171 	 */
3172 	if (!NCE_ISCONDEMNED(ncec)) {
3173 		nce = nce_lookup(ill, &ncec->ncec_addr);
3174 		if (nce != NULL)
3175 			goto done;
3176 		nce = nce_add(ill, ncec);
3177 	}
3178 done:
3179 	mutex_exit(&ncec->ncec_lock);
3180 	return (nce);
3181 }
3182 
3183 nce_t *
3184 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3185 {
3186 	nce_t *nce;
3187 
3188 	mutex_enter(&ill->ill_lock);
3189 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
3190 	mutex_exit(&ill->ill_lock);
3191 	return (nce);
3192 }
3193 
3194 
3195 /*
3196  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3197  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3198  * entry after all locks have been dropped.
3199  */
3200 void
3201 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3202 {
3203 	nce_t *nce;
3204 
3205 	ASSERT(ill != NULL);
3206 
3207 	/* first clean out any nce pointers in the under_ills */
3208 	if (IS_IPMP(ill))
3209 		ipmp_ncec_flush_nce(ncec);
3210 
3211 	/* now the ill itself */
3212 	mutex_enter(&ill->ill_lock);
3213 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3214 	    nce = list_next(&ill->ill_nce, nce)) {
3215 		if (nce->nce_common == ncec) {
3216 			nce_refhold(nce);
3217 			nce_delete(nce);
3218 			break;
3219 		}
3220 	}
3221 	mutex_exit(&ill->ill_lock);
3222 	if (nce != NULL) {
3223 		if (dead == NULL)
3224 			nce_refrele(nce);
3225 		else
3226 			list_insert_tail(dead, nce);
3227 	}
3228 }
3229 
3230 /*
3231  * When the fastpath response does not fit in the data block
3232  * associated with the existing nce_fp_mp, we delete and re-add
3233  * the nce to retrigger fastpath based on the information
3234  * in the ncec_t.
3235  */
3236 static nce_t *
3237 nce_delete_then_add(nce_t *nce)
3238 {
3239 	ill_t		*ill = nce->nce_ill;
3240 	nce_t		*newnce = NULL;
3241 
3242 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3243 	    (void *)nce, ill->ill_name));
3244 	mutex_enter(&ill->ill_lock);
3245 	mutex_enter(&nce->nce_common->ncec_lock);
3246 	nce_delete(nce);
3247 	/*
3248 	 * Make sure that ncec is not condemned before adding. We hold the
3249 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3250 	 * ipmp_ncec_flush_nce()
3251 	 */
3252 	if (!NCE_ISCONDEMNED(nce->nce_common))
3253 		newnce = nce_add(ill, nce->nce_common);
3254 	mutex_exit(&nce->nce_common->ncec_lock);
3255 	mutex_exit(&ill->ill_lock);
3256 	nce_refrele(nce);
3257 	return (newnce); /* could be null if nomem */
3258 }
3259 
3260 typedef struct nce_fp_match_s {
3261 	nce_t	*nce_fp_match_res;
3262 	mblk_t	*nce_fp_match_ack_mp;
3263 } nce_fp_match_t;
3264 
3265 /* ARGSUSED */
3266 static int
3267 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3268 {
3269 	nce_fp_match_t	*nce_fp_marg = arg;
3270 	ncec_t		*ncec = nce->nce_common;
3271 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3272 	uchar_t	*mp_rptr, *ud_mp_rptr;
3273 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3274 	ptrdiff_t	cmplen;
3275 
3276 	/*
3277 	 * mp is the mp associated with the fastpath ack.
3278 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3279 	 * under consideration. If the contents match, then the
3280 	 * fastpath ack is used to update the nce.
3281 	 */
3282 	if (ud_mp == NULL)
3283 		return (0);
3284 	mp_rptr = mp->b_rptr;
3285 	cmplen = mp->b_wptr - mp_rptr;
3286 	ASSERT(cmplen >= 0);
3287 
3288 	ud_mp_rptr = ud_mp->b_rptr;
3289 	/*
3290 	 * The ncec is locked here to prevent any other threads from accessing
3291 	 * and changing nce_dlur_mp when the address becomes resolved to an
3292 	 * lla while we're in the middle of looking at and comparing the
3293 	 * hardware address (lla). It is also locked to prevent multiple
3294 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3295 	 * time.
3296 	 */
3297 	mutex_enter(&ncec->ncec_lock);
3298 	if (ud_mp->b_wptr - ud_mp_rptr == cmplen &&
3299 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3300 		nce_fp_marg->nce_fp_match_res = nce;
3301 		mutex_exit(&ncec->ncec_lock);
3302 		nce_refhold(nce);
3303 		return (1);
3304 	}
3305 	mutex_exit(&ncec->ncec_lock);
3306 	return (0);
3307 }
3308 
3309 /*
3310  * Update the NCE whose outstanding DL_UNITDATA_REQ (nce_dlur_mp)
3311  * matches the fastpath ack in mp.  mp->b_cont contains
3312  * the fastpath header.
3313  *
3314  * The matching nce, if any, has its nce_fp_mp installed or refreshed.
3315  */
3316 void
3317 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3318 {
3319 	nce_fp_match_t nce_fp_marg;
3320 	nce_t *nce;
3321 	mblk_t *nce_fp_mp, *fp_mp;
3322 
3323 	nce_fp_marg.nce_fp_match_res = NULL;
3324 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3325 
3326 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3327 
3328 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3329 		return;
3330 
3331 	mutex_enter(&nce->nce_lock);
3332 	nce_fp_mp = nce->nce_fp_mp;
3333 
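	/*
	 * If an existing fastpath header is present but its data block is
	 * too small to hold the new one, recreate the nce so that a fresh
	 * nce_fp_mp can be installed below.
	 */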
3334 	if (nce_fp_mp != NULL) {
3335 		fp_mp = mp->b_cont;
3336 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3337 		    nce_fp_mp->b_datap->db_lim) {
3338 			mutex_exit(&nce->nce_lock);
3339 			nce = nce_delete_then_add(nce);
3340 			if (nce == NULL) {
3341 				return;
3342 			}
3343 			mutex_enter(&nce->nce_lock);
3344 			nce_fp_mp = nce->nce_fp_mp;
3345 		}
3346 	}
3347 
3348 	/* Matched - install mp as the fastpath mp */
3349 	if (nce_fp_mp == NULL) {
3350 		fp_mp = dupb(mp->b_cont);
3351 		nce->nce_fp_mp = fp_mp;
3352 	} else {
3353 		fp_mp = mp->b_cont;
3354 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3355 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3356 		    + MBLKL(fp_mp);
3357 	}
3358 	mutex_exit(&nce->nce_lock);
3359 	nce_refrele(nce);
3360 }
3361 
3362 /*
3363  * Return a pointer to a given option in the packet.
3364  * Assumes that the option part of the packet has already been validated.
3365  */
3366 nd_opt_hdr_t *
3367 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3368 {
3369 	while (optlen > 0) {
3370 		if (opt->nd_opt_type == opt_type)
3371 			return (opt);
3372 		optlen -= 8 * opt->nd_opt_len;
3373 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3374 	}
3375 	return (NULL);
3376 }
3377 
3378 /*
3379  * Verify all option lengths present are > 0, also check to see
3380  * if the option lengths and packet length are consistent.
3381  */
3382 boolean_t
3383 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3384 {
3385 	ASSERT(opt != NULL);
3386 	while (optlen > 0) {
3387 		if (opt->nd_opt_len == 0)
3388 			return (B_FALSE);
3389 		optlen -= 8 * opt->nd_opt_len;
3390 		if (optlen < 0)
3391 			return (B_FALSE);
3392 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3393 	}
3394 	return (B_TRUE);
3395 }
3396 
3397 /*
3398  * ncec_walk function.
3399  * Free a fraction of the NCE cache entries.
3400  *
3401  * A possible optimization here would be to use ncec_last where possible, and
3402  * delete the least-frequently used entry, which would require more complex
3403  * computation as we walk through the ncec's (e.g., track ncec entries by
3404  * order of ncec_last and/or maintain state)
3405  */
3406 static void
3407 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3408 {
3409 	ip_stack_t	*ipst = ncec->ncec_ipst;
3410 	uint_t		fraction = *(uint_t *)arg;
3411 	uint_t		rand;
3412 
3413 	if ((ncec->ncec_flags &
3414 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3415 		return;
3416 	}
3417 
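	/*
	 * Pseudo-randomly select roughly one out of every 'fraction'
	 * entries for deletion, using the current lbolt plus the address
	 * hash as the randomizer.
	 */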
3418 	rand = (uint_t)ddi_get_lbolt() +
3419 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3420 	if ((rand/fraction)*fraction == rand) {
3421 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3422 		ncec_delete(ncec);
3423 	}
3424 }
3425 
3426 /*
3427  * kmem_cache callback to free up memory.
3428  *
3429  * For now we just delete a fixed fraction.
3430  */
3431 static void
3432 ip_nce_reclaim_stack(ip_stack_t *ipst)
3433 {
3434 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3435 
3436 	IP_STAT(ipst, ip_nce_reclaim_calls);
3437 
3438 	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3439 
3440 	/*
3441 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3442 	 * Get them to update any stale references to drop any refholds they
3443 	 * have.
3444 	 */
3445 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3446 }
3447 
3448 /*
3449  * Called by the memory allocator subsystem directly, when the system
3450  * is running low on memory.
3451  */
3452 /* ARGSUSED */
3453 void
3454 ip_nce_reclaim(void *args)
3455 {
3456 	netstack_handle_t nh;
3457 	netstack_t *ns;
3458 	ip_stack_t *ipst;
3459 
3460 	netstack_next_init(&nh);
3461 	while ((ns = netstack_next(&nh)) != NULL) {
3462 		/*
3463 		 * netstack_next() can return a netstack_t with a NULL
3464 		 * netstack_ip at boot time.
3465 		 */
3466 		if ((ipst = ns->netstack_ip) == NULL) {
3467 			netstack_rele(ns);
3468 			continue;
3469 		}
3470 		ip_nce_reclaim_stack(ipst);
3471 		netstack_rele(ns);
3472 	}
3473 	netstack_next_fini(&nh);
3474 }
3475 
3476 #ifdef DEBUG
3477 void
3478 ncec_trace_ref(ncec_t *ncec)
3479 {
3480 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3481 
3482 	if (ncec->ncec_trace_disable)
3483 		return;
3484 
3485 	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3486 		ncec->ncec_trace_disable = B_TRUE;
3487 		ncec_trace_cleanup(ncec);
3488 	}
3489 }
3490 
3491 void
3492 ncec_untrace_ref(ncec_t *ncec)
3493 {
3494 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3495 
3496 	if (!ncec->ncec_trace_disable)
3497 		th_trace_unref(ncec);
3498 }
3499 
3500 static void
3501 ncec_trace_cleanup(const ncec_t *ncec)
3502 {
3503 	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3504 }
3505 #endif
3506 
3507 /*
3508  * Called when address resolution fails due to a timeout.
3509  * Send an ICMP unreachable in response to all queued packets.
3510  */
3511 void
3512 arp_resolv_failed(ncec_t *ncec)
3513 {
3514 	mblk_t	*mp, *nxt_mp;
3515 	char	buf[INET6_ADDRSTRLEN];
3516 	struct in_addr ipv4addr;
3517 	ill_t *ill = ncec->ncec_ill;
3518 	ip_stack_t *ipst = ncec->ncec_ipst;
3519 	ip_recv_attr_t	iras;
3520 
3521 	bzero(&iras, sizeof (iras));
3522 	iras.ira_flags = IRAF_IS_IPV4;
3523 	/*
3524 	 * we are setting the ira_rill to the ipmp_ill (instead of
3525 	 * the actual ill on which the packet was received), but this
3526 	 * is ok because we don't actually need the real ira_rill
3527 	 * to send the icmp unreachable to the sender.
3528 	 */
3529 	iras.ira_ill = iras.ira_rill = ill;
3530 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3531 	iras.ira_rifindex = iras.ira_ruifindex;
3532 
3533 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3534 	ip3dbg(("arp_resolv_failed: dst %s\n",
3535 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3536 	mutex_enter(&ncec->ncec_lock);
3537 	mp = ncec->ncec_qd_mp;
3538 	ncec->ncec_qd_mp = NULL;
3539 	ncec->ncec_nprobes = 0;
3540 	mutex_exit(&ncec->ncec_lock);
3541 	while (mp != NULL) {
3542 		nxt_mp = mp->b_next;
3543 		mp->b_next = NULL;
3544 
3545 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3546 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3547 		    mp, ill);
3548 		if (ipst->ips_ip_arp_icmp_error) {
3549 			ip3dbg(("arp_resolv_failed: "
3550 			    "Calling icmp_unreachable\n"));
3551 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3552 		} else {
3553 			freemsg(mp);
3554 		}
3555 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3556 		mp = nxt_mp;
3557 	}
3558 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3559 }
3560 
3561 /*
3562  * if ill is an under_ill, translate it to the ipmp_ill and add the
3563  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3564  * one on the underlying in_ill) will be created for the
3565  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3566  */
3567 int
3568 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3569     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3570 {
3571 	int	err;
3572 	in6_addr_t addr6;
3573 	ip_stack_t *ipst = ill->ill_ipst;
3574 	nce_t	*nce, *upper_nce = NULL;
3575 	ill_t	*in_ill = ill, *under = NULL;
3576 	boolean_t need_ill_refrele = B_FALSE;
3577 
3578 	if (flags & NCE_F_MCAST) {
3579 		/*
3580 		 * hw_addr will be figured out in nce_set_multicast_v4;
3581 		 * caller needs to pass in the cast_ill for ipmp
3582 		 */
3583 		ASSERT(hw_addr == NULL);
3584 		ASSERT(!IS_IPMP(ill));
3585 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3586 		return (err);
3587 	}
3588 
3589 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3590 		ill = ipmp_ill_hold_ipmp_ill(ill);
3591 		if (ill == NULL)
3592 			return (ENXIO);
3593 		need_ill_refrele = B_TRUE;
3594 	}
3595 	if ((flags & NCE_F_BCAST) != 0) {
3596 		/*
3597 		 * IPv4 broadcast ncec: compute the hwaddr.
3598 		 */
3599 		if (IS_IPMP(ill)) {
3600 			under = ipmp_ill_get_xmit_ill(ill, B_FALSE);
3601 			if (under == NULL)  {
3602 				if (need_ill_refrele)
3603 					ill_refrele(ill);
3604 				return (ENETDOWN);
3605 			}
3606 			hw_addr = under->ill_bcast_mp->b_rptr +
3607 			    NCE_LL_ADDR_OFFSET(under);
3608 			hw_addr_len = under->ill_phys_addr_length;
3609 		} else {
3610 			hw_addr = ill->ill_bcast_mp->b_rptr +
3611 			    NCE_LL_ADDR_OFFSET(ill);
3612 			hw_addr_len = ill->ill_phys_addr_length;
3613 		}
3614 	}
3615 
3616 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3617 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3618 	nce = nce_lookup_addr(ill, &addr6);
3619 	if (nce == NULL) {
3620 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3621 		    state, &nce);
3622 	} else {
3623 		err = EEXIST;
3624 	}
3625 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3626 	if (err == 0)
3627 		err = nce_add_v4_postprocess(nce);
3628 
3629 	if (in_ill != ill && nce != NULL) {
3630 		nce_t *under_nce = NULL;
3631 
3632 		/*
3633 		 * in_ill was the under_ill. Try to create the under_nce.
3634 		 * Hold the ill_g_lock to prevent changes to group membership
3635 		 * until we are done.
3636 		 */
3637 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3638 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3639 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3640 			    ill_t *, ill);
3641 			rw_exit(&ipst->ips_ill_g_lock);
3642 			err = ENXIO;
3643 			nce_refrele(nce);
3644 			nce = NULL;
3645 			goto bail;
3646 		}
3647 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3648 		if (under_nce == NULL) {
3649 			rw_exit(&ipst->ips_ill_g_lock);
3650 			err = EINVAL;
3651 			nce_refrele(nce);
3652 			nce = NULL;
3653 			goto bail;
3654 		}
3655 		rw_exit(&ipst->ips_ill_g_lock);
3656 		upper_nce = nce;
3657 		nce = under_nce; /* will be returned to caller */
3658 		if (NCE_ISREACHABLE(nce->nce_common))
3659 			nce_fastpath_trigger(under_nce);
3660 	}
3661 	if (nce != NULL) {
3662 		if (newnce != NULL)
3663 			*newnce = nce;
3664 		else
3665 			nce_refrele(nce);
3666 	}
3667 bail:
3668 	if (under != NULL)
3669 		ill_refrele(under);
3670 	if (upper_nce != NULL)
3671 		nce_refrele(upper_nce);
3672 	if (need_ill_refrele)
3673 		ill_refrele(ill);
3674 
3675 	return (err);
3676 }
3677 
3678 /*
3679  * NDP Cache Entry creation routine for IPv4.
3680  * This routine must always be called with ndp4->ndp_g_lock held.
3681  * Prior to return, ncec_refcnt is incremented.
3682  *
3683  * IPMP notes: the ncecs for non-local (i.e., !NCE_MYADDR(ncec)) addresses
3684  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3685  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3686  * entries will be created, both pointing at the same ncec_t. The nce_t
3687  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3688  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3689  * Local addresses are always created on the ill passed to nce_add_v4.
3690  */
3691 int
3692 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3693     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3694 {
3695 	int		err;
3696 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3697 	struct in6_addr	addr6;
3698 	nce_t		*nce;
3699 
3700 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3701 	ASSERT(!ill->ill_isv6);
3702 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3703 
3704 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3705 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3706 	    &nce);
3707 	ASSERT(newnce != NULL);
3708 	*newnce = nce;
3709 	return (err);
3710 }
3711 
3712 /*
3713  * Post-processing routine to be executed after nce_add_v4(). This function
3714  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3715  * and must be called without any locks held.
3716  *
3717  * Always returns 0, but we return an int to keep this symmetric with the
3718  * IPv6 counter-part.
3719  * IPv6 counterpart.
3720 int
3721 nce_add_v4_postprocess(nce_t *nce)
3722 {
3723 	ncec_t		*ncec = nce->nce_common;
3724 	uint16_t	flags = ncec->ncec_flags;
3725 	boolean_t	ndp_need_dad = B_FALSE;
3726 	boolean_t	dropped;
3727 	clock_t		delay;
3728 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3729 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3730 	boolean_t	trigger_fastpath = B_TRUE;
3731 
3732 	/*
3733 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3734 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3735 	 * We also call nce_fastpath from nce_update if the link-layer
3736 	 * address of the peer changes.
3737 	 */
3738 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3739 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3740 		trigger_fastpath = B_FALSE;
3741 
3742 	if (trigger_fastpath)
3743 		nce_fastpath_trigger(nce);
3744 
3745 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3746 		/*
3747 		 * Either the caller (by passing in ND_PROBE)
3748 		 * or nce_add_common() (by the internally computed state
3749 		 * based on ncec_addr and ill_net_type) has determined
3750 		 * that this unicast entry needs DAD. Trigger DAD.
3751 		 */
3752 		ndp_need_dad = B_TRUE;
3753 	} else if (flags & NCE_F_UNSOL_ADV) {
3754 		/*
3755 		 * We account for the transmit below by assigning one
3756 		 * less than the ndd variable. Subsequent decrements
3757 		 * are done in nce_timer.
3758 		 */
3759 		mutex_enter(&ncec->ncec_lock);
3760 		ncec->ncec_unsolicit_count =
3761 		    ipst->ips_ip_arp_publish_count - 1;
3762 		mutex_exit(&ncec->ncec_lock);
3763 		dropped = arp_announce(ncec);
3764 		mutex_enter(&ncec->ncec_lock);
3765 		if (dropped)
3766 			ncec->ncec_unsolicit_count++;
3767 		else
3768 			ncec->ncec_last_time_defended = ddi_get_lbolt();
3769 		if (ncec->ncec_unsolicit_count != 0) {
3770 			nce_start_timer(ncec,
3771 			    ipst->ips_ip_arp_publish_interval);
3772 		}
3773 		mutex_exit(&ncec->ncec_lock);
3774 	}
3775 
3776 	/*
3777 	 * If ncec_xmit_interval is 0, user has configured us to send the first
3778 	 * probe right away.  Do so, and set up for the subsequent probes.
3779 	 */
3780 	if (ndp_need_dad) {
3781 		mutex_enter(&ncec->ncec_lock);
3782 		if (ncec->ncec_pcnt == 0) {
3783 			/*
3784 			 * DAD probes and announce can be
3785 			 * administratively disabled by setting the
3786 			 * probe_count to zero. Restart the timer in
3787 			 * this case to mark the ipif as ready.
3788 			 */
3789 			ncec->ncec_unsolicit_count = 0;
3790 			mutex_exit(&ncec->ncec_lock);
3791 			nce_restart_timer(ncec, 0);
3792 		} else {
3793 			mutex_exit(&ncec->ncec_lock);
3794 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3795 			    ipst->ips_arp_probe_delay :
3796 			    ipst->ips_arp_fastprobe_delay);
3797 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3798 		}
3799 	}
3800 	return (0);
3801 }
3802 
3803 /*
3804  * ncec_walk routine to update all entries that have a given destination or
3805  * gateway address and cached link layer (MAC) address.  This is used when ARP
3806  * informs us that a network-to-link-layer mapping may have changed.
3807  */
3808 void
3809 nce_update_hw_changed(ncec_t *ncec, void *arg)
3810 {
3811 	nce_hw_map_t *hwm = arg;
3812 	ipaddr_t ncec_addr;
3813 
3814 	if (ncec->ncec_state != ND_REACHABLE)
3815 		return;
3816 
3817 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3818 	if (ncec_addr != hwm->hwm_addr)
3819 		return;
3820 
3821 	mutex_enter(&ncec->ncec_lock);
3822 	if (hwm->hwm_flags != 0)
3823 		ncec->ncec_flags = hwm->hwm_flags;
3824 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3825 	mutex_exit(&ncec->ncec_lock);
3826 }
3827 
3828 void
3829 ncec_refhold(ncec_t *ncec)
3830 {
3831 	mutex_enter(&(ncec)->ncec_lock);
3832 	(ncec)->ncec_refcnt++;
3833 	ASSERT((ncec)->ncec_refcnt != 0);
3834 #ifdef DEBUG
3835 	ncec_trace_ref(ncec);
3836 #endif
3837 	mutex_exit(&(ncec)->ncec_lock);
3838 }
3839 
3840 void
3841 ncec_refhold_notr(ncec_t *ncec)
3842 {
3843 	mutex_enter(&(ncec)->ncec_lock);
3844 	(ncec)->ncec_refcnt++;
3845 	ASSERT((ncec)->ncec_refcnt != 0);
3846 	mutex_exit(&(ncec)->ncec_lock);
3847 }
3848 
3849 static void
3850 ncec_refhold_locked(ncec_t *ncec)
3851 {
3852 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3853 	(ncec)->ncec_refcnt++;
3854 #ifdef DEBUG
3855 	ncec_trace_ref(ncec);
3856 #endif
3857 }
3858 
3859 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3860 void
3861 ncec_refrele(ncec_t *ncec)
3862 {
3863 	mutex_enter(&(ncec)->ncec_lock);
3864 #ifdef DEBUG
3865 	ncec_untrace_ref(ncec);
3866 #endif
3867 	ASSERT((ncec)->ncec_refcnt != 0);
3868 	if (--(ncec)->ncec_refcnt == 0) {
3869 		ncec_inactive(ncec);
3870 	} else {
3871 		mutex_exit(&(ncec)->ncec_lock);
3872 	}
3873 }
3874 
3875 void
3876 ncec_refrele_notr(ncec_t *ncec)
3877 {
3878 	mutex_enter(&(ncec)->ncec_lock);
3879 	ASSERT((ncec)->ncec_refcnt != 0);
3880 	if (--(ncec)->ncec_refcnt == 0) {
3881 		ncec_inactive(ncec);
3882 	} else {
3883 		mutex_exit(&(ncec)->ncec_lock);
3884 	}
3885 }
3886 
3887 /*
3888  * Common to IPv4 and IPv6.
3889  */
3890 void
3891 nce_restart_timer(ncec_t *ncec, uint_t ms)
3892 {
3893 	timeout_id_t tid;
3894 
3895 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3896 
3897 	/* First cancel any running timer */
3898 	mutex_enter(&ncec->ncec_lock);
3899 	tid = ncec->ncec_timeout_id;
3900 	ncec->ncec_timeout_id = 0;
3901 	if (tid != 0) {
3902 		mutex_exit(&ncec->ncec_lock);
3903 		(void) untimeout(tid);
3904 		mutex_enter(&ncec->ncec_lock);
3905 	}
3906 
3907 	/* Restart timer */
3908 	nce_start_timer(ncec, ms);
3909 	mutex_exit(&ncec->ncec_lock);
3910 }
3911 
3912 static void
3913 nce_start_timer(ncec_t *ncec, uint_t ms)
3914 {
3915 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3916 	/*
3917 	 * Don't start the timer if the ncec has been deleted, or if the timer
3918 	 * is already running
3919 	 */
3920 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
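		/*
		 * timeout() takes a tick count; round a zero-tick delay up
		 * to a single tick.
		 */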
3921 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3922 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3923 	}
3924 }
3925 
3926 int
3927 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3928     uint16_t flags, nce_t **newnce)
3929 {
3930 	uchar_t		*hw_addr;
3931 	int		err = 0;
3932 	ip_stack_t	*ipst = ill->ill_ipst;
3933 	in6_addr_t	dst6;
3934 	nce_t		*nce;
3935 
3936 	ASSERT(!ill->ill_isv6);
3937 
3938 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3939 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3940 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3941 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3942 		goto done;
3943 	}
3944 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
3945 		/*
3946 		 * For IRE_IF_RESOLVER a hardware mapping can be
3947 		 * generated; for IRE_IF_NORESOLVER, the resolution cookie
3948 		 * in the ill is copied in nce_add_v4().
3949 		 */
3950 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3951 		if (hw_addr == NULL) {
3952 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3953 			return (ENOMEM);
3954 		}
3955 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3956 	} else {
3957 		/*
3958 		 * IRE_IF_NORESOLVER type simply copies the resolution
3959 		 * cookie passed in.  So no hw_addr is needed.
3960 		 */
3961 		hw_addr = NULL;
3962 	}
3963 	ASSERT(flags & NCE_F_MCAST);
3964 	ASSERT(flags & NCE_F_NONUD);
3965 	/* nce_state will be computed by nce_add_common() */
3966 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3967 	    ND_UNCHANGED, &nce);
3968 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3969 	if (err == 0)
3970 		err = nce_add_v4_postprocess(nce);
3971 	if (hw_addr != NULL)
3972 		kmem_free(hw_addr, ill->ill_phys_addr_length);
3973 	if (err != 0) {
3974 		ip1dbg(("nce_set_multicast_v4: create failed %d\n", err));
3975 		return (err);
3976 	}
3977 done:
3978 	if (newnce != NULL)
3979 		*newnce = nce;
3980 	else
3981 		nce_refrele(nce);
3982 	return (0);
3983 }
3984 
3985 /*
3986  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3987  * don't want to have to walk the list for every single one, so we gather up
3988  * batches at a time.
3989  */
3990 #define	NCE_RESCHED_LIST_LEN	8
3991 
3992 typedef struct {
3993 	ill_t	*ncert_ill;
3994 	uint_t	ncert_num;
3995 	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
3996 } nce_resched_t;
3997 
3998 /*
3999  * Pick the longest waiting NCEs for defense.
4000  */
4001 /* ARGSUSED */
4002 static int
4003 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4004 {
4005 	nce_resched_t *ncert = arg;
4006 	ncec_t **ncecs;
4007 	ncec_t **ncec_max;
4008 	ncec_t *ncec_temp;
4009 	ncec_t *ncec = nce->nce_common;
4010 
4011 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4012 	/*
4013 	 * Only reachable entries that are ready for announcement are eligible.
4014 	 */
4015 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4016 		return (0);
4017 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4018 		ncec_refhold(ncec);
4019 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
4020 	} else {
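		/*
		 * The candidate list is full: keep the entries with the
		 * oldest ncec_last_time_defended by swapping this ncec into
		 * any slot holding a more recently defended entry, and
		 * release whichever ncec ends up displaced.
		 */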
4021 		ncecs = ncert->ncert_nces;
4022 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4023 		ncec_refhold(ncec);
4024 		for (; ncecs < ncec_max; ncecs++) {
4025 			ASSERT(ncec != NULL);
4026 			if ((*ncecs)->ncec_last_time_defended >
4027 			    ncec->ncec_last_time_defended) {
4028 				ncec_temp = *ncecs;
4029 				*ncecs = ncec;
4030 				ncec = ncec_temp;
4031 			}
4032 		}
4033 		ncec_refrele(ncec);
4034 	}
4035 	return (0);
4036 }
4037 
4038 /*
4039  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4040  * doesn't happen very often (if at all), and thus it needn't be highly
4041  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4042  * outer loop is bounded by a constant rather than by the length of the list.)
4043  */
4044 static void
4045 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4046 {
4047 	ncec_t		*ncec;
4048 	ip_stack_t	*ipst = ill->ill_ipst;
4049 	uint_t		i, defend_rate;
4050 
4051 	i = ill->ill_defend_count;
4052 	ill->ill_defend_count = 0;
4053 	if (ill->ill_isv6)
4054 		defend_rate = ipst->ips_ndp_defend_rate;
4055 	else
4056 		defend_rate = ipst->ips_arp_defend_rate;
4057 	/* If none could be sitting around, then don't reschedule */
4058 	if (i < defend_rate) {
4059 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4060 		return;
4061 	}
4062 	ncert->ncert_ill = ill;
4063 	while (ill->ill_defend_count < defend_rate) {
4064 		nce_walk_common(ill, ncec_reschedule, ncert);
4065 		for (i = 0; i < ncert->ncert_num; i++) {
4066 
4067 			ncec = ncert->ncert_nces[i];
4068 			mutex_enter(&ncec->ncec_lock);
4069 			ncec->ncec_flags |= NCE_F_DELAYED;
4070 			mutex_exit(&ncec->ncec_lock);
4071 			/*
4072 			 * We plan to schedule this ncec, so increment the
4073 			 * defend_count in anticipation.
4074 			 */
4075 			if (++ill->ill_defend_count >= defend_rate)
4076 				break;
4077 		}
4078 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4079 			break;
4080 	}
4081 }
4082 
4083 /*
4084  * Check if the current rate-limiting parameters permit the sending
4085  * of another address defense announcement for both IPv4 and IPv6.
4086  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4087  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4088  * determines how many address defense announcements are permitted
4089  * in any `defend_period' interval.
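 *
 * Illustrative example (values chosen for illustration, not defaults):
 * with defend_rate = 10 and defend_period = 300, at most 10 defensive
 * announcements are sent in any 300-second window; the oldest entries
 * still needing defense are marked NCE_F_DELAYED by nce_ill_reschedule()
 * so that they are serviced first in the following window.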
4090  */
4091 static boolean_t
4092 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4093 {
4094 	clock_t		now = ddi_get_lbolt();
4095 	ip_stack_t	*ipst = ill->ill_ipst;
4096 	clock_t		start = ill->ill_defend_start;
4097 	uint32_t	elapsed, defend_period, defend_rate;
4098 	nce_resched_t	ncert;
4099 	boolean_t	ret;
4100 	int		i;
4101 
4102 	if (ill->ill_isv6) {
4103 		defend_period = ipst->ips_ndp_defend_period;
4104 		defend_rate = ipst->ips_ndp_defend_rate;
4105 	} else {
4106 		defend_period = ipst->ips_arp_defend_period;
4107 		defend_rate = ipst->ips_arp_defend_rate;
4108 	}
4109 	if (defend_rate == 0)
4110 		return (B_TRUE);
4111 	bzero(&ncert, sizeof (ncert));
4112 	mutex_enter(&ill->ill_lock);
4113 	if (start > 0) {
4114 		elapsed = now - start;
4115 		if (elapsed > SEC_TO_TICK(defend_period)) {
4116 			ill->ill_defend_start = now;
4117 			/*
4118 			 * nce_ill_reschedule will attempt to
4119 			 * prevent starvation by rescheduling the
4120 			 * oldest entries, which are marked with
4121 			 * the NCE_F_DELAYED flag.
4122 			 */
4123 			nce_ill_reschedule(ill, &ncert);
4124 		}
4125 	} else {
4126 		ill->ill_defend_start = now;
4127 	}
4128 	ASSERT(ill->ill_defend_count <= defend_rate);
4129 	mutex_enter(&ncec->ncec_lock);
4130 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4131 		/*
4132 		 * This ncec was rescheduled as one of the really old
4133 		 * entries needing ongoing defense. The
4134 		 * ill_defend_count was already incremented in
4135 		 * nce_ill_reschedule. Go ahead and send the announce.
4136 		 */
4137 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4138 		mutex_exit(&ncec->ncec_lock);
4139 		ret = B_FALSE;
4140 		goto done;
4141 	}
4142 	mutex_exit(&ncec->ncec_lock);
4143 	if (ill->ill_defend_count < defend_rate)
4144 		ill->ill_defend_count++;
4145 	if (ill->ill_defend_count == defend_rate) {
4146 		/*
4147 		 * we are no longer allowed to send unbidden defense
4148 		 * messages. Wait for rescheduling.
4149 		 */
4150 		ret = B_TRUE;
4151 	} else {
4152 		ret = B_FALSE;
4153 	}
4154 done:
4155 	mutex_exit(&ill->ill_lock);
4156 	/*
4157 	 * After all the locks have been dropped we can restart the nce
4158 	 * timers and refrele the delayed ncecs.
4159 	 */
4160 	for (i = 0; i < ncert.ncert_num; i++) {
4161 		clock_t	xmit_interval;
4162 		ncec_t	*tmp;
4163 
4164 		tmp = ncert.ncert_nces[i];
4165 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4166 		    B_FALSE);
4167 		nce_restart_timer(tmp, xmit_interval);
4168 		ncec_refrele(tmp);
4169 	}
4170 	return (ret);
4171 }
4172 
4173 boolean_t
4174 ndp_announce(ncec_t *ncec)
4175 {
4176 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4177 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4178 	    nce_advert_flags(ncec)));
4179 }
4180 
4181 ill_t *
4182 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4183 {
4184 	mblk_t		*mp;
4185 	in6_addr_t	src6;
4186 	ipaddr_t	src4;
4187 	ill_t		*ill = ncec->ncec_ill;
4188 	ill_t		*src_ill = NULL;
4189 	ipif_t		*ipif = NULL;
4190 	boolean_t	is_myaddr = NCE_MYADDR(ncec);
4191 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4192 
4193 	ASSERT(src != NULL);
4194 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4195 	src6 = *src;
4196 	if (is_myaddr) {
4197 		src6 = ncec->ncec_addr;
4198 		if (!isv6)
4199 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4200 	} else {
4201 		/*
4202 		 * Try to find a source address from the queued outgoing packet.
4203 		 */
4204 		mutex_enter(&ncec->ncec_lock);
4205 		mp = ncec->ncec_qd_mp;
4206 		if (mp != NULL) {
4207 			if (isv6) {
4208 				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
4209 
4210 				src6 = ip6h->ip6_src;
4211 			} else {
4212 				ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4213 
4214 				src4 = ipha->ipha_src;
4215 				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4216 			}
4217 		}
4218 		mutex_exit(&ncec->ncec_lock);
4219 	}
4220 
4221 	/*
4222 	 * For outgoing packets, if the src of the outgoing packet is one
4223 	 * of the assigned interface addresses, use it; otherwise we
4224 	 * will pick the source address below.
4225 	 * For local addresses (is_myaddr) doing DAD, NDP announce
4226 	 * messages are mcast. So we use the (IPMP) cast_ill or the
4227 	 * (non-IPMP) ncec_ill for these message types. The only
4228 	 * unicast DAD messages are IPv6 ND probes, for which
4229 	 * we find the ipif_bound_ill corresponding to the ncec_addr.
4230 	 */
4231 	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4232 		if (isv6) {
4233 			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4234 			    ill->ill_ipst);
4235 		} else {
4236 			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4237 			    ill->ill_ipst);
4238 		}
4239 
4240 		/*
4241 		 * If no relevant ipif can be found, then it's not one of our
4242 		 * addresses.  Reset to :: and try to find a src for the NS or
4243 		 * ARP request using ipif_select_source_v[4,6]  below.
4244 		 * If an ipif can be found, but it's not yet done with
4245 		 * DAD verification, and we are not being invoked for
4246 		 * DAD (i.e., !is_myaddr), then just postpone this
4247 		 * transmission until later.
4248 		 */
4249 		if (ipif == NULL) {
4250 			src6 = ipv6_all_zeros;
4251 			src4 = INADDR_ANY;
4252 		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
4253 			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4254 			    ncec_t *, ncec, ipif_t *, ipif);
4255 			ipif_refrele(ipif);
4256 			return (NULL);
4257 		}
4258 	}
4259 
4260 	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4261 		/*
4262 		 * Pick a source address for this solicitation, but
4263 		 * restrict the selection to addresses assigned to the
4264 		 * output interface.  We do this because the destination will
4265 		 * create a neighbor cache entry for the source address of
4266 		 * this packet, so the source address had better be a valid
4267 		 * neighbor.
4268 		 */
4269 		if (isv6) {
4270 			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4271 			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4272 			    B_FALSE, NULL);
4273 		} else {
4274 			ipaddr_t nce_addr;
4275 
4276 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4277 			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4278 			    B_FALSE, NULL);
4279 		}
4280 		if (ipif == NULL && IS_IPMP(ill)) {
4281 			ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);
4282 
4283 			if (send_ill != NULL) {
4284 				if (isv6) {
4285 					ipif = ipif_select_source_v6(send_ill,
4286 					    &ncec->ncec_addr, B_TRUE,
4287 					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4288 					    B_FALSE, NULL);
4289 				} else {
4290 					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4291 					    src4);
4292 					ipif = ipif_select_source_v4(send_ill,
4293 					    src4, ALL_ZONES, B_TRUE, NULL);
4294 				}
4295 				ill_refrele(send_ill);
4296 			}
4297 		}
4298 
4299 		if (ipif == NULL) {
4300 			char buf[INET6_ADDRSTRLEN];
4301 
4302 			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4303 			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
4304 			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4305 			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4306 			return (NULL);
4307 		}
4308 		src6 = ipif->ipif_v6lcl_addr;
4309 	}
4310 	*src = src6;
4311 	if (ipif != NULL) {
4312 		src_ill = ipif->ipif_ill;
4313 		if (IS_IPMP(src_ill))
4314 			src_ill = ipmp_ipif_hold_bound_ill(ipif);
4315 		else
4316 			ill_refhold(src_ill);
4317 		ipif_refrele(ipif);
4318 		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4319 		    ill_t *, src_ill);
4320 	}
4321 	return (src_ill);
4322 }
4323 
4324 void
4325 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4326     uchar_t *hwaddr, int hwaddr_len, int flags)
4327 {
4328 	ill_t	*ill;
4329 	ncec_t	*ncec;
4330 	nce_t	*nce;
4331 	uint16_t new_state;
4332 
4333 	ill = (ipif ? ipif->ipif_ill : NULL);
4334 	if (ill != NULL) {
4335 		/*
4336 		 * only one ncec is possible
4337 		 */
4338 		nce = nce_lookup_v4(ill, addr);
4339 		if (nce != NULL) {
4340 			ncec = nce->nce_common;
4341 			mutex_enter(&ncec->ncec_lock);
4342 			if (NCE_ISREACHABLE(ncec))
4343 				new_state = ND_UNCHANGED;
4344 			else
4345 				new_state = ND_STALE;
4346 			ncec->ncec_flags = flags;
4347 			nce_update(ncec, new_state, hwaddr);
4348 			mutex_exit(&ncec->ncec_lock);
4349 			nce_refrele(nce);
4350 			return;
4351 		}
4352 	} else {
4353 		/*
4354 		 * ill is wildcard; clean up all ncec's and ire's
4355 		 * that match on addr.
4356 		 */
4357 		nce_hw_map_t hwm;
4358 
4359 		hwm.hwm_addr = *addr;
4360 		hwm.hwm_hwlen = hwaddr_len;
4361 		hwm.hwm_hwaddr = hwaddr;
4362 		hwm.hwm_flags = flags;
4363 
4364 		ncec_walk_common(ipst->ips_ndp4, NULL,
4365 		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4366 	}
4367 }
4368 
4369 /*
4370  * Common function to add ncec entries.
4371  * We always add the ncec with ncec_ill == ill, and always create an
4372  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4373  * ncec is !reachable.
4374  *
4375  * When the caller passes in an nce_state of ND_UNCHANGED,
4376  * nce_add_common() will determine the state of the created nce based
4377  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4378  * be created with state set to the passed in nce_state.
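 *
 * For instance, nce_add_v4() above passes its caller's state straight
 * through, while nce_set_multicast_v4() passes ND_UNCHANGED so that the
 * state is derived here from ill_net_type and the NCE_F_* flags.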
4379  */
4380 static int
4381 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4382     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4383 {
4384 	static	ncec_t		nce_nil;
4385 	uchar_t			*template = NULL;
4386 	int			err;
4387 	ncec_t			*ncec;
4388 	ncec_t			**ncep;
4389 	ip_stack_t		*ipst = ill->ill_ipst;
4390 	uint16_t		state;
4391 	boolean_t		fastprobe = B_FALSE;
4392 	struct ndp_g_s		*ndp;
4393 	nce_t			*nce = NULL;
4394 	mblk_t			*dlur_mp = NULL;
4395 
4396 	if (ill->ill_isv6)
4397 		ndp = ill->ill_ipst->ips_ndp6;
4398 	else
4399 		ndp = ill->ill_ipst->ips_ndp4;
4400 
4401 	*retnce = NULL;
4402 
4403 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4404 
4405 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4406 		ip0dbg(("nce_add_common: no addr\n"));
4407 		return (EINVAL);
4408 	}
4409 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4410 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4411 		return (EINVAL);
4412 	}
4413 
4414 	if (ill->ill_isv6) {
4415 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4416 	} else {
4417 		ipaddr_t v4addr;
4418 
4419 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4420 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4421 	}
4422 
4423 	/*
4424 	 * The caller has ensured that there is no nce on ill, but there could
4425 	 * still be an nce_common_t for the address, so that we find existing
4426 	 * ncec_t structures first, and atomically add a new nce_t if
4427 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4428 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4429 	 * compare for matches across the illgrp because this function is
4430 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4431 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4432 	 * appropriate.
4433 	 */
4434 	ncec = *ncep;
4435 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4436 		if (ncec->ncec_ill == ill) {
4437 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4438 				/*
4439 				 * We should never find *retnce to be
4440 				 * MYADDR, since the caller may then
4441 				 * incorrectly restart a DAD timer that's
4442 				 * already running.  However, if we are in
4443 				 * forwarding mode, and the interface is
4444 				 * moving in/out of groups, the data
4445 				 * path ire lookup (e.g., ire_revalidate_nce)
4446 				 * may  have determined that some destination
4447 				 * is offlink while the control path is adding
4448 				 * that address as a local address.
4449 				 * Recover from this case by failing the
4450 				 * lookup.
4451 				 */
4452 				if (NCE_MYADDR(ncec))
4453 					return (ENXIO);
4454 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4455 				if (*retnce != NULL)
4456 					break;
4457 			}
4458 		}
4459 	}
4460 	if (*retnce != NULL) /* caller must trigger fastpath on nce */
4461 		return (0);
4462 
4463 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4464 	if (ncec == NULL)
4465 		return (ENOMEM);
4466 	*ncec = nce_nil;
4467 	ncec->ncec_ill = ill;
4468 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4469 	ncec->ncec_flags = flags;
4470 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4471 
4472 	if (!ill->ill_isv6) {
4473 		ipaddr_t addr4;
4474 
4475 		/*
4476 		 * DAD probe interval and probe count are set based on
4477 		 * fast/slow probe settings. If the underlying link doesn't
4478 		 * have reliably up/down notifications or if we're working
4479 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
4480 		 * don't use the fast timers.  Otherwise, use them.
4481 		 */
4482 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4483 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4484 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4485 			fastprobe = B_TRUE;
4486 		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4487 		    !IS_IPV4_LL_SPACE(&addr4)) {
4488 			ill_t *hwaddr_ill;
4489 
4490 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4491 			    hw_addr_len);
4492 			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4493 				fastprobe = B_TRUE;
4494 		}
4495 		if (fastprobe) {
4496 			ncec->ncec_xmit_interval =
4497 			    ipst->ips_arp_fastprobe_interval;
4498 			ncec->ncec_pcnt =
4499 			    ipst->ips_arp_fastprobe_count;
4500 			ncec->ncec_flags |= NCE_F_FAST;
4501 		} else {
4502 			ncec->ncec_xmit_interval =
4503 			    ipst->ips_arp_probe_interval;
4504 			ncec->ncec_pcnt =
4505 			    ipst->ips_arp_probe_count;
4506 		}
4507 		if (NCE_PUBLISH(ncec)) {
4508 			ncec->ncec_unsolicit_count =
4509 			    ipst->ips_ip_arp_publish_count;
4510 		}
4511 	} else {
4512 		/*
4513 		 * probe interval is constant: ILL_PROBE_INTERVAL
4514 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4515 		 */
4516 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4517 		if (NCE_PUBLISH(ncec)) {
4518 			ncec->ncec_unsolicit_count =
4519 			    ipst->ips_ip_ndp_unsolicit_count;
4520 		}
4521 	}
4522 	ncec->ncec_rcnt = ill->ill_xmit_count;
4523 	ncec->ncec_addr = *addr;
4524 	ncec->ncec_qd_mp = NULL;
4525 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4526 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4527 	ncec->ncec_trace_disable = B_FALSE;
4528 
4529 	/*
4530 	 * ncec_lladdr holds link layer address
4531 	 */
4532 	if (hw_addr_len > 0) {
4533 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4534 		if (template == NULL) {
4535 			err = ENOMEM;
4536 			goto err_ret;
4537 		}
4538 		ncec->ncec_lladdr = template;
4539 		ncec->ncec_lladdr_length = hw_addr_len;
4540 		bzero(ncec->ncec_lladdr, hw_addr_len);
4541 	}
4542 	if ((flags & NCE_F_BCAST) != 0) {
4543 		state = ND_REACHABLE;
4544 		ASSERT(hw_addr_len > 0);
4545 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4546 		state = ND_INITIAL;
4547 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4548 		/*
4549 		 * NORESOLVER entries are always created in the REACHABLE
4550 		 * state.
4551 		 */
4552 		state = ND_REACHABLE;
4553 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4554 		    ill->ill_mactype != DL_IPV4 &&
4555 		    ill->ill_mactype != DL_6TO4) {
4556 			/*
4557 			 * We create a nce_res_mp with the IP nexthop address
4558 			 * as the destination address if the physical length
4559 			 * is exactly 4 bytes for point-to-multipoint links
4560 			 * that do their own resolution from IP to link-layer
4561 			 * address (e.g. IP over X.25).
4562 			 */
4563 			bcopy((uchar_t *)addr,
4564 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4565 		}
4566 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4567 		    ill->ill_mactype != DL_IPV6) {
4568 			/*
4569 			 * We create a nce_res_mp with the IP nexthop address
4570 			 * as the destination address if the physical length
4571 			 * is exactly 16 bytes for point-to-multipoint links
4572 			 * that do their own resolution from IP to link-layer
4573 			 * address.
4574 			 */
4575 			bcopy((uchar_t *)addr,
4576 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4577 		}
4578 		/*
4579 		 * Since NUD is not part of the base IPv4 protocol definition,
4580 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4581 		 * age, and are marked NCE_F_NONUD.
4582 		 */
4583 		if (!ill->ill_isv6)
4584 			ncec->ncec_flags |= NCE_F_NONUD;
4585 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4586 		state = ND_REACHABLE;
4587 	}
4588 
4589 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4590 		/*
4591 		 * We are adding an ncec with a deterministic hw_addr,
4592 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4593 		 *
4594 		 * if we are adding a unicast ncec for the local address
4595 		 * it would be REACHABLE; we would be adding a ND_STALE entry
4596 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4597 		 * addresses are added in PROBE to trigger DAD.
4598 		 */
4599 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4600 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4601 			state = ND_REACHABLE;
4602 		else if (!NCE_PUBLISH(ncec))
4603 			state = ND_STALE;
4604 		else
4605 			state = ND_PROBE;
4606 		if (hw_addr != NULL)
4607 			nce_set_ll(ncec, hw_addr);
4608 	}
4609 	/* caller overrides internally computed state */
4610 	if (nce_state != ND_UNCHANGED)
4611 		state = nce_state;
4612 
4613 	if (state == ND_PROBE)
4614 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4615 
4616 	ncec->ncec_state = state;
4617 
4618 	if (state == ND_REACHABLE) {
4619 		ncec->ncec_last = ncec->ncec_init_time =
4620 		    TICK_TO_MSEC(ddi_get_lbolt64());
4621 	} else {
4622 		ncec->ncec_last = 0;
4623 		if (state == ND_INITIAL)
4624 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4625 	}
4626 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4627 	    offsetof(ncec_cb_t, ncec_cb_node));
4628 	/*
4629 	 * have all the memory allocations out of the way before taking locks
4630 	 * and adding the nce.
4631 	 */
4632 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4633 	if (nce == NULL) {
4634 		err = ENOMEM;
4635 		goto err_ret;
4636 	}
4637 	if (ncec->ncec_lladdr != NULL ||
4638 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4639 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4640 		    ill->ill_phys_addr_length, ill->ill_sap,
4641 		    ill->ill_sap_length);
4642 		if (dlur_mp == NULL) {
4643 			err = ENOMEM;
4644 			goto err_ret;
4645 		}
4646 	}
4647 
4648 	/*
4649 	 * Atomically ensure that the ill is not CONDEMNED, before
4650 	 * adding the NCE.
4651 	 */
4652 	mutex_enter(&ill->ill_lock);
4653 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4654 		mutex_exit(&ill->ill_lock);
4655 		err = EINVAL;
4656 		goto err_ret;
4657 	}
4658 	if (!NCE_MYADDR(ncec) &&
4659 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4660 		mutex_exit(&ill->ill_lock);
4661 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4662 		err = EINVAL;
4663 		goto err_ret;
4664 	}
4665 	/*
4666 	 * Acquire the ncec_lock even before adding the ncec to the list
4667 	 * so that it cannot get deleted after the ncec is added, but
4668 	 * before we add the nce.
4669 	 */
4670 	mutex_enter(&ncec->ncec_lock);
4671 	if ((ncec->ncec_next = *ncep) != NULL)
4672 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4673 	*ncep = ncec;
4674 	ncec->ncec_ptpn = ncep;
4675 
4676 	/* Bump up the number of ncec's referencing this ill */
4677 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4678 	    (char *), "ncec", (void *), ncec);
4679 	ill->ill_ncec_cnt++;
4680 	/*
4681 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4682 	 * condemned, and we can safely add the nce.
4683 	 */
4684 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4685 	mutex_exit(&ncec->ncec_lock);
4686 	mutex_exit(&ill->ill_lock);
4687 
4688 	/* caller must trigger fastpath on *retnce */
4689 	return (0);
4690 
4691 err_ret:
4692 	if (ncec != NULL)
4693 		kmem_cache_free(ncec_cache, ncec);
4694 	if (nce != NULL)
4695 		kmem_cache_free(nce_cache, nce);
4696 	freemsg(dlur_mp);
4697 	if (template != NULL)
4698 		kmem_free(template, ill->ill_phys_addr_length);
4699 	return (err);
4700 }
4701 
4702 /*
4703  * take a ref on the nce
4704  */
4705 void
4706 nce_refhold(nce_t *nce)
4707 {
4708 	mutex_enter(&nce->nce_lock);
4709 	nce->nce_refcnt++;
4710 	ASSERT((nce)->nce_refcnt != 0);
4711 	mutex_exit(&nce->nce_lock);
4712 }
4713 
4714 /*
4715  * Release a ref on the nce. In general, this
4716  * cannot be called with locks held, because dropping the last ref
4717  * may result in nce_inactive, which will take the ill_lock,
4718  * do ipif_ill_refrele_tail, etc. Thus the one exception
4719  * where this can be called with locks held is when the caller
4720  * is certain that the nce_refcnt is sufficient to prevent
4721  * the invocation of nce_inactive.
4722  */
4723 void
4724 nce_refrele(nce_t *nce)
4725 {
4726 	ASSERT((nce)->nce_refcnt != 0);
4727 	mutex_enter(&nce->nce_lock);
4728 	if (--nce->nce_refcnt == 0)
4729 		nce_inactive(nce); /* destroys the mutex */
4730 	else
4731 		mutex_exit(&nce->nce_lock);
4732 }
4733 
4734 /*
4735  * free the nce after all refs have gone away.
4736  */
4737 static void
4738 nce_inactive(nce_t *nce)
4739 {
4740 	ill_t *ill = nce->nce_ill;
4741 
4742 	ASSERT(nce->nce_refcnt == 0);
4743 
4744 	ncec_refrele_notr(nce->nce_common);
4745 	nce->nce_common = NULL;
4746 	freemsg(nce->nce_fp_mp);
4747 	freemsg(nce->nce_dlur_mp);
4748 
4749 	mutex_enter(&ill->ill_lock);
4750 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4751 	    (char *), "nce", (void *), nce);
4752 	ill->ill_nce_cnt--;
4753 	nce->nce_ill = NULL;
4754 	/*
4755 	 * If the number of nces associated with this ill has dropped
4756 	 * to zero, check whether we need to restart any operation that
4757 	 * is waiting for this to happen.
4758 	 */
4759 	if (ILL_DOWN_OK(ill)) {
4760 		/* ipif_ill_refrele_tail drops the ill_lock */
4761 		ipif_ill_refrele_tail(ill);
4762 	} else {
4763 		mutex_exit(&ill->ill_lock);
4764 	}
4765 
4766 	mutex_destroy(&nce->nce_lock);
4767 	kmem_cache_free(nce_cache, nce);
4768 }
4769 
4770 /*
4771  * Add an nce to the ill_nce list.
4772  */
4773 static nce_t *
4774 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4775 {
4776 	bzero(nce, sizeof (*nce));
4777 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4778 	nce->nce_common = ncec;
4779 	nce->nce_addr = ncec->ncec_addr;
4780 	nce->nce_ill = ill;
4781 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4782 	    (char *), "nce", (void *), nce);
4783 	ill->ill_nce_cnt++;
4784 
4785 	nce->nce_refcnt = 1; /* for the thread */
4786 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4787 	nce->nce_dlur_mp = dlur_mp;
4788 
4789 	/* add nce to the ill's fastpath list.  */
4790 	nce->nce_refcnt++; /* for the list */
4791 	list_insert_head(&ill->ill_nce, nce);
4792 	return (nce);
4793 }
4794 
4795 static nce_t *
4796 nce_add(ill_t *ill, ncec_t *ncec)
4797 {
4798 	nce_t	*nce;
4799 	mblk_t	*dlur_mp = NULL;
4800 
4801 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4802 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4803 
4804 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4805 	if (nce == NULL)
4806 		return (NULL);
4807 	if (ncec->ncec_lladdr != NULL ||
4808 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4809 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4810 		    ill->ill_phys_addr_length, ill->ill_sap,
4811 		    ill->ill_sap_length);
4812 		if (dlur_mp == NULL) {
4813 			kmem_cache_free(nce_cache, nce);
4814 			return (NULL);
4815 		}
4816 	}
4817 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
4818 }
4819 
4820 /*
4821  * Remove the nce from the ill's fastpath (ill_nce) list.
4822  */
4823 void
4824 nce_delete(nce_t *nce)
4825 {
4826 	ill_t	*ill = nce->nce_ill;
4827 
4828 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4829 
4830 	mutex_enter(&nce->nce_lock);
4831 	if (nce->nce_is_condemned) {
4832 		/*
4833 		 * some other thread has removed this nce from the ill_nce list
4834 		 */
4835 		mutex_exit(&nce->nce_lock);
4836 		return;
4837 	}
4838 	nce->nce_is_condemned = B_TRUE;
4839 	mutex_exit(&nce->nce_lock);
4840 
4841 	list_remove(&ill->ill_nce, nce);
4842 	/*
4843 	 * even though we are holding the ill_lock, it is ok to
4844 	 * call nce_refrele here because we know that we should have
4845 	 * at least 2 refs on the nce: one for the thread, and one
4846 	 * for the list. The refrele below will release the one for
4847 	 * the list.
4848 	 */
4849 	nce_refrele(nce);
4850 }
4851 
4852 nce_t *
4853 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4854 {
4855 	nce_t *nce = NULL;
4856 
4857 	ASSERT(ill != NULL);
4858 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4859 
4860 	for (nce = list_head(&ill->ill_nce); nce != NULL;
4861 	    nce = list_next(&ill->ill_nce, nce)) {
4862 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4863 			break;
4864 	}
4865 
4866 	/*
4867 	 * if we found the nce on the ill_nce list while holding
4868 	 * the ill_lock, then it cannot be condemned yet.
4869 	 */
4870 	if (nce != NULL) {
4871 		ASSERT(!nce->nce_is_condemned);
4872 		nce_refhold(nce);
4873 	}
4874 	return (nce);
4875 }
4876 
4877 /*
4878  * Walk the ill_nce list on ill. The callback function func() cannot perform
4879  * any destructive actions.
4880  */
4881 static void
4882 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4883 {
4884 	nce_t *nce = NULL, *nce_next;
4885 
4886 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4887 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4888 		nce_next = list_next(&ill->ill_nce, nce);
4889 		if (func(ill, nce, arg) != 0)
4890 			break;
4891 		nce = nce_next;
4892 	}
4893 }
4894 
4895 void
4896 nce_walk(ill_t *ill, pfi_t func, void *arg)
4897 {
4898 	mutex_enter(&ill->ill_lock);
4899 	nce_walk_common(ill, func, arg);
4900 	mutex_exit(&ill->ill_lock);
4901 }
4902 
4903 void
4904 nce_flush(ill_t *ill, boolean_t flushall)
4905 {
4906 	nce_t *nce, *nce_next;
4907 	list_t dead;
4908 
4909 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4910 	mutex_enter(&ill->ill_lock);
4911 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4912 		nce_next = list_next(&ill->ill_nce, nce);
4913 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4914 			nce = nce_next;
4915 			continue;
4916 		}
4917 		/*
4918 		 * nce_delete requires that the caller either not be
4919 		 * holding locks, or hold a ref to ensure that
4920 		 * we won't hit ncec_inactive. So take a ref and clean up
4921 		 * after the list is flushed.
4922 		 */
4923 		nce_refhold(nce);
4924 		nce_delete(nce);
4925 		list_insert_tail(&dead, nce);
4926 		nce = nce_next;
4927 	}
4928 	mutex_exit(&ill->ill_lock);
4929 	while ((nce = list_head(&dead)) != NULL) {
4930 		list_remove(&dead, nce);
4931 		nce_refrele(nce);
4932 	}
4933 	ASSERT(list_is_empty(&dead));
4934 	list_destroy(&dead);
4935 }
4936 
4937 /* Return a random interval in [1 .. intv] (initial) or intv +/- ~20% fuzz */
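/*
 * Worked example (illustrative numbers): with intv = 1000 on a non-initial
 * call, frac = 1000 / 5 = 200 and the result is roughly uniform in
 * [800 .. 1200]; on an initial call the result is roughly uniform in
 * [1 .. 1000].
 */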
4938 static clock_t
4939 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4940 {
4941 	clock_t rnd, frac;
4942 
4943 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4944 	/* Note that clock_t is signed; must chop off bits */
4945 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4946 	if (initial_time) {
4947 		if (intv <= 0)
4948 			intv = 1;
4949 		else
4950 			intv = (rnd % intv) + 1;
4951 	} else {
4952 		/* Compute 'frac' as 20% of the configured interval */
4953 		if ((frac = intv / 5) <= 1)
4954 			frac = 2;
4955 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
4956 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4957 			intv = 1;
4958 	}
4959 	return (intv);
4960 }
4961 
4962 void
4963 nce_resolv_ipmp_ok(ncec_t *ncec)
4964 {
4965 	mblk_t *mp;
4966 	uint_t pkt_len;
4967 	iaflags_t ixaflags = IXAF_NO_TRACE;
4968 	nce_t *under_nce;
4969 	ill_t	*ill = ncec->ncec_ill;
4970 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4971 	ipif_t *src_ipif = NULL;
4972 	ip_stack_t *ipst = ill->ill_ipst;
4973 	ill_t *send_ill;
4974 	uint_t nprobes;
4975 
4976 	ASSERT(IS_IPMP(ill));
4977 
4978 	mutex_enter(&ncec->ncec_lock);
4979 	nprobes = ncec->ncec_nprobes;
4980 	mp = ncec->ncec_qd_mp;
4981 	ncec->ncec_qd_mp = NULL;
4982 	ncec->ncec_nprobes = 0;
4983 	mutex_exit(&ncec->ncec_lock);
4984 
4985 	while (mp != NULL) {
4986 		mblk_t *nxt_mp;
4987 
4988 		nxt_mp = mp->b_next;
4989 		mp->b_next = NULL;
4990 		if (isv6) {
4991 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4992 
4993 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4994 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4995 			    ill, ALL_ZONES, ipst);
4996 		} else {
4997 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
4998 
4999 			ixaflags |= IXAF_IS_IPV4;
5000 			pkt_len = ntohs(ipha->ipha_length);
5001 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5002 			    ill, ALL_ZONES, ipst);
5003 		}
5004 
5005 		/*
5006 		 * find a new nce based on an under_ill. The first IPMP probe
5007 		 * packet gets queued, so we could still find a src_ipif that
5008 		 * matches an IPMP test address.
5009 		 */
5010 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5011 			/*
5012 			 * If src_ipif is null, this could be either a
5013 			 * forwarded packet or a probe whose src got deleted.
5014 			 * We distinguish the two by looking at ncec_nprobes:
5015 			 * the first ncec_nprobes queued packets are the
5016 			 * probes.
5017 			 */
5018 			if (src_ipif == NULL && nprobes > 0)
5019 				goto drop_pkt;
5020 
5021 			/*
5022 			 * For forwarded packets, we use the ipmp rotor
5023 			 * to find send_ill.
5024 			 */
5025 			send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
5026 			    B_TRUE);
5027 		} else {
5028 			send_ill = src_ipif->ipif_ill;
5029 			ill_refhold(send_ill);
5030 		}
5031 
5032 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5033 		    (ncec_t *), ncec, (ipif_t *),
5034 		    src_ipif, (ill_t *), send_ill);
5035 
5036 		if (send_ill == NULL) {
5037 			if (src_ipif != NULL)
5038 				ipif_refrele(src_ipif);
5039 			goto drop_pkt;
5040 		}
5041 		/* create an under_nce on send_ill */
5042 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5043 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5044 			under_nce = nce_fastpath_create(send_ill, ncec);
5045 		else
5046 			under_nce = NULL;
5047 		rw_exit(&ipst->ips_ill_g_lock);
5048 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5049 			nce_fastpath_trigger(under_nce);
5050 
5051 		ill_refrele(send_ill);
5052 		if (src_ipif != NULL)
5053 			ipif_refrele(src_ipif);
5054 
5055 		if (under_nce != NULL) {
5056 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5057 			    ALL_ZONES, 0, NULL);
5058 			nce_refrele(under_nce);
5059 			if (nprobes > 0)
5060 				nprobes--;
5061 			mp = nxt_mp;
5062 			continue;
5063 		}
5064 drop_pkt:
5065 		if (isv6) {
5066 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5067 		} else {
5068 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5069 		}
5070 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5071 		freemsg(mp);
5072 		if (nprobes > 0)
5073 			nprobes--;
5074 		mp = nxt_mp;
5075 	}
5076 	ncec_cb_dispatch(ncec); /* complete callbacks */
5077 }
5078