xref: /titanic_52/usr/src/uts/common/inet/ip/ip_ndp.c (revision 2dea4eed7ad1c66ae4770263aa2911815a8b86eb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 #include <sys/mac.h>
44 
45 #include <net/if.h>
46 #include <net/if_types.h>
47 #include <net/if_dl.h>
48 #include <net/route.h>
49 #include <netinet/in.h>
50 #include <netinet/ip6.h>
51 #include <netinet/icmp6.h>
52 
53 #include <inet/common.h>
54 #include <inet/mi.h>
55 #include <inet/mib2.h>
56 #include <inet/nd.h>
57 #include <inet/ip.h>
58 #include <inet/ip_impl.h>
59 #include <inet/ipclassifier.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_rts.h>
63 #include <inet/ip6.h>
64 #include <inet/ip_ndp.h>
65 #include <inet/sctp_ip.h>
66 #include <inet/ip_arp.h>
67 #include <inet/ip2mac_impl.h>
68 
/*
 * Per-address-family interval selectors.  Both expand to a member of the
 * ip_stack_t named `ipst', which must be in scope wherever they are used.
 * The argument is parenthesized so that an expression argument (for
 * example another conditional) cannot re-associate with the ?: below.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
79 
/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * True when the first two bytes at `ptr' are 169 and 254.  `ptr' is
 * parenthesized before the cast so that an expression argument such as
 * `p + 1' is converted as a whole; note that it is evaluated twice, so it
 * must not have side effects.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
				((uchar_t *)(ptr))[1] == 254)
86 
/*
 * NCE_EXTERNAL_FLAGS_MASK is the set of ncec_flags bits that callers are
 * allowed to hand to the ncec*add* functions.
 *
 * Two of them deserve a note:
 *   NCE_F_AUTHORITY - incoming adverts for the mapping are ignored (DAD is
 *                     still performed for the mapping).
 *   NCE_F_PUBLISH   - we answer requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK	(					\
	NCE_F_MYADDR |							\
	NCE_F_ISROUTER |						\
	NCE_F_NONUD |							\
	NCE_F_ANYCAST |							\
	NCE_F_UNSOL_ADV |						\
	NCE_F_BCAST |							\
	NCE_F_MCAST |							\
	NCE_F_AUTHORITY |						\
	NCE_F_PUBLISH |							\
	NCE_F_STATIC)
99 
100 /*
101  * Lock ordering:
102  *
103  *	ndp_g_lock -> ill_lock -> ncec_lock
104  *
105  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
106  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
107  * ncec_refcnt).
108  */
109 
110 static	void	nce_cleanup_list(ncec_t *ncec);
111 static	void 	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
112 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
113     ncec_t *);
114 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
115 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
116     uint16_t ncec_flags, nce_t **newnce);
117 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
118     uint16_t ncec_flags, nce_t **newnce);
119 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
120     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
121     const in6_addr_t *target, int flag);
122 static void	ncec_refhold_locked(ncec_t *);
123 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
124 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
125 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
126     uint16_t, uint16_t, nce_t **);
127 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
128 static nce_t *nce_add(ill_t *, ncec_t *);
129 static void nce_inactive(nce_t *);
130 extern nce_t 	*nce_lookup(ill_t *, const in6_addr_t *);
131 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
132 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
133     uint16_t, uint16_t, nce_t **);
134 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
135     uint16_t, uint16_t, nce_t **);
136 static int  nce_add_v6_postprocess(nce_t *);
137 static int  nce_add_v4_postprocess(nce_t *);
138 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
139 static clock_t nce_fuzz_interval(clock_t, boolean_t);
140 static void nce_resolv_ipmp_ok(ncec_t *);
141 static void nce_walk_common(ill_t *, pfi_t, void *);
142 static void nce_start_timer(ncec_t *, uint_t);
143 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
144 static void nce_fastpath_trigger(nce_t *);
145 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
146 
147 #ifdef DEBUG
148 static void	ncec_trace_cleanup(const ncec_t *);
149 #endif
150 
/*
 * Yield the address of the nce_hash_tbl slot that `addr' hashes to, in the
 * IPv4 (ips_ndp4) or IPv6 (ips_ndp6) table of the stack `ipst'.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&(ipst)->ips_ndp4->nce_hash_tbl[				\
	    IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])

#define	NCE_HASH_PTR_V6(ipst, addr)					\
	(&(ipst)->ips_ndp6->nce_hash_tbl[				\
	    NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)])
157 
158 extern kmem_cache_t *ncec_cache;
159 extern kmem_cache_t *nce_cache;
160 
161 /*
162  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
163  * If src_ill is not null, the ncec_addr is bound to src_ill. The
164  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
165  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
166  * IPMP cast_ill (in the IPMP case).
167  *
168  * Note that the probe interval is based on ncec->ncec_ill which
169  * may be the ipmp_ill.
170  */
171 static void
172 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
173 {
174 	boolean_t dropped;
175 	uint32_t probe_interval;
176 
177 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
178 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
179 	if (ncec->ncec_ipversion == IPV6_VERSION) {
180 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
181 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
182 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
183 		probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill);
184 	} else {
185 		/* IPv4 DAD delay the initial probe. */
186 		if (send_probe)
187 			dropped = arp_probe(ncec);
188 		else
189 			dropped = B_TRUE;
190 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
191 		    !send_probe);
192 	}
193 	if (!dropped) {
194 		mutex_enter(&ncec->ncec_lock);
195 		ncec->ncec_pcnt--;
196 		mutex_exit(&ncec->ncec_lock);
197 	}
198 	nce_restart_timer(ncec, probe_interval);
199 }
200 
201 /*
202  * Compute default flags to use for an advertisement of this ncec's address.
203  */
204 static int
205 nce_advert_flags(const ncec_t *ncec)
206 {
207 	int flag = 0;
208 
209 	if (ncec->ncec_flags & NCE_F_ISROUTER)
210 		flag |= NDP_ISROUTER;
211 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
212 		flag |= NDP_ORIDE;
213 
214 	return (flag);
215 }
216 
217 /*
218  * NDP Cache Entry creation routine.
219  * This routine must always be called with ndp6->ndp_g_lock held.
220  */
221 int
222 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
223     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
224 {
225 	int		err;
226 	nce_t		*nce;
227 
228 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
229 	ASSERT(ill != NULL && ill->ill_isv6);
230 
231 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
232 	    &nce);
233 	if (err != 0)
234 		return (err);
235 	ASSERT(newnce != NULL);
236 	*newnce = nce;
237 	return (err);
238 }
239 
240 /*
241  * Post-processing routine to be executed after nce_add_v6(). This function
242  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
243  * and must be called without any locks held.
244  */
245 int
246 nce_add_v6_postprocess(nce_t *nce)
247 {
248 	ncec_t		*ncec = nce->nce_common;
249 	boolean_t	dropped = B_FALSE;
250 	uchar_t		*hw_addr = ncec->ncec_lladdr;
251 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
252 	ill_t		*ill = ncec->ncec_ill;
253 	int		err = 0;
254 	uint16_t	flags = ncec->ncec_flags;
255 	ip_stack_t	*ipst = ill->ill_ipst;
256 	boolean_t	trigger_fastpath = B_TRUE;
257 
258 	/*
259 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
260 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
261 	 * We call nce_fastpath from nce_update if the link layer address of
262 	 * the peer changes from nce_update
263 	 */
264 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
265 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
266 		trigger_fastpath = B_FALSE;
267 
268 	if (trigger_fastpath)
269 		nce_fastpath_trigger(nce);
270 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
271 		ill_t *hwaddr_ill;
272 		/*
273 		 * Unicast entry that needs DAD.
274 		 */
275 		if (IS_IPMP(ill)) {
276 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
277 			    hw_addr, hw_addr_len);
278 		} else {
279 			hwaddr_ill = ill;
280 		}
281 		nce_dad(ncec, hwaddr_ill, B_TRUE);
282 		err = EINPROGRESS;
283 	} else if (flags & NCE_F_UNSOL_ADV) {
284 		/*
285 		 * We account for the transmit below by assigning one
286 		 * less than the ndd variable. Subsequent decrements
287 		 * are done in nce_timer.
288 		 */
289 		mutex_enter(&ncec->ncec_lock);
290 		ncec->ncec_unsolicit_count =
291 		    ipst->ips_ip_ndp_unsolicit_count - 1;
292 		mutex_exit(&ncec->ncec_lock);
293 		dropped = ndp_xmit(ill,
294 		    ND_NEIGHBOR_ADVERT,
295 		    hw_addr,
296 		    hw_addr_len,
297 		    &ncec->ncec_addr,	/* Source and target of the adv */
298 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
299 		    nce_advert_flags(ncec));
300 		mutex_enter(&ncec->ncec_lock);
301 		if (dropped)
302 			ncec->ncec_unsolicit_count++;
303 		else
304 			ncec->ncec_last_time_defended = ddi_get_lbolt();
305 		if (ncec->ncec_unsolicit_count != 0) {
306 			nce_start_timer(ncec,
307 			    ipst->ips_ip_ndp_unsolicit_interval);
308 		}
309 		mutex_exit(&ncec->ncec_lock);
310 	}
311 	return (err);
312 }
313 
314 /*
315  * Atomically lookup and add (if needed) Neighbor Cache information for
316  * an address.
317  *
318  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
319  * are always added pointing at the ipmp_ill. Thus, when the ill passed
320  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
321  * entries will be created, both pointing at the same ncec_t. The nce_t
322  * entries will have their nce_ill set to the ipmp_ill and the under_ill
323  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
324  * Local addresses are always created on the ill passed to nce_add_v6.
325  */
326 int
327 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
328     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
329 {
330 	int		err = 0;
331 	ip_stack_t	*ipst = ill->ill_ipst;
332 	nce_t		*nce, *upper_nce = NULL;
333 	ill_t		*in_ill = ill;
334 	boolean_t	need_ill_refrele = B_FALSE;
335 
336 	if (flags & NCE_F_MCAST) {
337 		/*
338 		 * hw_addr will be figured out in nce_set_multicast_v6;
339 		 * caller has to select the cast_ill
340 		 */
341 		ASSERT(hw_addr == NULL);
342 		ASSERT(!IS_IPMP(ill));
343 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
344 		return (err);
345 	}
346 	ASSERT(ill->ill_isv6);
347 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
348 		ill = ipmp_ill_hold_ipmp_ill(ill);
349 		if (ill == NULL)
350 			return (ENXIO);
351 		need_ill_refrele = B_TRUE;
352 	}
353 
354 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
355 	nce = nce_lookup_addr(ill, addr);
356 	if (nce == NULL) {
357 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
358 		    &nce);
359 	} else {
360 		err = EEXIST;
361 	}
362 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
363 	if (err == 0)
364 		err = nce_add_v6_postprocess(nce);
365 	if (in_ill != ill && nce != NULL) {
366 		nce_t *under_nce = NULL;
367 
368 		/*
369 		 * in_ill was the under_ill. Try to create the under_nce.
370 		 * Hold the ill_g_lock to prevent changes to group membership
371 		 * until we are done.
372 		 */
373 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
374 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
375 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
376 			    ill_t *, ill);
377 			rw_exit(&ipst->ips_ill_g_lock);
378 			err = ENXIO;
379 			nce_refrele(nce);
380 			nce = NULL;
381 			goto bail;
382 		}
383 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
384 		if (under_nce == NULL) {
385 			rw_exit(&ipst->ips_ill_g_lock);
386 			err = EINVAL;
387 			nce_refrele(nce);
388 			nce = NULL;
389 			goto bail;
390 		}
391 		rw_exit(&ipst->ips_ill_g_lock);
392 		upper_nce = nce;
393 		nce = under_nce; /* will be returned to caller */
394 		if (NCE_ISREACHABLE(nce->nce_common))
395 			nce_fastpath_trigger(under_nce);
396 	}
397 	/* nce_refrele is deferred until the lock is dropped  */
398 	if (nce != NULL) {
399 		if (newnce != NULL)
400 			*newnce = nce;
401 		else
402 			nce_refrele(nce);
403 	}
404 bail:
405 	if (upper_nce != NULL)
406 		nce_refrele(upper_nce);
407 	if (need_ill_refrele)
408 		ill_refrele(ill);
409 	return (err);
410 }
411 
412 /*
413  * Remove all the CONDEMNED nces from the appropriate hash table.
414  * We create a private list of NCEs, these may have ires pointing
415  * to them, so the list will be passed through to clean up dependent
416  * ires and only then we can do ncec_refrele() which can make NCE inactive.
417  */
418 static void
419 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
420 {
421 	ncec_t *ncec1;
422 	ncec_t **ptpn;
423 
424 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
425 	ASSERT(ndp->ndp_g_walker == 0);
426 	for (; ncec; ncec = ncec1) {
427 		ncec1 = ncec->ncec_next;
428 		mutex_enter(&ncec->ncec_lock);
429 		if (NCE_ISCONDEMNED(ncec)) {
430 			ptpn = ncec->ncec_ptpn;
431 			ncec1 = ncec->ncec_next;
432 			if (ncec1 != NULL)
433 				ncec1->ncec_ptpn = ptpn;
434 			*ptpn = ncec1;
435 			ncec->ncec_ptpn = NULL;
436 			ncec->ncec_next = NULL;
437 			ncec->ncec_next = *free_nce_list;
438 			*free_nce_list = ncec;
439 		}
440 		mutex_exit(&ncec->ncec_lock);
441 	}
442 }
443 
444 /*
445  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
446  *    will return this NCE. Also no new timeouts will
447  *    be started (See nce_restart_timer).
448  * 2. Cancel any currently running timeouts.
449  * 3. If there is an ndp walker, return. The walker will do the cleanup.
450  *    This ensures that walkers see a consistent list of NCEs while walking.
451  * 4. Otherwise remove the NCE from the list of NCEs
452  */
453 void
454 ncec_delete(ncec_t *ncec)
455 {
456 	ncec_t	**ptpn;
457 	ncec_t	*ncec1;
458 	int	ipversion = ncec->ncec_ipversion;
459 	ndp_g_t *ndp;
460 	ip_stack_t	*ipst = ncec->ncec_ipst;
461 
462 	if (ipversion == IPV4_VERSION)
463 		ndp = ipst->ips_ndp4;
464 	else
465 		ndp = ipst->ips_ndp6;
466 
467 	/* Serialize deletes */
468 	mutex_enter(&ncec->ncec_lock);
469 	if (NCE_ISCONDEMNED(ncec)) {
470 		/* Some other thread is doing the delete */
471 		mutex_exit(&ncec->ncec_lock);
472 		return;
473 	}
474 	/*
475 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
476 	 * refcnt has to be >= 2
477 	 */
478 	ASSERT(ncec->ncec_refcnt >= 2);
479 	ncec->ncec_flags |= NCE_F_CONDEMNED;
480 	mutex_exit(&ncec->ncec_lock);
481 
482 	/* Count how many condemned ires for kmem_cache callback */
483 	atomic_add_32(&ipst->ips_num_nce_condemned, 1);
484 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
485 
486 	/* Complete any waiting callbacks */
487 	ncec_cb_dispatch(ncec);
488 
489 	/*
490 	 * Cancel any running timer. Timeout can't be restarted
491 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
492 	 * Passing invalid timeout id is fine.
493 	 */
494 	if (ncec->ncec_timeout_id != 0) {
495 		(void) untimeout(ncec->ncec_timeout_id);
496 		ncec->ncec_timeout_id = 0;
497 	}
498 
499 	mutex_enter(&ndp->ndp_g_lock);
500 	if (ncec->ncec_ptpn == NULL) {
501 		/*
502 		 * The last ndp walker has already removed this ncec from
503 		 * the list after we marked the ncec CONDEMNED and before
504 		 * we grabbed the global lock.
505 		 */
506 		mutex_exit(&ndp->ndp_g_lock);
507 		return;
508 	}
509 	if (ndp->ndp_g_walker > 0) {
510 		/*
511 		 * Can't unlink. The walker will clean up
512 		 */
513 		ndp->ndp_g_walker_cleanup = B_TRUE;
514 		mutex_exit(&ndp->ndp_g_lock);
515 		return;
516 	}
517 
518 	/*
519 	 * Now remove the ncec from the list. nce_restart_timer won't restart
520 	 * the timer since it is marked CONDEMNED.
521 	 */
522 	ptpn = ncec->ncec_ptpn;
523 	ncec1 = ncec->ncec_next;
524 	if (ncec1 != NULL)
525 		ncec1->ncec_ptpn = ptpn;
526 	*ptpn = ncec1;
527 	ncec->ncec_ptpn = NULL;
528 	ncec->ncec_next = NULL;
529 	mutex_exit(&ndp->ndp_g_lock);
530 
531 	/* Removed from ncec_ptpn/ncec_next list */
532 	ncec_refrele_notr(ncec);
533 }
534 
535 void
536 ncec_inactive(ncec_t *ncec)
537 {
538 	mblk_t		**mpp;
539 	ill_t		*ill = ncec->ncec_ill;
540 	ip_stack_t	*ipst = ncec->ncec_ipst;
541 
542 	ASSERT(ncec->ncec_refcnt == 0);
543 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
544 
545 	/* Count how many condemned nces for kmem_cache callback */
546 	if (NCE_ISCONDEMNED(ncec))
547 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
548 
549 	/* Free all allocated messages */
550 	mpp = &ncec->ncec_qd_mp;
551 	while (*mpp != NULL) {
552 		mblk_t  *mp;
553 
554 		mp = *mpp;
555 		*mpp = mp->b_next;
556 
557 		inet_freemsg(mp);
558 	}
559 	/*
560 	 * must have been cleaned up in ncec_delete
561 	 */
562 	ASSERT(list_is_empty(&ncec->ncec_cb));
563 	list_destroy(&ncec->ncec_cb);
564 	/*
565 	 * free the ncec_lladdr if one was allocated in nce_add_common()
566 	 */
567 	if (ncec->ncec_lladdr_length > 0)
568 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
569 
570 #ifdef DEBUG
571 	ncec_trace_cleanup(ncec);
572 #endif
573 
574 	mutex_enter(&ill->ill_lock);
575 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
576 	    (char *), "ncec", (void *), ncec);
577 	ill->ill_ncec_cnt--;
578 	ncec->ncec_ill = NULL;
579 	/*
580 	 * If the number of ncec's associated with this ill have dropped
581 	 * to zero, check whether we need to restart any operation that
582 	 * is waiting for this to happen.
583 	 */
584 	if (ILL_DOWN_OK(ill)) {
585 		/* ipif_ill_refrele_tail drops the ill_lock */
586 		ipif_ill_refrele_tail(ill);
587 	} else {
588 		mutex_exit(&ill->ill_lock);
589 	}
590 
591 	mutex_destroy(&ncec->ncec_lock);
592 	kmem_cache_free(ncec_cache, ncec);
593 }
594 
595 /*
596  * ncec_walk routine.  Delete the ncec if it is associated with the ill
597  * that is going away.  Always called as a writer.
598  */
599 void
600 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
601 {
602 	if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
603 		ncec_delete(ncec);
604 	}
605 }
606 
607 /*
608  * Neighbor Cache cleanup logic for a list of ncec_t entries.
609  */
610 static void
611 nce_cleanup_list(ncec_t *ncec)
612 {
613 	ncec_t *ncec_next;
614 
615 	ASSERT(ncec != NULL);
616 	while (ncec != NULL) {
617 		ncec_next = ncec->ncec_next;
618 		ncec->ncec_next = NULL;
619 
620 		/*
621 		 * It is possible for the last ndp walker (this thread)
622 		 * to come here after ncec_delete has marked the ncec CONDEMNED
623 		 * and before it has removed the ncec from the fastpath list
624 		 * or called untimeout. So we need to do it here. It is safe
625 		 * for both ncec_delete and this thread to do it twice or
626 		 * even simultaneously since each of the threads has a
627 		 * reference on the ncec.
628 		 */
629 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
630 		/*
631 		 * Cancel any running timer. Timeout can't be restarted
632 		 * since CONDEMNED is set. The ncec_lock can't be
633 		 * held across untimeout though passing invalid timeout
634 		 * id is fine.
635 		 */
636 		if (ncec->ncec_timeout_id != 0) {
637 			(void) untimeout(ncec->ncec_timeout_id);
638 			ncec->ncec_timeout_id = 0;
639 		}
640 		/* Removed from ncec_ptpn/ncec_next list */
641 		ncec_refrele_notr(ncec);
642 		ncec = ncec_next;
643 	}
644 }
645 
646 /*
647  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
648  */
649 boolean_t
650 nce_restart_dad(ncec_t *ncec)
651 {
652 	boolean_t started;
653 	ill_t *ill, *hwaddr_ill;
654 
655 	if (ncec == NULL)
656 		return (B_FALSE);
657 	ill = ncec->ncec_ill;
658 	mutex_enter(&ncec->ncec_lock);
659 	if (ncec->ncec_state == ND_PROBE) {
660 		mutex_exit(&ncec->ncec_lock);
661 		started = B_TRUE;
662 	} else if (ncec->ncec_state == ND_REACHABLE) {
663 		ASSERT(ncec->ncec_lladdr != NULL);
664 		ncec->ncec_state = ND_PROBE;
665 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
666 		/*
667 		 * Slight cheat here: we don't use the initial probe delay
668 		 * for IPv4 in this obscure case.
669 		 */
670 		mutex_exit(&ncec->ncec_lock);
671 		if (IS_IPMP(ill)) {
672 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
673 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
674 		} else {
675 			hwaddr_ill = ill;
676 		}
677 		nce_dad(ncec, hwaddr_ill, B_TRUE);
678 		started = B_TRUE;
679 	} else {
680 		mutex_exit(&ncec->ncec_lock);
681 		started = B_FALSE;
682 	}
683 	return (started);
684 }
685 
686 /*
687  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
688  * If one is found, the refcnt on the ncec will be incremented.
689  */
690 ncec_t *
691 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
692 {
693 	ncec_t		*ncec;
694 	ip_stack_t	*ipst = ill->ill_ipst;
695 
696 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
697 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
698 
699 	/* Get head of v6 hash table */
700 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
701 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
702 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
703 	rw_exit(&ipst->ips_ill_g_lock);
704 	return (ncec);
705 }
706 /*
707  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
708  * If one is found, the refcnt on the ncec will be incremented.
709  */
710 ncec_t *
711 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
712 {
713 	ncec_t	*ncec = NULL;
714 	in6_addr_t addr6;
715 	ip_stack_t *ipst = ill->ill_ipst;
716 
717 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
718 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
719 
720 	/* Get head of v4 hash table */
721 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
722 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
723 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
724 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
725 	rw_exit(&ipst->ips_ill_g_lock);
726 	return (ncec);
727 }
728 
729 /*
730  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
731  * If an ncec is found, increment the hold count on that ncec.
732  * The caller passes in the start of the appropriate hash table, and must
733  * be holding the appropriate global lock (ndp_g_lock). In addition, since
734  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
735  * must be held as reader.
736  *
737  * This function always matches across the ipmp group.
738  */
739 ncec_t *
740 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
741 {
742 	ndp_g_t		*ndp;
743 	ip_stack_t	*ipst = ill->ill_ipst;
744 
745 	if (ill->ill_isv6)
746 		ndp = ipst->ips_ndp6;
747 	else
748 		ndp = ipst->ips_ndp4;
749 
750 	ASSERT(ill != NULL);
751 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
752 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
753 		return (NULL);
754 	for (; ncec != NULL; ncec = ncec->ncec_next) {
755 		if (ncec->ncec_ill == ill ||
756 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
757 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
758 				mutex_enter(&ncec->ncec_lock);
759 				if (!NCE_ISCONDEMNED(ncec)) {
760 					ncec_refhold_locked(ncec);
761 					mutex_exit(&ncec->ncec_lock);
762 					break;
763 				}
764 				mutex_exit(&ncec->ncec_lock);
765 			}
766 		}
767 	}
768 	return (ncec);
769 }
770 
771 /*
772  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
773  * entries for ill only, i.e., when ill is part of an ipmp group,
774  * nce_lookup_v4 will never try to match across the group.
775  */
776 nce_t *
777 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
778 {
779 	nce_t *nce;
780 	in6_addr_t addr6;
781 	ip_stack_t *ipst = ill->ill_ipst;
782 
783 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
784 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
785 	nce = nce_lookup_addr(ill, &addr6);
786 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
787 	return (nce);
788 }
789 
790 /*
791  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
792  * entries for ill only, i.e., when ill is part of an ipmp group,
793  * nce_lookup_v6 will never try to match across the group.
794  */
795 nce_t *
796 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
797 {
798 	nce_t *nce;
799 	ip_stack_t *ipst = ill->ill_ipst;
800 
801 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
802 	nce = nce_lookup_addr(ill, addr6);
803 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
804 	return (nce);
805 }
806 
807 static nce_t *
808 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
809 {
810 	nce_t *nce;
811 
812 	ASSERT(ill != NULL);
813 #ifdef DEBUG
814 	if (ill->ill_isv6)
815 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
816 	else
817 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
818 #endif
819 	mutex_enter(&ill->ill_lock);
820 	nce = nce_lookup(ill, addr);
821 	mutex_exit(&ill->ill_lock);
822 	return (nce);
823 }
824 
825 
826 /*
827  * Router turned to host.  We need to make sure that cached copies of the ncec
828  * are not used for forwarding packets if they were derived from the default
829  * route, and that the default route itself is removed, as  required by
830  * section 7.2.5 of RFC 2461.
831  *
832  * Note that the ncec itself probably has valid link-layer information for the
833  * nexthop, so that there is no reason to delete the ncec, as long as the
834  * ISROUTER flag is turned off.
835  */
836 static void
837 ncec_router_to_host(ncec_t *ncec)
838 {
839 	ire_t		*ire;
840 	ip_stack_t	*ipst = ncec->ncec_ipst;
841 
842 	mutex_enter(&ncec->ncec_lock);
843 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
844 	mutex_exit(&ncec->ncec_lock);
845 
846 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
847 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
848 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
849 	if (ire != NULL) {
850 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
851 		ire_delete(ire);
852 		ire_refrele(ire);
853 	}
854 }
855 
856 /*
857  * Process passed in parameters either from an incoming packet or via
858  * user ioctl.
859  */
860 void
861 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
862 {
863 	ill_t	*ill = ncec->ncec_ill;
864 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
865 	boolean_t ll_updated = B_FALSE;
866 	boolean_t ll_changed;
867 	nce_t	*nce;
868 
869 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
870 	/*
871 	 * No updates of link layer address or the neighbor state is
872 	 * allowed, when the cache is in NONUD state.  This still
873 	 * allows for responding to reachability solicitation.
874 	 */
875 	mutex_enter(&ncec->ncec_lock);
876 	if (ncec->ncec_state == ND_INCOMPLETE) {
877 		if (hw_addr == NULL) {
878 			mutex_exit(&ncec->ncec_lock);
879 			return;
880 		}
881 		nce_set_ll(ncec, hw_addr);
882 		/*
883 		 * Update ncec state and send the queued packets
884 		 * back to ip this time ire will be added.
885 		 */
886 		if (flag & ND_NA_FLAG_SOLICITED) {
887 			nce_update(ncec, ND_REACHABLE, NULL);
888 		} else {
889 			nce_update(ncec, ND_STALE, NULL);
890 		}
891 		mutex_exit(&ncec->ncec_lock);
892 		nce = nce_fastpath(ncec, B_TRUE, NULL);
893 		nce_resolv_ok(ncec);
894 		if (nce != NULL)
895 			nce_refrele(nce);
896 		return;
897 	}
898 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
899 	if (!is_adv) {
900 		/* If this is a SOLICITATION request only */
901 		if (ll_changed)
902 			nce_update(ncec, ND_STALE, hw_addr);
903 		mutex_exit(&ncec->ncec_lock);
904 		ncec_cb_dispatch(ncec);
905 		return;
906 	}
907 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
908 		/* If in any other state than REACHABLE, ignore */
909 		if (ncec->ncec_state == ND_REACHABLE) {
910 			nce_update(ncec, ND_STALE, NULL);
911 		}
912 		mutex_exit(&ncec->ncec_lock);
913 		ncec_cb_dispatch(ncec);
914 		return;
915 	} else {
916 		if (ll_changed) {
917 			nce_update(ncec, ND_UNCHANGED, hw_addr);
918 			ll_updated = B_TRUE;
919 		}
920 		if (flag & ND_NA_FLAG_SOLICITED) {
921 			nce_update(ncec, ND_REACHABLE, NULL);
922 		} else {
923 			if (ll_updated) {
924 				nce_update(ncec, ND_STALE, NULL);
925 			}
926 		}
927 		mutex_exit(&ncec->ncec_lock);
928 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
929 		    NCE_F_ISROUTER)) {
930 			ncec_router_to_host(ncec);
931 		} else {
932 			ncec_cb_dispatch(ncec);
933 		}
934 	}
935 }
936 
937 /*
938  * Pass arg1 to the pfi supplied, along with each ncec in existence.
939  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
940  * walking the hash list.
941  */
942 void
943 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
944     boolean_t trace)
945 {
946 	ncec_t	*ncec;
947 	ncec_t	*ncec1;
948 	ncec_t	**ncep;
949 	ncec_t	*free_nce_list = NULL;
950 
951 	mutex_enter(&ndp->ndp_g_lock);
952 	/* Prevent ncec_delete from unlink and free of NCE */
953 	ndp->ndp_g_walker++;
954 	mutex_exit(&ndp->ndp_g_lock);
955 	for (ncep = ndp->nce_hash_tbl;
956 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
957 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
958 			ncec1 = ncec->ncec_next;
959 			if (ill == NULL || ncec->ncec_ill == ill) {
960 				if (trace) {
961 					ncec_refhold(ncec);
962 					(*pfi)(ncec, arg1);
963 					ncec_refrele(ncec);
964 				} else {
965 					ncec_refhold_notr(ncec);
966 					(*pfi)(ncec, arg1);
967 					ncec_refrele_notr(ncec);
968 				}
969 			}
970 		}
971 	}
972 	mutex_enter(&ndp->ndp_g_lock);
973 	ndp->ndp_g_walker--;
974 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
975 		/* Time to delete condemned entries */
976 		for (ncep = ndp->nce_hash_tbl;
977 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
978 			ncec = *ncep;
979 			if (ncec != NULL) {
980 				nce_remove(ndp, ncec, &free_nce_list);
981 			}
982 		}
983 		ndp->ndp_g_walker_cleanup = B_FALSE;
984 	}
985 
986 	mutex_exit(&ndp->ndp_g_lock);
987 
988 	if (free_nce_list != NULL) {
989 		nce_cleanup_list(free_nce_list);
990 	}
991 }
992 
993 /*
994  * Walk everything.
995  * Note that ill can be NULL hence can't derive the ipst from it.
996  */
997 void
998 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
999 {
1000 	ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1001 	ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1002 }
1003 
1004 /*
1005  * For each interface an entry is added for the unspecified multicast group.
1006  * Here that mapping is used to form the multicast cache entry for a particular
1007  * multicast destination.
1008  */
1009 static int
1010 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1011     uint16_t flags, nce_t **newnce)
1012 {
1013 	uchar_t		*hw_addr;
1014 	int		err = 0;
1015 	ip_stack_t	*ipst = ill->ill_ipst;
1016 	nce_t		*nce;
1017 
1018 	ASSERT(ill != NULL);
1019 	ASSERT(ill->ill_isv6);
1020 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1021 
1022 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1023 	nce = nce_lookup_addr(ill, dst);
1024 	if (nce != NULL) {
1025 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1026 		goto done;
1027 	}
1028 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1029 		/*
1030 		 * For IRE_IF_RESOLVER a hardware mapping can be
1031 		 * generated.
1032 		 */
1033 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1034 		if (hw_addr == NULL) {
1035 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1036 			return (ENOMEM);
1037 		}
1038 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1039 	} else {
1040 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
1041 		hw_addr = NULL;
1042 	}
1043 	ASSERT((flags & NCE_F_MCAST) != 0);
1044 	ASSERT((flags & NCE_F_NONUD) != 0);
1045 	/* nce_state will be computed by nce_add_common() */
1046 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1047 	    ND_UNCHANGED, &nce);
1048 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1049 	if (err == 0)
1050 		err = nce_add_v6_postprocess(nce);
1051 	if (hw_addr != NULL)
1052 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1053 	if (err != 0) {
1054 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1055 		return (err);
1056 	}
1057 done:
1058 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1059 	if (newnce != NULL)
1060 		*newnce = nce;
1061 	else
1062 		nce_refrele(nce);
1063 	return (0);
1064 }
1065 
1066 /*
1067  * Return the link layer address, and any flags of a ncec.
1068  */
1069 int
1070 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1071 {
1072 	ncec_t		*ncec;
1073 	in6_addr_t	*addr;
1074 	sin6_t		*sin6;
1075 
1076 	ASSERT(ill != NULL && ill->ill_isv6);
1077 	sin6 = (sin6_t *)&lnr->lnr_addr;
1078 	addr =  &sin6->sin6_addr;
1079 
1080 	/*
1081 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1082 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1083 	 * addresses for the data addresses on an IPMP interface even though
1084 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1085 	 */
1086 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1087 	if (ncec == NULL)
1088 		return (ESRCH);
1089 	/* If no link layer address is available yet, return ESRCH */
1090 	if (!NCE_ISREACHABLE(ncec)) {
1091 		ncec_refrele(ncec);
1092 		return (ESRCH);
1093 	}
1094 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1095 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1096 	    lnr->lnr_hdw_len);
1097 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1098 		lnr->lnr_flags = NDF_ISROUTER_ON;
1099 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1100 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1101 	ncec_refrele(ncec);
1102 	return (0);
1103 }
1104 
1105 /*
1106  * Finish setting up the Enable/Disable multicast for the driver.
1107  */
1108 mblk_t *
1109 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1110     uint32_t hw_addr_offset, mblk_t *mp)
1111 {
1112 	uchar_t		*hw_addr;
1113 	ipaddr_t	v4group;
1114 	uchar_t		*addr;
1115 
1116 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1117 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1118 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1119 
1120 		ASSERT(CLASSD(v4group));
1121 		ASSERT(!(ill->ill_isv6));
1122 
1123 		addr = (uchar_t *)&v4group;
1124 	} else {
1125 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1126 		ASSERT(ill->ill_isv6);
1127 
1128 		addr = (uchar_t *)v6group;
1129 	}
1130 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1131 	if (hw_addr == NULL) {
1132 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1133 		freemsg(mp);
1134 		return (NULL);
1135 	}
1136 
1137 	ip_mcast_mapping(ill, addr, hw_addr);
1138 	return (mp);
1139 }
1140 
1141 void
1142 ip_ndp_resolve(ncec_t *ncec)
1143 {
1144 	in_addr_t	sender4 = INADDR_ANY;
1145 	in6_addr_t	sender6 = ipv6_all_zeros;
1146 	ill_t		*src_ill;
1147 	uint32_t	ms;
1148 
1149 	src_ill = nce_resolve_src(ncec, &sender6);
1150 	if (src_ill == NULL) {
1151 		/* Make sure we try again later */
1152 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1153 		nce_restart_timer(ncec, (clock_t)ms);
1154 		return;
1155 	}
1156 	if (ncec->ncec_ipversion == IPV4_VERSION)
1157 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1158 	mutex_enter(&ncec->ncec_lock);
1159 	if (ncec->ncec_ipversion == IPV6_VERSION)
1160 		ms = ndp_solicit(ncec, sender6, src_ill);
1161 	else
1162 		ms = arp_request(ncec, sender4, src_ill);
1163 	mutex_exit(&ncec->ncec_lock);
1164 	if (ms == 0) {
1165 		if (ncec->ncec_state != ND_REACHABLE) {
1166 			if (ncec->ncec_ipversion == IPV6_VERSION)
1167 				ndp_resolv_failed(ncec);
1168 			else
1169 				arp_resolv_failed(ncec);
1170 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1171 			nce_make_unreachable(ncec);
1172 			ncec_delete(ncec);
1173 		}
1174 	} else {
1175 		nce_restart_timer(ncec, (clock_t)ms);
1176 	}
1177 done:
1178 	ill_refrele(src_ill);
1179 }
1180 
1181 /*
1182  * Send an IPv6 neighbor solicitation.
1183  * Returns number of milliseconds after which we should either rexmit or abort.
1184  * Return of zero means we should abort.
1185  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1186  * The optional source address is used as a hint to ndp_solicit for
1187  * which source to use in the packet.
1188  *
1189  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1190  * the packet.
1191  */
1192 uint32_t
1193 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1194 {
1195 	in6_addr_t	dst;
1196 	boolean_t	dropped = B_FALSE;
1197 
1198 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1199 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1200 
1201 	if (ncec->ncec_rcnt == 0)
1202 		return (0);
1203 
1204 	dst = ncec->ncec_addr;
1205 	ncec->ncec_rcnt--;
1206 	mutex_exit(&ncec->ncec_lock);
1207 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1208 	    ill->ill_phys_addr_length, &src, &dst, 0);
1209 	mutex_enter(&ncec->ncec_lock);
1210 	if (dropped)
1211 		ncec->ncec_rcnt++;
1212 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1213 }
1214 
1215 /*
1216  * Attempt to recover an address on an interface that's been marked as a
1217  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1218  * no easy way to just probe the address and have the right thing happen if
1219  * it's no longer in use.  Instead, we just bring it up normally and allow the
1220  * regular interface start-up logic to probe for a remaining duplicate and take
1221  * us back down if necessary.
1222  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1223  * ip_ndp_excl.
1224  */
1225 /* ARGSUSED */
1226 void
1227 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1228 {
1229 	ill_t	*ill = rq->q_ptr;
1230 	ipif_t	*ipif;
1231 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1232 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1233 	boolean_t addr_equal;
1234 
1235 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1236 		/*
1237 		 * We do not support recovery of proxy ARP'd interfaces,
1238 		 * because the system lacks a complete proxy ARP mechanism.
1239 		 */
1240 		if (ill->ill_isv6) {
1241 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1242 			    addr6);
1243 		} else {
1244 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1245 		}
1246 
1247 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1248 			continue;
1249 
1250 		/*
1251 		 * If we have already recovered or if the interface is going
1252 		 * away, then ignore.
1253 		 */
1254 		mutex_enter(&ill->ill_lock);
1255 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1256 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1257 			mutex_exit(&ill->ill_lock);
1258 			continue;
1259 		}
1260 
1261 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1262 		ill->ill_ipif_dup_count--;
1263 		mutex_exit(&ill->ill_lock);
1264 		ipif->ipif_was_dup = B_TRUE;
1265 
1266 		if (ill->ill_isv6) {
1267 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1268 			(void) ipif_up_done_v6(ipif);
1269 		} else {
1270 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1271 			    EINPROGRESS);
1272 			(void) ipif_up_done(ipif);
1273 		}
1274 	}
1275 	freeb(mp);
1276 }
1277 
1278 /*
1279  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1280  * As long as someone else holds the address, the interface will stay down.
1281  * When that conflict goes away, the interface is brought back up.  This is
1282  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1283  * server will recover from a failure.
1284  *
1285  * For DHCP and temporary addresses, recovery is not done in the kernel.
1286  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1287  *
1288  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1289  */
1290 void
1291 ipif_dup_recovery(void *arg)
1292 {
1293 	ipif_t *ipif = arg;
1294 
1295 	ipif->ipif_recovery_id = 0;
1296 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1297 		return;
1298 
1299 	/*
1300 	 * No lock, because this is just an optimization.
1301 	 */
1302 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1303 		return;
1304 
1305 	/* If the link is down, we'll retry this later */
1306 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1307 		return;
1308 
1309 	ipif_do_recovery(ipif);
1310 }
1311 
1312 /*
1313  * Perform interface recovery by forcing the duplicate interfaces up and
1314  * allowing the system to determine which ones should stay up.
1315  *
1316  * Called both by recovery timer expiry and link-up notification.
1317  */
1318 void
1319 ipif_do_recovery(ipif_t *ipif)
1320 {
1321 	ill_t *ill = ipif->ipif_ill;
1322 	mblk_t *mp;
1323 	ip_stack_t *ipst = ill->ill_ipst;
1324 	size_t mp_size;
1325 
1326 	if (ipif->ipif_isv6)
1327 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1328 	else
1329 		mp_size = sizeof (ipif->ipif_lcl_addr);
1330 	mp = allocb(mp_size, BPRI_MED);
1331 	if (mp == NULL) {
1332 		mutex_enter(&ill->ill_lock);
1333 		if (ipst->ips_ip_dup_recovery > 0 &&
1334 		    ipif->ipif_recovery_id == 0 &&
1335 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1336 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1337 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1338 		}
1339 		mutex_exit(&ill->ill_lock);
1340 	} else {
1341 		/*
1342 		 * A recovery timer may still be running if we got here from
1343 		 * ill_restart_dad(); cancel that timer.
1344 		 */
1345 		if (ipif->ipif_recovery_id != 0)
1346 			(void) untimeout(ipif->ipif_recovery_id);
1347 		ipif->ipif_recovery_id = 0;
1348 
1349 		if (ipif->ipif_isv6) {
1350 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1351 			    sizeof (ipif->ipif_v6lcl_addr));
1352 		} else  {
1353 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1354 			    sizeof (ipif->ipif_lcl_addr));
1355 		}
1356 		ill_refhold(ill);
1357 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1358 		    B_FALSE);
1359 	}
1360 }
1361 
1362 /*
1363  * Find the MAC and IP addresses in an NA/NS message.
1364  */
1365 static void
1366 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1367     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1368 {
1369 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1370 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1371 	uchar_t *addr;
1372 	int alen;
1373 
1374 	/* icmp_inbound_v6 ensures this */
1375 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1376 
1377 	addr = ira->ira_l2src;
1378 	alen = ill->ill_phys_addr_length;
1379 	if (alen > 0) {
1380 		*haddr = addr;
1381 		*haddrlenp = alen;
1382 	} else {
1383 		*haddr = NULL;
1384 		*haddrlenp = 0;
1385 	}
1386 
1387 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1388 	*targp = ns->nd_ns_target;
1389 }
1390 
1391 /*
1392  * This is for exclusive changes due to NDP duplicate address detection
1393  * failure.
1394  */
1395 /* ARGSUSED */
1396 static void
1397 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1398 {
1399 	ill_t	*ill = rq->q_ptr;
1400 	ipif_t	*ipif;
1401 	uchar_t	*haddr;
1402 	uint_t	haddrlen;
1403 	ip_stack_t *ipst = ill->ill_ipst;
1404 	in6_addr_t targ;
1405 	ip_recv_attr_t iras;
1406 	mblk_t	*attrmp;
1407 
1408 	attrmp = mp;
1409 	mp = mp->b_cont;
1410 	attrmp->b_cont = NULL;
1411 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1412 		/* The ill or ip_stack_t disappeared on us */
1413 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1414 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1415 		freemsg(mp);
1416 		ira_cleanup(&iras, B_TRUE);
1417 		return;
1418 	}
1419 
1420 	ASSERT(ill == iras.ira_rill);
1421 
1422 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1423 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1424 		/*
1425 		 * Ignore conflicts generated by misbehaving switches that
1426 		 * just reflect our own messages back to us.  For IPMP, we may
1427 		 * see reflections across any ill in the illgrp.
1428 		 *
1429 		 * RFC2462 and revisions tried to detect both the case
1430 		 * when a statically configured IPv6 address is a duplicate,
1431 		 * and the case when the L2 address itself is a duplicate. The
1432 		 * later is important because, with stateles address autoconf,
1433 		 * if the L2 address is a duplicate, the resulting IPv6
1434 		 * address(es) would also be duplicates. We rely on DAD of the
1435 		 * IPv6 address itself to detect the latter case.
1436 		 */
1437 		/* For an under ill_grp can change under lock */
1438 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1439 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1440 		    IS_UNDER_IPMP(ill) &&
1441 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1442 		    haddrlen) != NULL) {
1443 			rw_exit(&ipst->ips_ill_g_lock);
1444 			goto ignore_conflict;
1445 		}
1446 		rw_exit(&ipst->ips_ill_g_lock);
1447 	}
1448 
1449 	/*
1450 	 * Look up the appropriate ipif.
1451 	 */
1452 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1453 	if (ipif == NULL)
1454 		goto ignore_conflict;
1455 
1456 	/* Reload the ill to match the ipif */
1457 	ill = ipif->ipif_ill;
1458 
1459 	/* If it's already duplicate or ineligible, then don't do anything. */
1460 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1461 		ipif_refrele(ipif);
1462 		goto ignore_conflict;
1463 	}
1464 
1465 	/*
1466 	 * If this is a failure during duplicate recovery, then don't
1467 	 * complain.  It may take a long time to recover.
1468 	 */
1469 	if (!ipif->ipif_was_dup) {
1470 		char ibuf[LIFNAMSIZ];
1471 		char hbuf[MAC_STR_LEN];
1472 		char sbuf[INET6_ADDRSTRLEN];
1473 
1474 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1475 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1476 		    " disabled", ibuf,
1477 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1478 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1479 	}
1480 	mutex_enter(&ill->ill_lock);
1481 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1482 	ipif->ipif_flags |= IPIF_DUPLICATE;
1483 	ill->ill_ipif_dup_count++;
1484 	mutex_exit(&ill->ill_lock);
1485 	(void) ipif_down(ipif, NULL, NULL);
1486 	(void) ipif_down_tail(ipif);
1487 	mutex_enter(&ill->ill_lock);
1488 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1489 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1490 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1491 	    ipst->ips_ip_dup_recovery > 0) {
1492 		ASSERT(ipif->ipif_recovery_id == 0);
1493 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1494 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1495 	}
1496 	mutex_exit(&ill->ill_lock);
1497 	ipif_refrele(ipif);
1498 
1499 ignore_conflict:
1500 	freemsg(mp);
1501 	ira_cleanup(&iras, B_TRUE);
1502 }
1503 
1504 /*
1505  * Handle failure by tearing down the ipifs with the specified address.  Note
1506  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1507  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1508  * we start a timer on the ipif.
1509  * Caller has to free mp;
1510  */
1511 static void
1512 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1513 {
1514 	const uchar_t	*haddr;
1515 	ill_t		*ill = ira->ira_rill;
1516 
1517 	/*
1518 	 * Ignore conflicts generated by misbehaving switches that just
1519 	 * reflect our own messages back to us.
1520 	 */
1521 
1522 	/* icmp_inbound_v6 ensures this */
1523 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1524 	haddr = ira->ira_l2src;
1525 	if (haddr != NULL &&
1526 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1527 		return;
1528 	}
1529 
1530 	if ((mp = copymsg(mp)) != NULL) {
1531 		mblk_t	*attrmp;
1532 
1533 		attrmp = ip_recv_attr_to_mblk(ira);
1534 		if (attrmp == NULL) {
1535 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1536 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1537 			freemsg(mp);
1538 		} else {
1539 			ASSERT(attrmp->b_cont == NULL);
1540 			attrmp->b_cont = mp;
1541 			mp = attrmp;
1542 			ill_refhold(ill);
1543 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1544 			    B_FALSE);
1545 		}
1546 	}
1547 }
1548 
1549 /*
1550  * Handle a discovered conflict: some other system is advertising that it owns
1551  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1552  * interface.
1553  *
1554  * Handles both IPv4 and IPv6
1555  */
1556 boolean_t
1557 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1558 {
1559 	ipif_t		*ipif;
1560 	clock_t		now;
1561 	uint_t		maxdefense;
1562 	uint_t		defs;
1563 	ill_t		*ill = ira->ira_ill;
1564 	ip_stack_t	*ipst = ill->ill_ipst;
1565 	uint32_t	elapsed;
1566 	boolean_t	isv6 = ill->ill_isv6;
1567 	ipaddr_t	ncec_addr;
1568 
1569 	if (isv6) {
1570 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1571 		    ipst);
1572 	} else {
1573 		if (arp_no_defense) {
1574 			/*
1575 			 * Yes, there is a conflict, but no, we do not
1576 			 * defend ourself.
1577 			 */
1578 			return (B_TRUE);
1579 		}
1580 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1581 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1582 		    ipst);
1583 	}
1584 	if (ipif == NULL)
1585 		return (B_FALSE);
1586 
1587 	/*
1588 	 * First, figure out if this address is disposable.
1589 	 */
1590 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1591 		maxdefense = ipst->ips_ip_max_temp_defend;
1592 	else
1593 		maxdefense = ipst->ips_ip_max_defend;
1594 
1595 	/*
1596 	 * Now figure out how many times we've defended ourselves.  Ignore
1597 	 * defenses that happened long in the past.
1598 	 */
1599 	now = ddi_get_lbolt();
1600 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1601 	mutex_enter(&ncec->ncec_lock);
1602 	if ((defs = ncec->ncec_defense_count) > 0 &&
1603 	    elapsed > ipst->ips_ip_defend_interval) {
1604 		/*
1605 		 * ip_defend_interval has elapsed.
1606 		 * reset the defense count.
1607 		 */
1608 		ncec->ncec_defense_count = defs = 0;
1609 	}
1610 	ncec->ncec_defense_count++;
1611 	ncec->ncec_last_time_defended = now;
1612 	mutex_exit(&ncec->ncec_lock);
1613 	ipif_refrele(ipif);
1614 
1615 	/*
1616 	 * If we've defended ourselves too many times already, then give up and
1617 	 * tear down the interface(s) using this address.
1618 	 * Otherwise, caller has to defend by sending out an announce.
1619 	 */
1620 	if (defs >= maxdefense) {
1621 		if (isv6)
1622 			ndp_failure(mp, ira);
1623 		else
1624 			arp_failure(mp, ira);
1625 	} else {
1626 		return (B_TRUE); /* caller must defend this address */
1627 	}
1628 	return (B_FALSE);
1629 }
1630 
1631 /*
1632  * Handle reception of Neighbor Solicitation messages.
1633  */
1634 static void
1635 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1636 {
1637 	ill_t		*ill = ira->ira_ill, *under_ill;
1638 	nd_neighbor_solicit_t *ns;
1639 	uint32_t	hlen = ill->ill_phys_addr_length;
1640 	uchar_t		*haddr = NULL;
1641 	icmp6_t		*icmp_nd;
1642 	ip6_t		*ip6h;
1643 	ncec_t		*our_ncec = NULL;
1644 	in6_addr_t	target;
1645 	in6_addr_t	src;
1646 	int		len;
1647 	int		flag = 0;
1648 	nd_opt_hdr_t	*opt = NULL;
1649 	boolean_t	bad_solicit = B_FALSE;
1650 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1651 	boolean_t	need_ill_refrele = B_FALSE;
1652 
1653 	ip6h = (ip6_t *)mp->b_rptr;
1654 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1655 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1656 	src = ip6h->ip6_src;
1657 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1658 	target = ns->nd_ns_target;
1659 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1660 		if (ip_debug > 2) {
1661 			/* ip1dbg */
1662 			pr_addr_dbg("ndp_input_solicit: Target is"
1663 			    " multicast! %s\n", AF_INET6, &target);
1664 		}
1665 		bad_solicit = B_TRUE;
1666 		goto done;
1667 	}
1668 	if (len > sizeof (nd_neighbor_solicit_t)) {
1669 		/* Options present */
1670 		opt = (nd_opt_hdr_t *)&ns[1];
1671 		len -= sizeof (nd_neighbor_solicit_t);
1672 		if (!ndp_verify_optlen(opt, len)) {
1673 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1674 			bad_solicit = B_TRUE;
1675 			goto done;
1676 		}
1677 	}
1678 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1679 		/* Check to see if this is a valid DAD solicitation */
1680 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1681 			if (ip_debug > 2) {
1682 				/* ip1dbg */
1683 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1684 				    "Destination is not solicited node "
1685 				    "multicast %s\n", AF_INET6,
1686 				    &ip6h->ip6_dst);
1687 			}
1688 			bad_solicit = B_TRUE;
1689 			goto done;
1690 		}
1691 	}
1692 
1693 	/*
1694 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1695 	 * received this packet if it's multicast) is not the ill tied to
1696 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1697 	 * to ensure we find the associated NCE.
1698 	 */
1699 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1700 	/*
1701 	 * If this is a valid Solicitation for an address we are publishing,
1702 	 * then a PUBLISH entry should exist in the cache
1703 	 */
1704 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1705 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1706 		    "ifname=%s ", ill->ill_name));
1707 		if (ip_debug > 2) {
1708 			/* ip1dbg */
1709 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1710 		}
1711 		if (our_ncec == NULL)
1712 			bad_solicit = B_TRUE;
1713 		goto done;
1714 	}
1715 
1716 	/* At this point we should have a verified NS per spec */
1717 	if (opt != NULL) {
1718 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1719 		if (opt != NULL) {
1720 			haddr = (uchar_t *)&opt[1];
1721 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1722 			    hlen == 0) {
1723 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1724 				bad_solicit = B_TRUE;
1725 				goto done;
1726 			}
1727 		}
1728 	}
1729 
1730 	/* If sending directly to peer, set the unicast flag */
1731 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1732 		flag |= NDP_UNICAST;
1733 
1734 	/*
1735 	 * Create/update the entry for the soliciting node on the ipmp_ill.
1736 	 * or respond to outstanding queries, don't if
1737 	 * the source is unspecified address.
1738 	 */
1739 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1740 		int	err;
1741 		nce_t	*nnce;
1742 
1743 		ASSERT(ill->ill_isv6);
1744 		/*
1745 		 * Regular solicitations *must* include the Source Link-Layer
1746 		 * Address option.  Ignore messages that do not.
1747 		 */
1748 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1749 			ip1dbg(("ndp_input_solicit: source link-layer address "
1750 			    "option missing with a specified source.\n"));
1751 			bad_solicit = B_TRUE;
1752 			goto done;
1753 		}
1754 
1755 		/*
1756 		 * This is a regular solicitation.  If we're still in the
1757 		 * process of verifying the address, then don't respond at all
1758 		 * and don't keep track of the sender.
1759 		 */
1760 		if (our_ncec->ncec_state == ND_PROBE)
1761 			goto done;
1762 
1763 		/*
1764 		 * If the solicitation doesn't have sender hardware address
1765 		 * (legal for unicast solicitation), then process without
1766 		 * installing the return NCE.  Either we already know it, or
1767 		 * we'll be forced to look it up when (and if) we reply to the
1768 		 * packet.
1769 		 */
1770 		if (haddr == NULL)
1771 			goto no_source;
1772 
1773 		under_ill = ill;
1774 		if (IS_UNDER_IPMP(under_ill)) {
1775 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
1776 			if (ill == NULL)
1777 				ill = under_ill;
1778 			else
1779 				need_ill_refrele = B_TRUE;
1780 		}
1781 		err = nce_lookup_then_add_v6(ill,
1782 		    haddr, hlen,
1783 		    &src,	/* Soliciting nodes address */
1784 		    0,
1785 		    ND_STALE,
1786 		    &nnce);
1787 
1788 		if (need_ill_refrele) {
1789 			ill_refrele(ill);
1790 			ill = under_ill;
1791 			need_ill_refrele =  B_FALSE;
1792 		}
1793 		switch (err) {
1794 		case 0:
1795 			/* done with this entry */
1796 			nce_refrele(nnce);
1797 			break;
1798 		case EEXIST:
1799 			/*
1800 			 * B_FALSE indicates this is not an an advertisement.
1801 			 */
1802 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1803 			nce_refrele(nnce);
1804 			break;
1805 		default:
1806 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1807 			    err));
1808 			goto done;
1809 		}
1810 no_source:
1811 		flag |= NDP_SOLICITED;
1812 	} else {
1813 		/*
1814 		 * No source link layer address option should be present in a
1815 		 * valid DAD request.
1816 		 */
1817 		if (haddr != NULL) {
1818 			ip1dbg(("ndp_input_solicit: source link-layer address "
1819 			    "option present with an unspecified source.\n"));
1820 			bad_solicit = B_TRUE;
1821 			goto done;
1822 		}
1823 		if (our_ncec->ncec_state == ND_PROBE) {
1824 			/*
1825 			 * Internally looped-back probes will have
1826 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1827 			 * transmissions.
1828 			 */
1829 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1830 				/*
1831 				 * If someone else is probing our address, then
1832 				 * we've crossed wires.  Declare failure.
1833 				 */
1834 				ndp_failure(mp, ira);
1835 			}
1836 			goto done;
1837 		}
1838 		/*
1839 		 * This is a DAD probe.  Multicast the advertisement to the
1840 		 * all-nodes address.
1841 		 */
1842 		src = ipv6_all_hosts_mcast;
1843 	}
1844 	flag |= nce_advert_flags(our_ncec);
1845 	(void) ndp_xmit(ill,
1846 	    ND_NEIGHBOR_ADVERT,
1847 	    our_ncec->ncec_lladdr,
1848 	    our_ncec->ncec_lladdr_length,
1849 	    &target,	/* Source and target of the advertisement pkt */
1850 	    &src,	/* IP Destination (source of original pkt) */
1851 	    flag);
1852 done:
1853 	if (bad_solicit)
1854 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1855 	if (our_ncec != NULL)
1856 		ncec_refrele(our_ncec);
1857 }
1858 
1859 /*
1860  * Handle reception of Neighbor Solicitation messages
1861  */
1862 void
1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1864 {
1865 	ill_t		*ill = ira->ira_ill;
1866 	nd_neighbor_advert_t *na;
1867 	uint32_t	hlen = ill->ill_phys_addr_length;
1868 	uchar_t		*haddr = NULL;
1869 	icmp6_t		*icmp_nd;
1870 	ip6_t		*ip6h;
1871 	ncec_t		*dst_ncec = NULL;
1872 	in6_addr_t	target;
1873 	nd_opt_hdr_t	*opt = NULL;
1874 	int		len;
1875 	ip_stack_t	*ipst = ill->ill_ipst;
1876 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1877 
1878 	ip6h = (ip6_t *)mp->b_rptr;
1879 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1880 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1881 	na = (nd_neighbor_advert_t *)icmp_nd;
1882 
1883 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1884 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1885 		ip1dbg(("ndp_input_advert: Target is multicast but the "
1886 		    "solicited flag is not zero\n"));
1887 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1888 		return;
1889 	}
1890 	target = na->nd_na_target;
1891 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1892 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
1893 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1894 		return;
1895 	}
1896 	if (len > sizeof (nd_neighbor_advert_t)) {
1897 		opt = (nd_opt_hdr_t *)&na[1];
1898 		if (!ndp_verify_optlen(opt,
1899 		    len - sizeof (nd_neighbor_advert_t))) {
1900 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1901 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1902 			return;
1903 		}
1904 		/* At this point we have a verified NA per spec */
1905 		len -= sizeof (nd_neighbor_advert_t);
1906 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1907 		if (opt != NULL) {
1908 			haddr = (uchar_t *)&opt[1];
1909 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1910 			    hlen == 0) {
1911 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1912 				BUMP_MIB(mib,
1913 				    ipv6IfIcmpInBadNeighborAdvertisements);
1914 				return;
1915 			}
1916 		}
1917 	}
1918 
1919 	/*
1920 	 * NOTE: we match across the illgrp since we need to do DAD for all of
1921 	 * our local addresses, and those are spread across all the active
1922 	 * ills in the group.
1923 	 */
1924 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1925 		return;
1926 
1927 	if (NCE_PUBLISH(dst_ncec)) {
1928 		/*
1929 		 * Someone just advertised an addresses that we publish. First,
1930 		 * check it it was us -- if so, we can safely ignore it.
1931 		 * We don't get the haddr from the ira_l2src because, in the
1932 		 * case that the packet originated from us, on an IPMP group,
1933 		 * the ira_l2src may would be the link-layer address of the
1934 		 * cast_ill used to send the packet, which may not be the same
1935 		 * as the dst_ncec->ncec_lladdr of the address.
1936 		 */
1937 		if (haddr != NULL) {
1938 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1939 				goto out;
1940 
1941 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1942 				goto out;   /* from us -- no conflict */
1943 
1944 			/*
1945 			 * If we're in an IPMP group, check if this is an echo
1946 			 * from another ill in the group.  Use the double-
1947 			 * checked locking pattern to avoid grabbing
1948 			 * ill_g_lock in the non-IPMP case.
1949 			 */
1950 			if (IS_UNDER_IPMP(ill)) {
1951 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1952 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1953 				    ill->ill_grp, haddr, hlen) != NULL) {
1954 					rw_exit(&ipst->ips_ill_g_lock);
1955 					goto out;
1956 				}
1957 				rw_exit(&ipst->ips_ill_g_lock);
1958 			}
1959 		}
1960 
1961 		/*
1962 		 * This appears to be a real conflict.  If we're trying to
1963 		 * configure this NCE (ND_PROBE), then shut it down.
1964 		 * Otherwise, handle the discovered conflict.
1965 		 */
1966 		if (dst_ncec->ncec_state == ND_PROBE) {
1967 			ndp_failure(mp, ira);
1968 		} else {
1969 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
1970 				char hbuf[MAC_STR_LEN];
1971 				char sbuf[INET6_ADDRSTRLEN];
1972 
1973 				cmn_err(CE_WARN,
1974 				    "node '%s' is using %s on %s",
1975 				    inet_ntop(AF_INET6, &target, sbuf,
1976 				    sizeof (sbuf)),
1977 				    haddr == NULL ? "<none>" :
1978 				    mac_colon_addr(haddr, hlen, hbuf,
1979 				    sizeof (hbuf)), ill->ill_name);
1980 				/*
1981 				 * RFC 4862, Section 5.4.4 does not mandate
1982 				 * any specific behavior when an NA matches
1983 				 * a non-tentative address assigned to the
1984 				 * receiver. We make the choice of defending
1985 				 * our address, based on the assumption that
1986 				 * the sender has not detected the Duplicate.
1987 				 *
1988 				 * ncec_last_time_defended has been adjusted
1989 				 * in ip_nce_conflict()
1990 				 */
1991 				(void) ndp_announce(dst_ncec);
1992 			}
1993 		}
1994 	} else {
1995 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
1996 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
1997 
1998 		/* B_TRUE indicates this is an advertisement */
1999 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2000 	}
2001 out:
2002 	ncec_refrele(dst_ncec);
2003 }
2004 
2005 /*
2006  * Process NDP neighbor solicitation/advertisement messages.
2007  * The checksum has already checked o.k before reaching here.
2008  * Information about the datalink header is contained in ira_l2src, but
2009  * that should be ignored for loopback packets.
2010  */
2011 void
2012 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2013 {
2014 	ill_t		*ill = ira->ira_rill;
2015 	icmp6_t		*icmp_nd;
2016 	ip6_t		*ip6h;
2017 	int		len;
2018 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2019 	ill_t		*orig_ill = NULL;
2020 
2021 	/*
2022 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2023 	 * and make it be the IPMP upper so avoid being confused by a packet
2024 	 * addressed to a unicast address on a different ill.
2025 	 */
2026 	if (IS_UNDER_IPMP(ill)) {
2027 		orig_ill = ill;
2028 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2029 		if (ill == NULL) {
2030 			ill = orig_ill;
2031 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2032 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2033 			    mp, ill);
2034 			freemsg(mp);
2035 			return;
2036 		}
2037 		ASSERT(ill != orig_ill);
2038 		orig_ill = ira->ira_ill;
2039 		ira->ira_ill = ill;
2040 		mib = ill->ill_icmp6_mib;
2041 	}
2042 	if (!pullupmsg(mp, -1)) {
2043 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2044 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2045 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2046 		goto done;
2047 	}
2048 	ip6h = (ip6_t *)mp->b_rptr;
2049 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2050 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2051 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2052 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2053 		goto done;
2054 	}
2055 	/*
2056 	 * NDP does not accept any extension headers between the
2057 	 * IP header and the ICMP header since e.g. a routing
2058 	 * header could be dangerous.
2059 	 * This assumes that any AH or ESP headers are removed
2060 	 * by ip prior to passing the packet to ndp_input.
2061 	 */
2062 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2063 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2064 		    ip6h->ip6_nxt));
2065 		ip_drop_input("Wrong next header", mp, ill);
2066 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2067 		goto done;
2068 	}
2069 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2070 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2071 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2072 	if (icmp_nd->icmp6_code != 0) {
2073 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2074 		ip_drop_input("code non-zero", mp, ill);
2075 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2076 		goto done;
2077 	}
2078 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2079 	/*
2080 	 * Make sure packet length is large enough for either
2081 	 * a NS or a NA icmp packet.
2082 	 */
2083 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2084 		ip1dbg(("ndp_input: packet too short\n"));
2085 		ip_drop_input("packet too short", mp, ill);
2086 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2087 		goto done;
2088 	}
2089 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2090 		ndp_input_solicit(mp, ira);
2091 	} else {
2092 		ndp_input_advert(mp, ira);
2093 	}
2094 done:
2095 	freemsg(mp);
2096 	if (orig_ill != NULL) {
2097 		ill_refrele(ill);
2098 		ira->ira_ill = orig_ill;
2099 	}
2100 }
2101 
2102 /*
2103  * ndp_xmit is called to form and transmit a ND solicitation or
2104  * advertisement ICMP packet.
2105  *
2106  * If the source address is unspecified and this isn't a probe (used for
2107  * duplicate address detection), an appropriate source address and link layer
2108  * address will be chosen here.  The link layer address option is included if
2109  * the source is specified (i.e., all non-probe packets), and omitted (per the
2110  * specification) otherwise.
2111  *
2112  * It returns B_FALSE only if it does a successful put() to the
2113  * corresponding ill's ill_wq otherwise returns B_TRUE.
2114  */
2115 static boolean_t
2116 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2117     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2118 {
2119 	uint32_t	len;
2120 	icmp6_t 	*icmp6;
2121 	mblk_t		*mp;
2122 	ip6_t		*ip6h;
2123 	nd_opt_hdr_t	*opt;
2124 	uint_t		plen;
2125 	zoneid_t	zoneid = GLOBAL_ZONEID;
2126 	ill_t		*hwaddr_ill = ill;
2127 	ip_xmit_attr_t	ixas;
2128 	ip_stack_t	*ipst = ill->ill_ipst;
2129 	boolean_t	need_refrele = B_FALSE;
2130 	boolean_t	probe = B_FALSE;
2131 
2132 	if (IS_UNDER_IPMP(ill)) {
2133 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2134 		/*
2135 		 * We send non-probe packets on the upper IPMP interface.
2136 		 * ip_output_simple() will use cast_ill for sending any
2137 		 * multicast packets. Note that we can't follow the same
2138 		 * logic for probe packets because all interfaces in the ipmp
2139 		 * group may have failed, so that we really want to only try
2140 		 * to send the ND packet on the ill corresponding to the src
2141 		 * address.
2142 		 */
2143 		if (!probe) {
2144 			ill = ipmp_ill_hold_ipmp_ill(ill);
2145 			if (ill != NULL)
2146 				need_refrele = B_TRUE;
2147 			else
2148 				ill = hwaddr_ill;
2149 		}
2150 	}
2151 
2152 	/*
2153 	 * If we have a unspecified source(sender) address, select a
2154 	 * proper source address for the solicitation here itself so
2155 	 * that we can initialize the h/w address correctly.
2156 	 *
2157 	 * If the sender is specified then we use this address in order
2158 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2159 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2160 	 * by IP (we cannot guarantee that the global zone has an interface
2161 	 * route to the destination).
2162 	 *
2163 	 * Note that the NA never comes here with the unspecified source
2164 	 * address.
2165 	 */
2166 
2167 	/*
2168 	 * Probes will have unspec src at this point.
2169 	 */
2170 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2171 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2172 		/*
2173 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2174 		 * ALL_ZONES if it cannot find a matching ipif for the address
2175 		 * we are trying to use. In this case we err on the side of
2176 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2177 		 */
2178 		if (zoneid == ALL_ZONES)
2179 			zoneid = GLOBAL_ZONEID;
2180 	}
2181 
2182 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2183 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2184 	mp = allocb(len,  BPRI_LO);
2185 	if (mp == NULL) {
2186 		if (need_refrele)
2187 			ill_refrele(ill);
2188 		return (B_TRUE);
2189 	}
2190 
2191 	bzero((char *)mp->b_rptr, len);
2192 	mp->b_wptr = mp->b_rptr + len;
2193 
2194 	bzero(&ixas, sizeof (ixas));
2195 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM;
2196 
2197 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2198 	ixas.ixa_ipst = ipst;
2199 	ixas.ixa_cred = kcred;
2200 	ixas.ixa_cpid = NOPID;
2201 	ixas.ixa_tsl = NULL;
2202 	ixas.ixa_zoneid = zoneid;
2203 
2204 	ip6h = (ip6_t *)mp->b_rptr;
2205 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2206 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2207 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2208 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2209 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2210 	ip6h->ip6_dst = *target;
2211 	icmp6 = (icmp6_t *)&ip6h[1];
2212 
2213 	if (hw_addr_len != 0) {
2214 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2215 		    sizeof (nd_neighbor_advert_t));
2216 	} else {
2217 		opt = NULL;
2218 	}
2219 	if (operation == ND_NEIGHBOR_SOLICIT) {
2220 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2221 
2222 		if (opt != NULL && !(flag & NDP_PROBE)) {
2223 			/*
2224 			 * Note that we don't send out SLLA for ND probes
2225 			 * per RFC 4862, even though we do send out the src
2226 			 * haddr for IPv4 DAD probes, even though both IPv4
2227 			 * and IPv6 go out with the unspecified/INADDR_ANY
2228 			 * src IP addr.
2229 			 */
2230 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2231 		}
2232 		ip6h->ip6_src = *sender;
2233 		ns->nd_ns_target = *target;
2234 		if (!(flag & NDP_UNICAST)) {
2235 			/* Form multicast address of the target */
2236 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2237 			ip6h->ip6_dst.s6_addr32[3] |=
2238 			    ns->nd_ns_target.s6_addr32[3];
2239 		}
2240 	} else {
2241 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2242 
2243 		ASSERT(!(flag & NDP_PROBE));
2244 		if (opt != NULL)
2245 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2246 		ip6h->ip6_src = *sender;
2247 		na->nd_na_target = *sender;
2248 		if (flag & NDP_ISROUTER)
2249 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2250 		if (flag & NDP_SOLICITED)
2251 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2252 		if (flag & NDP_ORIDE)
2253 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2254 	}
2255 
2256 	if (!(flag & NDP_PROBE)) {
2257 		if (hw_addr != NULL && opt != NULL) {
2258 			/* Fill in link layer address and option len */
2259 			opt->nd_opt_len = (uint8_t)plen;
2260 			bcopy(hw_addr, &opt[1], hw_addr_len);
2261 		}
2262 	}
2263 	if (opt != NULL && opt->nd_opt_type == 0) {
2264 		/* If there's no link layer address option, then strip it. */
2265 		len -= plen * 8;
2266 		mp->b_wptr = mp->b_rptr + len;
2267 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2268 	}
2269 
2270 	icmp6->icmp6_type = (uint8_t)operation;
2271 	icmp6->icmp6_code = 0;
2272 	/*
2273 	 * Prepare for checksum by putting icmp length in the icmp
2274 	 * checksum field. The checksum is calculated in ip_output.c.
2275 	 */
2276 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2277 
2278 	(void) ip_output_simple(mp, &ixas);
2279 	ixa_cleanup(&ixas);
2280 	if (need_refrele)
2281 		ill_refrele(ill);
2282 	return (B_FALSE);
2283 }
2284 
2285 /*
2286  * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2287  * The datapath uses this as an indication that there
2288  * is a problem (as opposed to a NCE that was just
2289  * reclaimed due to lack of memory.
2290  * Note that static ARP entries never become unreachable.
2291  */
2292 void
2293 nce_make_unreachable(ncec_t *ncec)
2294 {
2295 	mutex_enter(&ncec->ncec_lock);
2296 	ncec->ncec_state = ND_UNREACHABLE;
2297 	mutex_exit(&ncec->ncec_lock);
2298 }
2299 
2300 /*
2301  * NCE retransmit timer. Common to IPv4 and IPv6.
2302  * This timer goes off when:
2303  * a. It is time to retransmit a resolution for resolver.
2304  * b. It is time to send reachability probes.
 *
 * 'arg' is the ncec_t the timer belongs to.  A reference on the ncec is
 * taken on entry and dropped before returning.  The work done depends on
 * ncec_state:
 *   ND_DELAY      - switch to ND_PROBE and send the first unicast
 *                   solicitation (IPv6) or ARP request (IPv4).
 *   ND_PROBE      - retransmit while ncec_pcnt > 0; when the count goes
 *                   negative mark the entry ND_UNREACHABLE and delete it
 *                   (unless NCE_F_STATIC); when the count is zero for a
 *                   published entry, declare the address usable.
 *   ND_INCOMPLETE - discard queued IPMP probe packets and retry resolution.
 *   ND_REACHABLE  - send unsolicited announcements / defend the address.
 * Every arm of the switch releases ncec_lock before control reaches the
 * end of the function.
2305  */
2306 void
2307 nce_timer(void *arg)
2308 {
2309 	ncec_t		*ncec = arg;
2310 	ill_t		*ill = ncec->ncec_ill, *src_ill;
2311 	char		addrbuf[INET6_ADDRSTRLEN];
2312 	boolean_t	dropped = B_FALSE;
2313 	ip_stack_t	*ipst = ncec->ncec_ipst;
2314 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2315 	in_addr_t	sender4 = INADDR_ANY;
2316 	in6_addr_t	sender6 = ipv6_all_zeros;
2317 
2318 	/*
2319 	 * The timer has to be cancelled by ncec_delete before doing the final
2320 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2321 	 * until it clears the timeout_id. Before clearing the timeout_id
2322 	 * bump up the refcnt so that we can continue to use the ncec
2323 	 */
2324 	ASSERT(ncec != NULL);
2325 	mutex_enter(&ncec->ncec_lock);
2326 	ncec_refhold_locked(ncec);
2327 	ncec->ncec_timeout_id = 0;
2328 	mutex_exit(&ncec->ncec_lock);
2329 
	/*
	 * Pick the ill and source address to send from.  A non-NULL src_ill
	 * is released with ill_refrele() at the end of the function.
	 */
2330 	src_ill = nce_resolve_src(ncec, &sender6);
2331 	/* if we could not find a sender address, return */
2332 	if (src_ill == NULL) {
2333 		if (!isv6) {
2334 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2335 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2336 			    &sender4, addrbuf, sizeof (addrbuf))));
2337 		} else {
2338 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2339 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2340 		}
		/* Try again after one retransmit interval. */
2341 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2342 		ncec_refrele(ncec);
2343 		return;
2344 	}
	/* For IPv4 the chosen source comes back V4-mapped in sender6. */
2345 	if (!isv6)
2346 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2347 
2348 	mutex_enter(&ncec->ncec_lock);
2349 	/*
2350 	 * Check the reachability state.
2351 	 */
2352 	switch (ncec->ncec_state) {
2353 	case ND_DELAY:
2354 		ASSERT(ncec->ncec_lladdr != NULL);
2355 		ncec->ncec_state = ND_PROBE;
2356 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		/*
		 * ncec_lock is dropped before calling ndp_xmit() but is
		 * held across arp_request().
		 */
2357 		if (isv6) {
2358 			mutex_exit(&ncec->ncec_lock);
2359 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2360 			    src_ill->ill_phys_addr,
2361 			    src_ill->ill_phys_addr_length,
2362 			    &sender6, &ncec->ncec_addr,
2363 			    NDP_UNICAST);
2364 		} else {
2365 			dropped = arp_request(ncec, sender4, src_ill);
2366 			mutex_exit(&ncec->ncec_lock);
2367 		}
		/* Only a probe that was actually handed off is counted. */
2368 		if (!dropped) {
2369 			mutex_enter(&ncec->ncec_lock);
2370 			ncec->ncec_pcnt--;
2371 			mutex_exit(&ncec->ncec_lock);
2372 		}
		/*
		 * NOTE(review): AF_INET6 is passed here regardless of isv6,
		 * unlike the "Delete NCE" message below -- confirm whether
		 * that is intentional.
		 */
2373 		if (ip_debug > 3) {
2374 			/* ip2dbg */
2375 			pr_addr_dbg("nce_timer: state for %s changed "
2376 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2377 		}
2378 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2379 		break;
2380 	case ND_PROBE:
2381 		/* must be retransmit timer */
2382 		ASSERT(ncec->ncec_pcnt >= -1);
2383 		if (ncec->ncec_pcnt > 0) {
2384 			/*
2385 			 * As per RFC2461, the ncec gets deleted after
2386 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2387 			 * Note that the first unicast solicitation is sent
2388 			 * during the DELAY state.
2389 			 */
2390 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2391 			    ncec->ncec_pcnt,
2392 			    inet_ntop((isv6? AF_INET6 : AF_INET),
2393 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2394 			if (NCE_PUBLISH(ncec)) {
2395 				mutex_exit(&ncec->ncec_lock);
2396 				/*
2397 				 * send out a probe; note that src_ill
2398 				 * is ignored by nce_dad() for all
2399 				 * DAD message types other than IPv6
2400 				 * unicast probes
2401 				 */
2402 				nce_dad(ncec, src_ill, B_TRUE);
2403 			} else {
2404 				ASSERT(src_ill != NULL);
2405 				if (isv6) {
2406 					mutex_exit(&ncec->ncec_lock);
2407 					dropped = ndp_xmit(src_ill,
2408 					    ND_NEIGHBOR_SOLICIT,
2409 					    src_ill->ill_phys_addr,
2410 					    src_ill->ill_phys_addr_length,
2411 					    &sender6, &ncec->ncec_addr,
2412 					    NDP_UNICAST);
2413 				} else {
2414 					/*
2415 					 * since the nce is REACHABLE,
2416 					 * the ARP request will be sent out
2417 					 * as a link-layer unicast.
2418 					 */
2419 					dropped = arp_request(ncec, sender4,
2420 					    src_ill);
2421 					mutex_exit(&ncec->ncec_lock);
2422 				}
2423 				if (!dropped) {
2424 					mutex_enter(&ncec->ncec_lock);
2425 					ncec->ncec_pcnt--;
2426 					mutex_exit(&ncec->ncec_lock);
2427 				}
2428 				nce_restart_timer(ncec,
2429 				    ill->ill_reachable_retrans_time);
2430 			}
2431 		} else if (ncec->ncec_pcnt < 0) {
2432 			/* No hope, delete the ncec */
2433 			/* Tell datapath it went bad */
2434 			ncec->ncec_state = ND_UNREACHABLE;
2435 			mutex_exit(&ncec->ncec_lock);
2436 			if (ip_debug > 2) {
2437 				/* ip1dbg */
2438 				pr_addr_dbg("nce_timer: Delete NCE for"
2439 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
2440 				    &ncec->ncec_addr);
2441 			}
2442 			/* if static ARP can't delete. */
2443 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2444 				ncec_delete(ncec);
2445 
2446 		} else if (!NCE_PUBLISH(ncec)) {
2447 			/*
2448 			 * Probe count is 0 for a dynamic entry (one that we
2449 			 * ourselves are not publishing). We should never get
2450 			 * here if NONUD was requested, hence the ASSERT below.
2451 			 */
2452 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2453 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2454 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
2455 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			/*
			 * Count goes to -1, so the next expiry takes the
			 * "ncec_pcnt < 0" branch above.
			 */
2456 			ncec->ncec_pcnt--;
2457 			mutex_exit(&ncec->ncec_lock);
2458 			/* Wait one interval before killing */
2459 			nce_restart_timer(ncec,
2460 			    ill->ill_reachable_retrans_time);
2461 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2462 			ipif_t *ipif;
2463 			ipaddr_t ncec_addr;
2464 
2465 			/*
2466 			 * We're done probing, and we can now declare this
2467 			 * address to be usable.  Let IP know that it's ok to
2468 			 * use.
2469 			 */
2470 			ncec->ncec_state = ND_REACHABLE;
2471 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2472 			mutex_exit(&ncec->ncec_lock);
			/* Find the ipif that owns the address just verified. */
2473 			if (isv6) {
2474 				ipif = ipif_lookup_addr_exact_v6(
2475 				    &ncec->ncec_addr, ill, ipst);
2476 			} else {
2477 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2478 				    ncec_addr);
2479 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2480 				    ipst);
2481 			}
2482 			if (ipif != NULL) {
2483 				if (ipif->ipif_was_dup) {
2484 					char ibuf[LIFNAMSIZ];
2485 					char sbuf[INET6_ADDRSTRLEN];
2486 
2487 					ipif->ipif_was_dup = B_FALSE;
2488 					(void) inet_ntop(AF_INET6,
2489 					    &ipif->ipif_v6lcl_addr,
2490 					    sbuf, sizeof (sbuf));
2491 					ipif_get_name(ipif, ibuf,
2492 					    sizeof (ibuf));
2493 					cmn_err(CE_NOTE, "recovered address "
2494 					    "%s on %s", sbuf, ibuf);
2495 				}
2496 				if ((ipif->ipif_flags & IPIF_UP) &&
2497 				    !ipif->ipif_addr_ready)
2498 					ipif_up_notify(ipif);
2499 				ipif->ipif_addr_ready = 1;
2500 				ipif_refrele(ipif);
2501 			}
			/*
			 * ncec_lock was already released above, so leaving
			 * the switch from here is balanced.
			 */
2502 			if (!isv6 && arp_no_defense)
2503 				break;
2504 			/* Begin defending our new address */
			/*
			 * NOTE(review): ncec_unsolicit_count and
			 * ncec_last_time_defended are updated below without
			 * ncec_lock, whereas the ND_REACHABLE case reacquires
			 * the lock for the same updates -- confirm this is
			 * safe.
			 */
2505 			if (ncec->ncec_unsolicit_count > 0) {
2506 				ncec->ncec_unsolicit_count--;
2507 				if (isv6) {
2508 					dropped = ndp_announce(ncec);
2509 				} else {
2510 					dropped = arp_announce(ncec);
2511 				}
2512 
				/* A dropped announcement is not counted. */
2513 				if (dropped)
2514 					ncec->ncec_unsolicit_count++;
2515 				else
2516 					ncec->ncec_last_time_defended =
2517 					    ddi_get_lbolt();
2518 			}
2519 			if (ncec->ncec_unsolicit_count > 0) {
2520 				nce_restart_timer(ncec,
2521 				    ANNOUNCE_INTERVAL(isv6));
2522 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
2523 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2524 			}
2525 		} else {
2526 			/*
2527 			 * This is an address we're probing to be our own, but
2528 			 * the ill is down.  Wait until it comes back before
2529 			 * doing anything, but switch to reachable state so
2530 			 * that the restart will work.
2531 			 */
2532 			ncec->ncec_state = ND_REACHABLE;
2533 			mutex_exit(&ncec->ncec_lock);
2534 		}
2535 		break;
2536 	case ND_INCOMPLETE: {
2537 		mblk_t	*mp, *nextmp;
2538 		mblk_t	**prevmpp;
2539 
2540 		/*
2541 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2542 		 * for any IPMP probe packets, and toss them.  IPMP probe
2543 		 * packets will always be at the head of ncec_qd_mp, so that
2544 		 * we can stop at the first queued ND packet that is
2545 		 * not a probe packet.
2546 		 */
		/*
		 * prevmpp tracks the link that points at the current
		 * packet so a freed packet can be unlinked in place.
		 */
2547 		prevmpp = &ncec->ncec_qd_mp;
2548 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2549 			nextmp = mp->b_next;
2550 
2551 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2552 				inet_freemsg(mp);
2553 				ncec->ncec_nprobes--;
2554 				*prevmpp = nextmp;
2555 			} else {
2556 				prevmpp = &mp->b_next;
2557 			}
2558 		}
2559 
2560 		/*
2561 		 * Must be resolver's retransmit timer.
2562 		 */
2563 		mutex_exit(&ncec->ncec_lock);
2564 		ip_ndp_resolve(ncec);
2565 		break;
2566 	}
2567 	case ND_REACHABLE:
		/*
		 * Something is sent only if unsolicited advertisements are
		 * still pending, or this is a published entry and a
		 * defense interval is configured.
		 */
2568 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2569 		    ncec->ncec_unsolicit_count != 0) ||
2570 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2571 			if (ncec->ncec_unsolicit_count > 0) {
2572 				ncec->ncec_unsolicit_count--;
2573 				mutex_exit(&ncec->ncec_lock);
2574 				/*
2575 				 * When we get to zero announcements left,
2576 				 * switch to address defense
2577 				 */
2578 			} else {
2579 				boolean_t rate_limit;
2580 
2581 				mutex_exit(&ncec->ncec_lock);
2582 				rate_limit = ill_defend_rate_limit(ill, ncec);
				/* Rate limited: send nothing this round. */
2583 				if (rate_limit) {
2584 					nce_restart_timer(ncec,
2585 					    DEFENSE_INTERVAL(isv6));
2586 					break;
2587 				}
2588 			}
			/* ncec_lock is not held across the announcement. */
2589 			if (isv6) {
2590 				dropped = ndp_announce(ncec);
2591 			} else {
2592 				dropped = arp_announce(ncec);
2593 			}
2594 			mutex_enter(&ncec->ncec_lock);
2595 			if (dropped) {
2596 				ncec->ncec_unsolicit_count++;
2597 			} else {
2598 				ncec->ncec_last_time_defended =
2599 				    ddi_get_lbolt();
2600 			}
2601 			mutex_exit(&ncec->ncec_lock);
2602 			if (ncec->ncec_unsolicit_count != 0) {
2603 				nce_restart_timer(ncec,
2604 				    ANNOUNCE_INTERVAL(isv6));
2605 			} else {
2606 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2607 			}
2608 		} else {
2609 			mutex_exit(&ncec->ncec_lock);
2610 		}
2611 		break;
2612 	default:
2613 		mutex_exit(&ncec->ncec_lock);
2614 		break;
2615 	}
	/*
	 * No goto in this function targets the label below; it is reached by
	 * falling out of the switch.  Drop the reference taken on entry and
	 * the hold on src_ill.
	 */
2616 done:
2617 	ncec_refrele(ncec);
2618 	ill_refrele(src_ill);
2619 }
2620 
2621 /*
2622  * Set a link layer address from the ll_addr passed in.
2623  * Copy SAP from ill.
2624  */
2625 static void
2626 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2627 {
2628 	ill_t	*ill = ncec->ncec_ill;
2629 
2630 	ASSERT(ll_addr != NULL);
2631 	if (ill->ill_phys_addr_length > 0) {
2632 		/*
2633 		 * The bcopy() below used to be called for the physical address
2634 		 * length rather than the link layer address length. For
2635 		 * ethernet and many other media, the phys_addr and lla are
2636 		 * identical.
2637 		 *
2638 		 * The phys_addr and lla may not be the same for devices that
2639 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2640 		 * no known instances of these.
2641 		 *
2642 		 * For PPP or other interfaces with a zero length
2643 		 * physical address, don't do anything here.
2644 		 * The bcopy() with a zero phys_addr length was previously
2645 		 * a no-op for interfaces with a zero-length physical address.
2646 		 * Using the lla for them would change the way they operate.
2647 		 * Doing nothing in such cases preserves expected behavior.
2648 		 */
2649 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2650 	}
2651 }
2652 
2653 boolean_t
2654 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2655     uint32_t ll_addr_len)
2656 {
2657 	ASSERT(ncec->ncec_lladdr != NULL);
2658 	if (ll_addr == NULL)
2659 		return (B_FALSE);
2660 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2661 		return (B_TRUE);
2662 	return (B_FALSE);
2663 }
2664 
2665 /*
2666  * Updates the link layer address or the reachability state of
2667  * a cache entry.  Reset probe counter if needed.
2668  */
2669 void
2670 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2671 {
2672 	ill_t	*ill = ncec->ncec_ill;
2673 	boolean_t need_stop_timer = B_FALSE;
2674 	boolean_t need_fastpath_update = B_FALSE;
2675 	nce_t	*nce = NULL;
2676 	timeout_id_t tid;
2677 
2678 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2679 	/*
2680 	 * If this interface does not do NUD, there is no point
2681 	 * in allowing an update to the cache entry.  Although
2682 	 * we will respond to NS.
2683 	 * The only time we accept an update for a resolver when
2684 	 * NUD is turned off is when it has just been created.
2685 	 * Non-Resolvers will always be created as REACHABLE.
2686 	 */
2687 	if (new_state != ND_UNCHANGED) {
2688 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2689 		    (ncec->ncec_state != ND_INCOMPLETE))
2690 			return;
2691 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2692 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2693 		need_stop_timer = B_TRUE;
2694 		if (new_state == ND_REACHABLE)
2695 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2696 		else {
2697 			/* We force NUD in this case */
2698 			ncec->ncec_last = 0;
2699 		}
2700 		ncec->ncec_state = new_state;
2701 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2702 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2703 		    new_state == ND_INCOMPLETE);
2704 	}
2705 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2706 		tid = ncec->ncec_timeout_id;
2707 		ncec->ncec_timeout_id = 0;
2708 	}
2709 	/*
2710 	 * Re-trigger fastpath probe and
2711 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2712 	 * whatever packets that happens to be transmitting at the time.
2713 	 */
2714 	if (new_ll_addr != NULL) {
2715 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2716 		    ill->ill_phys_addr_length);
2717 		need_fastpath_update = B_TRUE;
2718 	}
2719 	mutex_exit(&ncec->ncec_lock);
2720 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2721 		if (tid != 0)
2722 			(void) untimeout(tid);
2723 	}
2724 	if (need_fastpath_update) {
2725 		/*
2726 		 * Delete any existing existing dlur_mp and fp_mp information.
2727 		 * For IPMP interfaces, all underlying ill's must be checked
2728 		 * and purged.
2729 		 */
2730 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2731 		/*
2732 		 * add the new dlur_mp and fp_mp
2733 		 */
2734 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2735 		if (nce != NULL)
2736 			nce_refrele(nce);
2737 	}
2738 	mutex_enter(&ncec->ncec_lock);
2739 }
2740 
2741 static void
2742 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2743 {
2744 	uint_t	count = 0;
2745 	mblk_t  **mpp, *tmp;
2746 
2747 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2748 
2749 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2750 		if (++count > ncec->ncec_ill->ill_max_buf) {
2751 			tmp = ncec->ncec_qd_mp->b_next;
2752 			ncec->ncec_qd_mp->b_next = NULL;
2753 			/*
2754 			 * if we never create data addrs on the under_ill
2755 			 * does this matter?
2756 			 */
2757 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2758 			    ipIfStatsOutDiscards);
2759 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2760 			    ncec->ncec_ill);
2761 			freemsg(ncec->ncec_qd_mp);
2762 			ncec->ncec_qd_mp = tmp;
2763 		}
2764 	}
2765 
2766 	if (head_insert) {
2767 		ncec->ncec_nprobes++;
2768 		mp->b_next = ncec->ncec_qd_mp;
2769 		ncec->ncec_qd_mp = mp;
2770 	} else {
2771 		*mpp = mp;
2772 	}
2773 }
2774 
2775 /*
2776  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2777  * queued at the head or tail of the queue based on the input argument
2778  * 'head_insert'. The caller should specify this argument as B_TRUE if this
2779  * packet is an IPMP probe packet, in which case the following happens:
2780  *
2781  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
2782  *	(non-ipmp_probe) load-speading case where the source address of the ND
2783  *	packet is not tied to ncec_ill. If the ill bound to the source address
2784  *	cannot receive, the response to the ND packet will not be received.
2785  *	However, if ND packets for ncec_ill's probes are queued	behind that ND
2786  *	packet, those probes will also fail to be sent, and thus in.mpathd will
2787  *	 erroneously conclude that ncec_ill has also failed.
2788  *
2789  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did	not succeed on
2790  *	the first attempt.  This ensures that ND problems do not manifest as
2791  *	probe RTT spikes.
2792  *
2793  * We achieve this by inserting ipmp_probe() packets at the head of the
2794  * nce_queue.
2795  *
2796  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2797  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2798  */
2799 void
2800 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2801 {
2802 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2803 	nce_queue_mp_common(ncec, mp, head_insert);
2804 }
2805 
2806 /*
2807  * Called when address resolution failed due to a timeout.
2808  * Send an ICMP unreachable in response to all queued packets.
2809  */
2810 void
2811 ndp_resolv_failed(ncec_t *ncec)
2812 {
2813 	mblk_t	*mp, *nxt_mp;
2814 	char	buf[INET6_ADDRSTRLEN];
2815 	ill_t *ill = ncec->ncec_ill;
2816 	ip_recv_attr_t	iras;
2817 
2818 	bzero(&iras, sizeof (iras));
2819 	iras.ira_flags = 0;
2820 	/*
2821 	 * we are setting the ira_rill to the ipmp_ill (instead of
2822 	 * the actual ill on which the packet was received), but this
2823 	 * is ok because we don't actually need the real ira_rill.
2824 	 * to send the icmp unreachable to the sender.
2825 	 */
2826 	iras.ira_ill = iras.ira_rill = ill;
2827 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2828 	iras.ira_rifindex = iras.ira_ruifindex;
2829 
2830 	ip1dbg(("ndp_resolv_failed: dst %s\n",
2831 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2832 	mutex_enter(&ncec->ncec_lock);
2833 	mp = ncec->ncec_qd_mp;
2834 	ncec->ncec_qd_mp = NULL;
2835 	ncec->ncec_nprobes = 0;
2836 	mutex_exit(&ncec->ncec_lock);
2837 	while (mp != NULL) {
2838 		nxt_mp = mp->b_next;
2839 		mp->b_next = NULL;
2840 
2841 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2842 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2843 		    mp, ill);
2844 		icmp_unreachable_v6(mp,
2845 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2846 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2847 		mp = nxt_mp;
2848 	}
2849 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2850 }
2851 
2852 /*
2853  * Handle the completion of NDP and ARP resolution.
2854  */
2855 void
2856 nce_resolv_ok(ncec_t *ncec)
2857 {
2858 	mblk_t *mp;
2859 	uint_t pkt_len;
2860 	iaflags_t ixaflags = IXAF_NO_TRACE;
2861 	nce_t *nce;
2862 	ill_t	*ill = ncec->ncec_ill;
2863 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2864 	ip_stack_t *ipst = ill->ill_ipst;
2865 
2866 	if (IS_IPMP(ncec->ncec_ill)) {
2867 		nce_resolv_ipmp_ok(ncec);
2868 		return;
2869 	}
2870 	/* non IPMP case */
2871 
2872 	mutex_enter(&ncec->ncec_lock);
2873 	ASSERT(ncec->ncec_nprobes == 0);
2874 	mp = ncec->ncec_qd_mp;
2875 	ncec->ncec_qd_mp = NULL;
2876 	mutex_exit(&ncec->ncec_lock);
2877 
2878 	while (mp != NULL) {
2879 		mblk_t *nxt_mp;
2880 
2881 		if (ill->ill_isv6) {
2882 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2883 
2884 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2885 		} else {
2886 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
2887 
2888 			ixaflags |= IXAF_IS_IPV4;
2889 			pkt_len = ntohs(ipha->ipha_length);
2890 		}
2891 		nxt_mp = mp->b_next;
2892 		mp->b_next = NULL;
2893 		/*
2894 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2895 		 * longer available, but it's ok to drop this flag because TCP
2896 		 * has its own flow-control in effect, so TCP packets
2897 		 * are not likely to get here when flow-control is in effect.
2898 		 */
2899 		mutex_enter(&ill->ill_lock);
2900 		nce = nce_lookup(ill, &ncec->ncec_addr);
2901 		mutex_exit(&ill->ill_lock);
2902 
2903 		if (nce == NULL) {
2904 			if (isv6) {
2905 				BUMP_MIB(&ipst->ips_ip6_mib,
2906 				    ipIfStatsOutDiscards);
2907 			} else {
2908 				BUMP_MIB(&ipst->ips_ip_mib,
2909 				    ipIfStatsOutDiscards);
2910 			}
2911 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2912 			    mp, NULL);
2913 			freemsg(mp);
2914 		} else {
2915 			/*
2916 			 * We don't know the zoneid, but
2917 			 * ip_xmit does not care since IXAF_NO_TRACE
2918 			 * is set. (We traced the packet the first
2919 			 * time through ip_xmit.)
2920 			 */
2921 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2922 			    ALL_ZONES, 0, NULL);
2923 			nce_refrele(nce);
2924 		}
2925 		mp = nxt_mp;
2926 	}
2927 
2928 	ncec_cb_dispatch(ncec); /* complete callbacks */
2929 }
2930 
2931 /*
2932  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2933  * and the corresponding attributes.
2934  * Disallow states other than ND_REACHABLE or ND_STALE.
2935  */
2936 int
2937 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2938 {
2939 	sin6_t		*sin6;
2940 	in6_addr_t	*addr;
2941 	ncec_t		*ncec;
2942 	nce_t		*nce;
2943 	int		err = 0;
2944 	uint16_t	new_flags = 0;
2945 	uint16_t	old_flags = 0;
2946 	int		inflags = lnr->lnr_flags;
2947 	ip_stack_t	*ipst = ill->ill_ipst;
2948 	boolean_t	do_postprocess = B_FALSE;
2949 
2950 	ASSERT(ill->ill_isv6);
2951 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2952 	    (lnr->lnr_state_create != ND_STALE))
2953 		return (EINVAL);
2954 
2955 	sin6 = (sin6_t *)&lnr->lnr_addr;
2956 	addr = &sin6->sin6_addr;
2957 
2958 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2959 	ASSERT(!IS_UNDER_IPMP(ill));
2960 	nce = nce_lookup_addr(ill, addr);
2961 	if (nce != NULL)
2962 		new_flags = nce->nce_common->ncec_flags;
2963 
2964 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2965 	case NDF_ISROUTER_ON:
2966 		new_flags |= NCE_F_ISROUTER;
2967 		break;
2968 	case NDF_ISROUTER_OFF:
2969 		new_flags &= ~NCE_F_ISROUTER;
2970 		break;
2971 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2972 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2973 		if (nce != NULL)
2974 			nce_refrele(nce);
2975 		return (EINVAL);
2976 	}
2977 
2978 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2979 	case NDF_ANYCAST_ON:
2980 		new_flags |= NCE_F_ANYCAST;
2981 		break;
2982 	case NDF_ANYCAST_OFF:
2983 		new_flags &= ~NCE_F_ANYCAST;
2984 		break;
2985 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2986 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2987 		if (nce != NULL)
2988 			nce_refrele(nce);
2989 		return (EINVAL);
2990 	}
2991 
2992 	if (nce == NULL) {
2993 		err = nce_add_v6(ill,
2994 		    (uchar_t *)lnr->lnr_hdw_addr,
2995 		    ill->ill_phys_addr_length,
2996 		    addr,
2997 		    new_flags,
2998 		    lnr->lnr_state_create,
2999 		    &nce);
3000 		if (err != 0) {
3001 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3002 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3003 			return (err);
3004 		} else {
3005 			do_postprocess = B_TRUE;
3006 		}
3007 	}
3008 	ncec = nce->nce_common;
3009 	old_flags = ncec->ncec_flags;
3010 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3011 		ncec_router_to_host(ncec);
3012 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3013 		if (do_postprocess)
3014 			err = nce_add_v6_postprocess(nce);
3015 		nce_refrele(nce);
3016 		return (0);
3017 	}
3018 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3019 
3020 	if (do_postprocess)
3021 		err = nce_add_v6_postprocess(nce);
3022 	/*
3023 	 * err cannot be anything other than 0 because we don't support
3024 	 * proxy arp of static addresses.
3025 	 */
3026 	ASSERT(err == 0);
3027 
3028 	mutex_enter(&ncec->ncec_lock);
3029 	ncec->ncec_flags = new_flags;
3030 	mutex_exit(&ncec->ncec_lock);
3031 	/*
3032 	 * Note that we ignore the state at this point, which
3033 	 * should be either STALE or REACHABLE.  Instead we let
3034 	 * the link layer address passed in to determine the state
3035 	 * much like incoming packets.
3036 	 */
3037 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3038 	nce_refrele(nce);
3039 	return (0);
3040 }
3041 
3042 /*
3043  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3044  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3045  * be held to ensure that they are in the same group.
3046  */
3047 static nce_t *
3048 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3049 {
3050 
3051 	nce_t *nce;
3052 
3053 	nce = nce_ill_lookup_then_add(ill, ncec);
3054 
3055 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3056 		return (nce);
3057 
3058 	/*
3059 	 * hold the ncec_lock to synchronize with nce_update() so that,
3060 	 * at the end of this function, the contents of nce_dlur_mp are
3061 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3062 	 * packet may have been sent out with a mangled address, which would
3063 	 * only be a transient condition.
3064 	 */
3065 	mutex_enter(&ncec->ncec_lock);
3066 	if (ncec->ncec_lladdr != NULL) {
3067 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3068 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3069 	} else {
3070 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3071 		    ill->ill_sap_length);
3072 	}
3073 	mutex_exit(&ncec->ncec_lock);
3074 	return (nce);
3075 }
3076 
3077 /*
3078  * we make nce_fp_mp to have an M_DATA prepend.
3079  * The caller ensures there is hold on ncec for this function.
3080  * Note that since ill_fastpath_probe() copies the mblk there is
3081  * no need to hold the nce or ncec beyond this function.
3082  *
3083  * If the caller has passed in a non-null ncec_nce to nce_faspath() that
3084  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3085  * and will be returned back by this function, so that no extra nce_refrele
3086  * is required for the caller. The calls from nce_add_common() use this
3087  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3088  * nce_refrele of the returned nce (when it is non-null).
3089  */
3090 nce_t *
3091 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3092 {
3093 	nce_t *nce;
3094 	ill_t *ill = ncec->ncec_ill;
3095 
3096 	ASSERT(ill != NULL);
3097 
3098 	if (IS_IPMP(ill) && trigger_fp_req) {
3099 		trigger_fp_req = B_FALSE;
3100 		ipmp_ncec_fastpath(ncec, ill);
3101 
3102 	}
3103 	/*
3104 	 * If the caller already has the nce corresponding to the ill, use
3105 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3106 	 * nce_add_common() fall in the former category, and have just done
3107 	 * the nce lookup/add that can be reused.
3108 	 */
3109 	if (ncec_nce == NULL)
3110 		nce = nce_fastpath_create(ill, ncec);
3111 	else
3112 		nce = ncec_nce;
3113 
3114 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3115 		return (nce);
3116 
3117 	if (trigger_fp_req)
3118 		nce_fastpath_trigger(nce);
3119 	return (nce);
3120 }
3121 
3122 /*
3123  * Trigger fastpath on nce. No locks may be held.
3124  */
3125 static void
3126 nce_fastpath_trigger(nce_t *nce)
3127 {
3128 	int res;
3129 	ill_t *ill = nce->nce_ill;
3130 	ncec_t *ncec = nce->nce_common;
3131 
3132 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3133 	/*
3134 	 * EAGAIN is an indication of a transient error
3135 	 * i.e. allocation failure etc. leave the ncec in the list it
3136 	 * will be updated when another probe happens for another ire
3137 	 * if not it will be taken out of the list when the ire is
3138 	 * deleted.
3139 	 */
3140 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3141 		nce_fastpath_list_delete(ill, ncec, NULL);
3142 }
3143 
3144 /*
3145  * Add ncec to the nce fastpath list on ill.
3146  */
3147 static nce_t *
3148 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3149 {
3150 	nce_t *nce = NULL;
3151 
3152 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3153 	/*
3154 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3155 	 * down, before adding the NCE.
3156 	 */
3157 	if (ill->ill_state_flags & ILL_CONDEMNED)
3158 		return (NULL);
3159 	mutex_enter(&ncec->ncec_lock);
3160 	/*
3161 	 * if ncec has not been deleted and
3162 	 * is not already in the list add it.
3163 	 */
3164 	if (!NCE_ISCONDEMNED(ncec)) {
3165 		nce = nce_lookup(ill, &ncec->ncec_addr);
3166 		if (nce != NULL)
3167 			goto done;
3168 		nce = nce_add(ill, ncec);
3169 	}
3170 done:
3171 	mutex_exit(&ncec->ncec_lock);
3172 	return (nce);
3173 }
3174 
3175 nce_t *
3176 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3177 {
3178 	nce_t *nce;
3179 
3180 	mutex_enter(&ill->ill_lock);
3181 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
3182 	mutex_exit(&ill->ill_lock);
3183 	return (nce);
3184 }
3185 
3186 
3187 /*
3188  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3189  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3190  * entry after all locks have been dropped.
3191  */
3192 void
3193 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3194 {
3195 	nce_t *nce;
3196 
3197 	ASSERT(ill != NULL);
3198 
3199 	/* first clean out any nce pointers in the under_ills */
3200 	if (IS_IPMP(ill))
3201 		ipmp_ncec_flush_nce(ncec);
3202 
3203 	/* now the ill itself */
3204 	mutex_enter(&ill->ill_lock);
3205 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3206 	    nce = list_next(&ill->ill_nce, nce)) {
3207 		if (nce->nce_common == ncec) {
3208 			nce_refhold(nce);
3209 			nce_delete(nce);
3210 			break;
3211 		}
3212 	}
3213 	mutex_exit(&ill->ill_lock);
3214 	if (nce != NULL) {
3215 		if (dead == NULL)
3216 			nce_refrele(nce);
3217 		else
3218 			list_insert_tail(dead, nce);
3219 	}
3220 }
3221 
3222 /*
3223  * when the fastpath response does not fit in the datab
3224  * associated with the existing nce_fp_mp, we delete and
3225  * add the nce to retrigger fastpath based on the information
3226  * in the ncec_t.
3227  */
3228 static nce_t *
3229 nce_delete_then_add(nce_t *nce)
3230 {
3231 	ill_t		*ill = nce->nce_ill;
3232 	nce_t		*newnce = NULL;
3233 
3234 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3235 	    (void *)nce, ill->ill_name));
3236 	mutex_enter(&ill->ill_lock);
3237 	mutex_enter(&nce->nce_common->ncec_lock);
3238 	nce_delete(nce);
3239 	/*
3240 	 * Make sure that ncec is not condemned before adding. We hold the
3241 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3242 	 * ipmp_ncec_flush_nce()
3243 	 */
3244 	if (!NCE_ISCONDEMNED(nce->nce_common))
3245 		newnce = nce_add(ill, nce->nce_common);
3246 	mutex_exit(&nce->nce_common->ncec_lock);
3247 	mutex_exit(&ill->ill_lock);
3248 	nce_refrele(nce);
3249 	return (newnce); /* could be null if nomem */
3250 }
3251 
3252 typedef struct nce_fp_match_s {
3253 	nce_t	*nce_fp_match_res;
3254 	mblk_t	*nce_fp_match_ack_mp;
3255 } nce_fp_match_t;
3256 
3257 /* ARGSUSED */
3258 static int
3259 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3260 {
3261 	nce_fp_match_t	*nce_fp_marg = arg;
3262 	ncec_t		*ncec = nce->nce_common;
3263 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3264 	uchar_t	*mp_rptr, *ud_mp_rptr;
3265 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3266 	ptrdiff_t	cmplen;
3267 
3268 	/*
3269 	 * mp is the mp associated with the fastpath ack.
3270 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3271 	 * under consideration. If the contents match, then the
3272 	 * fastpath ack is used to update the nce.
3273 	 */
3274 	if (ud_mp == NULL)
3275 		return (0);
3276 	mp_rptr = mp->b_rptr;
3277 	cmplen = mp->b_wptr - mp_rptr;
3278 	ASSERT(cmplen >= 0);
3279 
3280 	ud_mp_rptr = ud_mp->b_rptr;
3281 	/*
3282 	 * The ncec is locked here to prevent any other threads from accessing
3283 	 * and changing nce_dlur_mp when the address becomes resolved to an
3284 	 * lla while we're in the middle of looking at and comparing the
3285 	 * hardware address (lla). It is also locked to prevent multiple
3286 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3287 	 * time.
3288 	 */
3289 	mutex_enter(&ncec->ncec_lock);
3290 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3291 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3292 		nce_fp_marg->nce_fp_match_res = nce;
3293 		mutex_exit(&ncec->ncec_lock);
3294 		nce_refhold(nce);
3295 		return (1);
3296 	}
3297 	mutex_exit(&ncec->ncec_lock);
3298 	return (0);
3299 }
3300 
3301 /*
3302  * Update all NCE's that are not in fastpath mode and
3303  * have an nce_fp_mp that matches mp. mp->b_cont contains
3304  * the fastpath header.
3305  *
3306  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3307  */
3308 void
3309 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3310 {
3311 	nce_fp_match_t nce_fp_marg;
3312 	nce_t *nce;
3313 	mblk_t *nce_fp_mp, *fp_mp;
3314 
3315 	nce_fp_marg.nce_fp_match_res = NULL;
3316 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3317 
3318 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3319 
3320 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3321 		return;
3322 
3323 	mutex_enter(&nce->nce_lock);
3324 	nce_fp_mp = nce->nce_fp_mp;
3325 
3326 	if (nce_fp_mp != NULL) {
3327 		fp_mp = mp->b_cont;
3328 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3329 		    nce_fp_mp->b_datap->db_lim) {
3330 			mutex_exit(&nce->nce_lock);
3331 			nce = nce_delete_then_add(nce);
3332 			if (nce == NULL) {
3333 				return;
3334 			}
3335 			mutex_enter(&nce->nce_lock);
3336 			nce_fp_mp = nce->nce_fp_mp;
3337 		}
3338 	}
3339 
3340 	/* Matched - install mp as the fastpath mp */
3341 	if (nce_fp_mp == NULL) {
3342 		fp_mp = dupb(mp->b_cont);
3343 		nce->nce_fp_mp = fp_mp;
3344 	} else {
3345 		fp_mp = mp->b_cont;
3346 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3347 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3348 		    + MBLKL(fp_mp);
3349 	}
3350 	mutex_exit(&nce->nce_lock);
3351 	nce_refrele(nce);
3352 }
3353 
3354 /*
3355  * Return a pointer to a given option in the packet.
3356  * Assumes that option part of the packet have already been validated.
3357  */
3358 nd_opt_hdr_t *
3359 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3360 {
3361 	while (optlen > 0) {
3362 		if (opt->nd_opt_type == opt_type)
3363 			return (opt);
3364 		optlen -= 8 * opt->nd_opt_len;
3365 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3366 	}
3367 	return (NULL);
3368 }
3369 
3370 /*
3371  * Verify all option lengths present are > 0, also check to see
3372  * if the option lengths and packet length are consistent.
3373  */
3374 boolean_t
3375 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3376 {
3377 	ASSERT(opt != NULL);
3378 	while (optlen > 0) {
3379 		if (opt->nd_opt_len == 0)
3380 			return (B_FALSE);
3381 		optlen -= 8 * opt->nd_opt_len;
3382 		if (optlen < 0)
3383 			return (B_FALSE);
3384 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3385 	}
3386 	return (B_TRUE);
3387 }
3388 
3389 /*
3390  * ncec_walk function.
3391  * Free a fraction of the NCE cache entries.
3392  *
3393  * A possible optimization here would be to use ncec_last where possible, and
3394  * delete the least-frequently used entry, which would require more complex
3395  * computation as we walk through the ncec's (e.g., track ncec entries by
3396  * order of ncec_last and/or maintain state)
3397  */
3398 static void
3399 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3400 {
3401 	ip_stack_t	*ipst = ncec->ncec_ipst;
3402 	uint_t		fraction = *(uint_t *)arg;
3403 	uint_t		rand;
3404 
3405 	if ((ncec->ncec_flags &
3406 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3407 		return;
3408 	}
3409 
3410 	rand = (uint_t)ddi_get_lbolt() +
3411 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3412 	if ((rand/fraction)*fraction == rand) {
3413 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3414 		ncec_delete(ncec);
3415 	}
3416 }
3417 
3418 /*
3419  * kmem_cache callback to free up memory.
3420  *
3421  * For now we just delete a fixed fraction.
3422  */
3423 static void
3424 ip_nce_reclaim_stack(ip_stack_t *ipst)
3425 {
3426 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3427 
3428 	IP_STAT(ipst, ip_nce_reclaim_calls);
3429 
3430 	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3431 
3432 	/*
3433 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3434 	 * Get them to update any stale references to drop any refholds they
3435 	 * have.
3436 	 */
3437 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3438 }
3439 
3440 /*
3441  * Called by the memory allocator subsystem directly, when the system
3442  * is running low on memory.
3443  */
3444 /* ARGSUSED */
3445 void
3446 ip_nce_reclaim(void *args)
3447 {
3448 	netstack_handle_t nh;
3449 	netstack_t *ns;
3450 
3451 	netstack_next_init(&nh);
3452 	while ((ns = netstack_next(&nh)) != NULL) {
3453 		ip_nce_reclaim_stack(ns->netstack_ip);
3454 		netstack_rele(ns);
3455 	}
3456 	netstack_next_fini(&nh);
3457 }
3458 
3459 #ifdef DEBUG
3460 void
3461 ncec_trace_ref(ncec_t *ncec)
3462 {
3463 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3464 
3465 	if (ncec->ncec_trace_disable)
3466 		return;
3467 
3468 	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3469 		ncec->ncec_trace_disable = B_TRUE;
3470 		ncec_trace_cleanup(ncec);
3471 	}
3472 }
3473 
3474 void
3475 ncec_untrace_ref(ncec_t *ncec)
3476 {
3477 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3478 
3479 	if (!ncec->ncec_trace_disable)
3480 		th_trace_unref(ncec);
3481 }
3482 
3483 static void
3484 ncec_trace_cleanup(const ncec_t *ncec)
3485 {
3486 	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3487 }
3488 #endif
3489 
3490 /*
3491  * Called when address resolution fails due to a timeout.
3492  * Send an ICMP unreachable in response to all queued packets.
3493  */
3494 void
3495 arp_resolv_failed(ncec_t *ncec)
3496 {
3497 	mblk_t	*mp, *nxt_mp;
3498 	char	buf[INET6_ADDRSTRLEN];
3499 	struct in_addr ipv4addr;
3500 	ill_t *ill = ncec->ncec_ill;
3501 	ip_stack_t *ipst = ncec->ncec_ipst;
3502 	ip_recv_attr_t	iras;
3503 
3504 	bzero(&iras, sizeof (iras));
3505 	iras.ira_flags = IRAF_IS_IPV4;
3506 	/*
3507 	 * we are setting the ira_rill to the ipmp_ill (instead of
3508 	 * the actual ill on which the packet was received), but this
3509 	 * is ok because we don't actually need the real ira_rill.
3510 	 * to send the icmp unreachable to the sender.
3511 	 */
3512 	iras.ira_ill = iras.ira_rill = ill;
3513 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3514 	iras.ira_rifindex = iras.ira_ruifindex;
3515 
3516 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3517 	ip3dbg(("arp_resolv_failed: dst %s\n",
3518 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3519 	mutex_enter(&ncec->ncec_lock);
3520 	mp = ncec->ncec_qd_mp;
3521 	ncec->ncec_qd_mp = NULL;
3522 	ncec->ncec_nprobes = 0;
3523 	mutex_exit(&ncec->ncec_lock);
3524 	while (mp != NULL) {
3525 		nxt_mp = mp->b_next;
3526 		mp->b_next = NULL;
3527 
3528 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3529 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3530 		    mp, ill);
3531 		if (ipst->ips_ip_arp_icmp_error) {
3532 			ip3dbg(("arp_resolv_failed: "
3533 			    "Calling icmp_unreachable\n"));
3534 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3535 		} else {
3536 			freemsg(mp);
3537 		}
3538 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3539 		mp = nxt_mp;
3540 	}
3541 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3542 }
3543 
3544 /*
3545  * if ill is an under_ill, translate it to the ipmp_ill and add the
3546  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3547  * one on the underlying in_ill) will be created for the
3548  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3549  */
3550 int
3551 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3552     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3553 {
3554 	int	err;
3555 	in6_addr_t addr6;
3556 	ip_stack_t *ipst = ill->ill_ipst;
3557 	nce_t	*nce, *upper_nce = NULL;
3558 	ill_t	*in_ill = ill, *under = NULL;
3559 	boolean_t need_ill_refrele = B_FALSE;
3560 
3561 	if (flags & NCE_F_MCAST) {
3562 		/*
3563 		 * hw_addr will be figured out in nce_set_multicast_v4;
3564 		 * caller needs to pass in the cast_ill for ipmp
3565 		 */
3566 		ASSERT(hw_addr == NULL);
3567 		ASSERT(!IS_IPMP(ill));
3568 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3569 		return (err);
3570 	}
3571 
3572 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3573 		ill = ipmp_ill_hold_ipmp_ill(ill);
3574 		if (ill == NULL)
3575 			return (ENXIO);
3576 		need_ill_refrele = B_TRUE;
3577 	}
3578 	if ((flags & NCE_F_BCAST) != 0) {
3579 		/*
3580 		 * IPv4 broadcast ncec: compute the hwaddr.
3581 		 */
3582 		if (IS_IPMP(ill)) {
3583 			under = ipmp_ill_get_xmit_ill(ill, B_FALSE);
3584 			if (under == NULL)  {
3585 				if (need_ill_refrele)
3586 					ill_refrele(ill);
3587 				return (ENETDOWN);
3588 			}
3589 			hw_addr = under->ill_bcast_mp->b_rptr +
3590 			    NCE_LL_ADDR_OFFSET(under);
3591 			hw_addr_len = under->ill_phys_addr_length;
3592 		} else {
3593 			hw_addr = ill->ill_bcast_mp->b_rptr +
3594 			    NCE_LL_ADDR_OFFSET(ill),
3595 			    hw_addr_len = ill->ill_phys_addr_length;
3596 		}
3597 	}
3598 
3599 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3600 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3601 	nce = nce_lookup_addr(ill, &addr6);
3602 	if (nce == NULL) {
3603 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3604 		    state, &nce);
3605 	} else {
3606 		err = EEXIST;
3607 	}
3608 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3609 	if (err == 0)
3610 		err = nce_add_v4_postprocess(nce);
3611 
3612 	if (in_ill != ill && nce != NULL) {
3613 		nce_t *under_nce = NULL;
3614 
3615 		/*
3616 		 * in_ill was the under_ill. Try to create the under_nce.
3617 		 * Hold the ill_g_lock to prevent changes to group membership
3618 		 * until we are done.
3619 		 */
3620 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3621 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3622 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3623 			    ill_t *, ill);
3624 			rw_exit(&ipst->ips_ill_g_lock);
3625 			err = ENXIO;
3626 			nce_refrele(nce);
3627 			nce = NULL;
3628 			goto bail;
3629 		}
3630 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3631 		if (under_nce == NULL) {
3632 			rw_exit(&ipst->ips_ill_g_lock);
3633 			err = EINVAL;
3634 			nce_refrele(nce);
3635 			nce = NULL;
3636 			goto bail;
3637 		}
3638 		rw_exit(&ipst->ips_ill_g_lock);
3639 		upper_nce = nce;
3640 		nce = under_nce; /* will be returned to caller */
3641 		if (NCE_ISREACHABLE(nce->nce_common))
3642 			nce_fastpath_trigger(under_nce);
3643 	}
3644 	if (nce != NULL) {
3645 		if (newnce != NULL)
3646 			*newnce = nce;
3647 		else
3648 			nce_refrele(nce);
3649 	}
3650 bail:
3651 	if (under != NULL)
3652 		ill_refrele(under);
3653 	if (upper_nce != NULL)
3654 		nce_refrele(upper_nce);
3655 	if (need_ill_refrele)
3656 		ill_refrele(ill);
3657 
3658 	return (err);
3659 }
3660 
3661 /*
3662  * NDP Cache Entry creation routine for IPv4.
3663  * This routine must always be called with ndp4->ndp_g_lock held.
3664  * Prior to return, ncec_refcnt is incremented.
3665  *
3666  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3667  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3668  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3669  * entries will be created, both pointing at the same ncec_t. The nce_t
3670  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3671  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3672  * Local addresses are always created on the ill passed to nce_add_v4.
3673  */
3674 int
3675 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3676     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3677 {
3678 	int		err;
3679 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3680 	struct in6_addr	addr6;
3681 	nce_t		*nce;
3682 
3683 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3684 	ASSERT(!ill->ill_isv6);
3685 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3686 
3687 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3688 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3689 	    &nce);
3690 	ASSERT(newnce != NULL);
3691 	*newnce = nce;
3692 	return (err);
3693 }
3694 
3695 /*
3696  * Post-processing routine to be executed after nce_add_v4(). This function
3697  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3698  * and must be called without any locks held.
3699  *
3700  * Always returns 0, but we return an int to keep this symmetric with the
3701  * IPv6 counter-part.
3702  */
3703 int
3704 nce_add_v4_postprocess(nce_t *nce)
3705 {
3706 	ncec_t		*ncec = nce->nce_common;
3707 	uint16_t	flags = ncec->ncec_flags;
3708 	boolean_t	ndp_need_dad = B_FALSE;
3709 	boolean_t	dropped;
3710 	clock_t		delay;
3711 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3712 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3713 	boolean_t	trigger_fastpath = B_TRUE;
3714 
3715 	/*
3716 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3717 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3718 	 * We call nce_fastpath from nce_update if the link layer address of
3719 	 * the peer changes from nce_update
3720 	 */
3721 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3722 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3723 		trigger_fastpath = B_FALSE;
3724 
3725 	if (trigger_fastpath)
3726 		nce_fastpath_trigger(nce);
3727 
3728 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3729 		/*
3730 		 * Either the caller (by passing in ND_PROBE)
3731 		 * or nce_add_common() (by the internally computed state
3732 		 * based on ncec_addr and ill_net_type) has determined
3733 		 * that this unicast entry needs DAD. Trigger DAD.
3734 		 */
3735 		ndp_need_dad = B_TRUE;
3736 	} else if (flags & NCE_F_UNSOL_ADV) {
3737 		/*
3738 		 * We account for the transmit below by assigning one
3739 		 * less than the ndd variable. Subsequent decrements
3740 		 * are done in nce_timer.
3741 		 */
3742 		mutex_enter(&ncec->ncec_lock);
3743 		ncec->ncec_unsolicit_count =
3744 		    ipst->ips_ip_arp_publish_count - 1;
3745 		mutex_exit(&ncec->ncec_lock);
3746 		dropped = arp_announce(ncec);
3747 		mutex_enter(&ncec->ncec_lock);
3748 		if (dropped)
3749 			ncec->ncec_unsolicit_count++;
3750 		else
3751 			ncec->ncec_last_time_defended = ddi_get_lbolt();
3752 		if (ncec->ncec_unsolicit_count != 0) {
3753 			nce_start_timer(ncec,
3754 			    ipst->ips_ip_arp_publish_interval);
3755 		}
3756 		mutex_exit(&ncec->ncec_lock);
3757 	}
3758 
3759 	/*
3760 	 * If ncec_xmit_interval is 0, user has configured us to send the first
3761 	 * probe right away.  Do so, and set up for the subsequent probes.
3762 	 */
3763 	if (ndp_need_dad) {
3764 		mutex_enter(&ncec->ncec_lock);
3765 		if (ncec->ncec_pcnt == 0) {
3766 			/*
3767 			 * DAD probes and announce can be
3768 			 * administratively disabled by setting the
3769 			 * probe_count to zero. Restart the timer in
3770 			 * this case to mark the ipif as ready.
3771 			 */
3772 			ncec->ncec_unsolicit_count = 0;
3773 			mutex_exit(&ncec->ncec_lock);
3774 			nce_restart_timer(ncec, 0);
3775 		} else {
3776 			mutex_exit(&ncec->ncec_lock);
3777 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3778 			    ipst->ips_arp_probe_delay :
3779 			    ipst->ips_arp_fastprobe_delay);
3780 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3781 		}
3782 	}
3783 	return (0);
3784 }
3785 
3786 /*
3787  * ncec_walk routine to update all entries that have a given destination or
3788  * gateway address and cached link layer (MAC) address.  This is used when ARP
3789  * informs us that a network-to-link-layer mapping may have changed.
3790  */
3791 void
3792 nce_update_hw_changed(ncec_t *ncec, void *arg)
3793 {
3794 	nce_hw_map_t *hwm = arg;
3795 	ipaddr_t ncec_addr;
3796 
3797 	if (ncec->ncec_state != ND_REACHABLE)
3798 		return;
3799 
3800 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3801 	if (ncec_addr != hwm->hwm_addr)
3802 		return;
3803 
3804 	mutex_enter(&ncec->ncec_lock);
3805 	if (hwm->hwm_flags != 0)
3806 		ncec->ncec_flags = hwm->hwm_flags;
3807 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3808 	mutex_exit(&ncec->ncec_lock);
3809 }
3810 
3811 void
3812 ncec_refhold(ncec_t *ncec)
3813 {
3814 	mutex_enter(&(ncec)->ncec_lock);
3815 	(ncec)->ncec_refcnt++;
3816 	ASSERT((ncec)->ncec_refcnt != 0);
3817 #ifdef DEBUG
3818 	ncec_trace_ref(ncec);
3819 #endif
3820 	mutex_exit(&(ncec)->ncec_lock);
3821 }
3822 
3823 void
3824 ncec_refhold_notr(ncec_t *ncec)
3825 {
3826 	mutex_enter(&(ncec)->ncec_lock);
3827 	(ncec)->ncec_refcnt++;
3828 	ASSERT((ncec)->ncec_refcnt != 0);
3829 	mutex_exit(&(ncec)->ncec_lock);
3830 }
3831 
3832 static void
3833 ncec_refhold_locked(ncec_t *ncec)
3834 {
3835 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3836 	(ncec)->ncec_refcnt++;
3837 #ifdef DEBUG
3838 	ncec_trace_ref(ncec);
3839 #endif
3840 }
3841 
3842 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3843 void
3844 ncec_refrele(ncec_t *ncec)
3845 {
3846 	mutex_enter(&(ncec)->ncec_lock);
3847 #ifdef DEBUG
3848 	ncec_untrace_ref(ncec);
3849 #endif
3850 	ASSERT((ncec)->ncec_refcnt != 0);
3851 	if (--(ncec)->ncec_refcnt == 0) {
3852 		ncec_inactive(ncec);
3853 	} else {
3854 		mutex_exit(&(ncec)->ncec_lock);
3855 	}
3856 }
3857 
3858 void
3859 ncec_refrele_notr(ncec_t *ncec)
3860 {
3861 	mutex_enter(&(ncec)->ncec_lock);
3862 	ASSERT((ncec)->ncec_refcnt != 0);
3863 	if (--(ncec)->ncec_refcnt == 0) {
3864 		ncec_inactive(ncec);
3865 	} else {
3866 		mutex_exit(&(ncec)->ncec_lock);
3867 	}
3868 }
3869 
3870 /*
3871  * Common to IPv4 and IPv6.
3872  */
3873 void
3874 nce_restart_timer(ncec_t *ncec, uint_t ms)
3875 {
3876 	timeout_id_t tid;
3877 
3878 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3879 
3880 	/* First cancel any running timer */
3881 	mutex_enter(&ncec->ncec_lock);
3882 	tid = ncec->ncec_timeout_id;
3883 	ncec->ncec_timeout_id = 0;
3884 	if (tid != 0) {
3885 		mutex_exit(&ncec->ncec_lock);
3886 		(void) untimeout(tid);
3887 		mutex_enter(&ncec->ncec_lock);
3888 	}
3889 
3890 	/* Restart timer */
3891 	nce_start_timer(ncec, ms);
3892 	mutex_exit(&ncec->ncec_lock);
3893 }
3894 
3895 static void
3896 nce_start_timer(ncec_t *ncec, uint_t ms)
3897 {
3898 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3899 	/*
3900 	 * Don't start the timer if the ncec has been deleted, or if the timer
3901 	 * is already running
3902 	 */
3903 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3904 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3905 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3906 	}
3907 }
3908 
3909 int
3910 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3911     uint16_t flags, nce_t **newnce)
3912 {
3913 	uchar_t		*hw_addr;
3914 	int		err = 0;
3915 	ip_stack_t	*ipst = ill->ill_ipst;
3916 	in6_addr_t	dst6;
3917 	nce_t		*nce;
3918 
3919 	ASSERT(!ill->ill_isv6);
3920 
3921 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3922 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3923 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3924 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3925 		goto done;
3926 	}
3927 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
3928 		/*
3929 		 * For IRE_IF_RESOLVER a hardware mapping can be
3930 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
3931 		 * in the ill is copied in nce_add_v4().
3932 		 */
3933 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3934 		if (hw_addr == NULL) {
3935 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3936 			return (ENOMEM);
3937 		}
3938 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3939 	} else {
3940 		/*
3941 		 * IRE_IF_NORESOLVER type simply copies the resolution
3942 		 * cookie passed in.  So no hw_addr is needed.
3943 		 */
3944 		hw_addr = NULL;
3945 	}
3946 	ASSERT(flags & NCE_F_MCAST);
3947 	ASSERT(flags & NCE_F_NONUD);
3948 	/* nce_state will be computed by nce_add_common() */
3949 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3950 	    ND_UNCHANGED, &nce);
3951 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3952 	if (err == 0)
3953 		err = nce_add_v4_postprocess(nce);
3954 	if (hw_addr != NULL)
3955 		kmem_free(hw_addr, ill->ill_phys_addr_length);
3956 	if (err != 0) {
3957 		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3958 		return (err);
3959 	}
3960 done:
3961 	if (newnce != NULL)
3962 		*newnce = nce;
3963 	else
3964 		nce_refrele(nce);
3965 	return (0);
3966 }
3967 
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
#define	NCE_RESCHED_LIST_LEN	8

typedef struct {
	ill_t	*ncert_ill;	/* ill whose nce list is being scanned */
	uint_t	ncert_num;	/* number of slots in use in ncert_nces[] */
	/* candidates gathered so far; each entry carries a reference */
	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;
3980 
3981 /*
3982  * Pick the longest waiting NCEs for defense.
3983  */
3984 /* ARGSUSED */
3985 static int
3986 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
3987 {
3988 	nce_resched_t *ncert = arg;
3989 	ncec_t **ncecs;
3990 	ncec_t **ncec_max;
3991 	ncec_t *ncec_temp;
3992 	ncec_t *ncec = nce->nce_common;
3993 
3994 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
3995 	/*
3996 	 * Only reachable entries that are ready for announcement are eligible.
3997 	 */
3998 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
3999 		return (0);
4000 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4001 		ncec_refhold(ncec);
4002 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
4003 	} else {
4004 		ncecs = ncert->ncert_nces;
4005 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4006 		ncec_refhold(ncec);
4007 		for (; ncecs < ncec_max; ncecs++) {
4008 			ASSERT(ncec != NULL);
4009 			if ((*ncecs)->ncec_last_time_defended >
4010 			    ncec->ncec_last_time_defended) {
4011 				ncec_temp = *ncecs;
4012 				*ncecs = ncec;
4013 				ncec = ncec_temp;
4014 			}
4015 		}
4016 		ncec_refrele(ncec);
4017 	}
4018 	return (0);
4019 }
4020 
4021 /*
4022  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4023  * doesn't happen very often (if at all), and thus it needn't be highly
4024  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4025  * outer loop is bounded by a constant rather than by the length of the list.)
4026  */
4027 static void
4028 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4029 {
4030 	ncec_t		*ncec;
4031 	ip_stack_t	*ipst = ill->ill_ipst;
4032 	uint_t		i, defend_rate;
4033 
4034 	i = ill->ill_defend_count;
4035 	ill->ill_defend_count = 0;
4036 	if (ill->ill_isv6)
4037 		defend_rate = ipst->ips_ndp_defend_rate;
4038 	else
4039 		defend_rate = ipst->ips_arp_defend_rate;
4040 	/* If none could be sitting around, then don't reschedule */
4041 	if (i < defend_rate) {
4042 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4043 		return;
4044 	}
4045 	ncert->ncert_ill = ill;
4046 	while (ill->ill_defend_count < defend_rate) {
4047 		nce_walk_common(ill, ncec_reschedule, ncert);
4048 		for (i = 0; i < ncert->ncert_num; i++) {
4049 
4050 			ncec = ncert->ncert_nces[i];
4051 			mutex_enter(&ncec->ncec_lock);
4052 			ncec->ncec_flags |= NCE_F_DELAYED;
4053 			mutex_exit(&ncec->ncec_lock);
4054 			/*
4055 			 * we plan to schedule this ncec, so incr the
4056 			 * defend_count in anticipation.
4057 			 */
4058 			if (++ill->ill_defend_count >= defend_rate)
4059 				break;
4060 		}
4061 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4062 			break;
4063 	}
4064 }
4065 
4066 /*
4067  * Check if the current rate-limiting parameters permit the sending
4068  * of another address defense announcement for both IPv4 and IPv6.
4069  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4070  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4071  * determines how many address defense announcements are permitted
4072  * in any `defense_perio' interval.
4073  */
4074 static boolean_t
4075 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4076 {
4077 	clock_t		now = ddi_get_lbolt();
4078 	ip_stack_t	*ipst = ill->ill_ipst;
4079 	clock_t		start = ill->ill_defend_start;
4080 	uint32_t	elapsed, defend_period, defend_rate;
4081 	nce_resched_t	ncert;
4082 	boolean_t	ret;
4083 	int		i;
4084 
4085 	if (ill->ill_isv6) {
4086 		defend_period = ipst->ips_ndp_defend_period;
4087 		defend_rate = ipst->ips_ndp_defend_rate;
4088 	} else {
4089 		defend_period = ipst->ips_arp_defend_period;
4090 		defend_rate = ipst->ips_arp_defend_rate;
4091 	}
4092 	if (defend_rate == 0)
4093 		return (B_TRUE);
4094 	bzero(&ncert, sizeof (ncert));
4095 	mutex_enter(&ill->ill_lock);
4096 	if (start > 0) {
4097 		elapsed = now - start;
4098 		if (elapsed > SEC_TO_TICK(defend_period)) {
4099 			ill->ill_defend_start = now;
4100 			/*
4101 			 * nce_ill_reschedule will attempt to
4102 			 * prevent starvation by reschduling the
4103 			 * oldest entries, which are marked with
4104 			 * the NCE_F_DELAYED flag.
4105 			 */
4106 			nce_ill_reschedule(ill, &ncert);
4107 		}
4108 	} else {
4109 		ill->ill_defend_start = now;
4110 	}
4111 	ASSERT(ill->ill_defend_count <= defend_rate);
4112 	mutex_enter(&ncec->ncec_lock);
4113 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4114 		/*
4115 		 * This ncec was rescheduled as one of the really old
4116 		 * entries needing on-going defense. The
4117 		 * ill_defend_count was already incremented in
4118 		 * nce_ill_reschedule. Go ahead and send the announce.
4119 		 */
4120 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4121 		mutex_exit(&ncec->ncec_lock);
4122 		ret = B_FALSE;
4123 		goto done;
4124 	}
4125 	mutex_exit(&ncec->ncec_lock);
4126 	if (ill->ill_defend_count < defend_rate)
4127 		ill->ill_defend_count++;
4128 	if (ill->ill_defend_count == defend_rate) {
4129 		/*
4130 		 * we are no longer allowed to send unbidden defense
4131 		 * messages. Wait for rescheduling.
4132 		 */
4133 		ret = B_TRUE;
4134 	} else {
4135 		ret = B_FALSE;
4136 	}
4137 done:
4138 	mutex_exit(&ill->ill_lock);
4139 	/*
4140 	 * After all the locks have been dropped we can restart nce timer,
4141 	 * and refrele the delayed ncecs
4142 	 */
4143 	for (i = 0; i < ncert.ncert_num; i++) {
4144 		clock_t	xmit_interval;
4145 		ncec_t	*tmp;
4146 
4147 		tmp = ncert.ncert_nces[i];
4148 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4149 		    B_FALSE);
4150 		nce_restart_timer(tmp, xmit_interval);
4151 		ncec_refrele(tmp);
4152 	}
4153 	return (ret);
4154 }
4155 
4156 boolean_t
4157 ndp_announce(ncec_t *ncec)
4158 {
4159 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4160 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4161 	    nce_advert_flags(ncec)));
4162 }
4163 
/*
 * Select the source address, and the ill that goes with it, for a message
 * sent on behalf of `ncec'.
 *
 * `src' must be non-NULL and point at an unspecified address on entry.
 *
 * - For one of our own addresses (NCE_MYADDR) the ncec address itself is the
 *   candidate source.
 * - Otherwise the source of the first packet queued on the ncec, if any, is
 *   the candidate.
 * - If the candidate is not assigned to an ipif, and this is not one of our
 *   own addresses, a source is chosen with ipif_select_source_v4/v6 on the
 *   ncec's ill, falling back on an IPMP interface to the ill returned by
 *   ipmp_ill_get_xmit_ill().
 *
 * On the normal exit path *src is set to the chosen address (which may still
 * be unspecified) and the return value is the ill associated with the
 * selected ipif with a reference taken on it, or NULL if no ipif was
 * selected.  NULL is also returned, with *src untouched, when the matching
 * ipif has not completed DAD yet (and we are not doing DAD ourselves) or
 * when no source ipif can be found at all.
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t		*mp;
	in6_addr_t	src6;
	ipaddr_t	src4;
	ill_t		*ill = ncec->ncec_ill;
	ill_t		*src_ill = NULL;
	ipif_t		*ipif = NULL;
	boolean_t	is_myaddr = NCE_MYADDR(ncec);
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t  *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		/*
		 * Note: for IPv4, src4 has been set on every path that
		 * reaches this point (either is_myaddr or a queued packet
		 * supplied the address).
		 */
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6]  below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface.  We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/*
		 * Nothing usable on the IPMP ill itself: retry the selection
		 * on the ill handed back by ipmp_ill_get_xmit_ill().
		 */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/*
		 * Hand back a held ill and drop the ipif reference.  For an
		 * IPMP ill the ill comes from ipmp_ipif_hold_bound_ill()
		 * (presumably the ill the ipif is bound to, already held --
		 * confirm against that function).
		 */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}
4306 
4307 void
4308 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4309     uchar_t *hwaddr, int hwaddr_len, int flags)
4310 {
4311 	ill_t	*ill;
4312 	ncec_t	*ncec;
4313 	nce_t	*nce;
4314 	uint16_t new_state;
4315 
4316 	ill = (ipif ? ipif->ipif_ill : NULL);
4317 	if (ill != NULL) {
4318 		/*
4319 		 * only one ncec is possible
4320 		 */
4321 		nce = nce_lookup_v4(ill, addr);
4322 		if (nce != NULL) {
4323 			ncec = nce->nce_common;
4324 			mutex_enter(&ncec->ncec_lock);
4325 			if (NCE_ISREACHABLE(ncec))
4326 				new_state = ND_UNCHANGED;
4327 			else
4328 				new_state = ND_STALE;
4329 			ncec->ncec_flags = flags;
4330 			nce_update(ncec, new_state, hwaddr);
4331 			mutex_exit(&ncec->ncec_lock);
4332 			nce_refrele(nce);
4333 			return;
4334 		}
4335 	} else {
4336 		/*
4337 		 * ill is wildcard; clean up all ncec's and ire's
4338 		 * that match on addr.
4339 		 */
4340 		nce_hw_map_t hwm;
4341 
4342 		hwm.hwm_addr = *addr;
4343 		hwm.hwm_hwlen = hwaddr_len;
4344 		hwm.hwm_hwaddr = hwaddr;
4345 		hwm.hwm_flags = flags;
4346 
4347 		ncec_walk_common(ipst->ips_ndp4, NULL,
4348 		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4349 	}
4350 }
4351 
4352 /*
4353  * Common function to add ncec entries.
4354  * we always add the ncec with ncec_ill == ill, and always create
4355  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4356  * ncec is !reachable.
4357  *
4358  * When the caller passes in an nce_state of ND_UNCHANGED,
4359  * nce_add_common() will determine the state of the created nce based
4360  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4361  * be created with state set to the passed in nce_state.
4362  */
4363 static int
4364 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4365     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4366 {
4367 	static	ncec_t		nce_nil;
4368 	uchar_t			*template = NULL;
4369 	int			err;
4370 	ncec_t			*ncec;
4371 	ncec_t			**ncep;
4372 	ip_stack_t		*ipst = ill->ill_ipst;
4373 	uint16_t		state;
4374 	boolean_t		fastprobe = B_FALSE;
4375 	struct ndp_g_s		*ndp;
4376 	nce_t			*nce = NULL;
4377 	mblk_t			*dlur_mp = NULL;
4378 
4379 	if (ill->ill_isv6)
4380 		ndp = ill->ill_ipst->ips_ndp6;
4381 	else
4382 		ndp = ill->ill_ipst->ips_ndp4;
4383 
4384 	*retnce = NULL;
4385 
4386 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4387 
4388 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4389 		ip0dbg(("nce_add_common: no addr\n"));
4390 		return (EINVAL);
4391 	}
4392 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4393 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4394 		return (EINVAL);
4395 	}
4396 
4397 	if (ill->ill_isv6) {
4398 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4399 	} else {
4400 		ipaddr_t v4addr;
4401 
4402 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4403 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4404 	}
4405 
4406 	/*
4407 	 * The caller has ensured that there is no nce on ill, but there could
4408 	 * still be an nce_common_t for the address, so that we find exisiting
4409 	 * ncec_t strucutures first, and atomically add a new nce_t if
4410 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4411 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4412 	 * compare for matches across the illgrp because this function is
4413 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4414 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4415 	 * appropriate.
4416 	 */
4417 	ncec = *ncep;
4418 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4419 		if (ncec->ncec_ill == ill) {
4420 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4421 				/*
4422 				 * We should never find *retnce to be
4423 				 * MYADDR, since the caller may then
4424 				 * incorrectly restart a DAD timer that's
4425 				 * already running.  However, if we are in
4426 				 * forwarding mode, and the interface is
4427 				 * moving in/out of groups, the data
4428 				 * path ire lookup (e.g., ire_revalidate_nce)
4429 				 * may  have determined that some destination
4430 				 * is offlink while the control path is adding
4431 				 * that address as a local address.
4432 				 * Recover from  this case by failing the
4433 				 * lookup
4434 				 */
4435 				if (NCE_MYADDR(ncec))
4436 					return (ENXIO);
4437 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4438 				if (*retnce != NULL)
4439 					break;
4440 			}
4441 		}
4442 	}
4443 	if (*retnce != NULL) /* caller must trigger fastpath on nce */
4444 		return (0);
4445 
4446 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4447 	if (ncec == NULL)
4448 		return (ENOMEM);
4449 	*ncec = nce_nil;
4450 	ncec->ncec_ill = ill;
4451 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4452 	ncec->ncec_flags = flags;
4453 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4454 
4455 	if (!ill->ill_isv6) {
4456 		ipaddr_t addr4;
4457 
4458 		/*
4459 		 * DAD probe interval and probe count are set based on
4460 		 * fast/slow probe settings. If the underlying link doesn't
4461 		 * have reliably up/down notifications or if we're working
4462 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
4463 		 * don't use the fast timers.  Otherwise, use them.
4464 		 */
4465 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4466 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4467 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4))
4468 			fastprobe = B_TRUE;
4469 		if (fastprobe) {
4470 			ncec->ncec_xmit_interval =
4471 			    ipst->ips_arp_fastprobe_interval;
4472 			ncec->ncec_pcnt =
4473 			    ipst->ips_arp_fastprobe_count;
4474 			ncec->ncec_flags |= NCE_F_FAST;
4475 		} else {
4476 			ncec->ncec_xmit_interval =
4477 			    ipst->ips_arp_probe_interval;
4478 			ncec->ncec_pcnt =
4479 			    ipst->ips_arp_probe_count;
4480 		}
4481 		if (NCE_PUBLISH(ncec)) {
4482 			ncec->ncec_unsolicit_count =
4483 			    ipst->ips_ip_arp_publish_count;
4484 		}
4485 	} else {
4486 		/*
4487 		 * probe interval is constant: ILL_PROBE_INTERVAL
4488 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4489 		 */
4490 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4491 		if (NCE_PUBLISH(ncec)) {
4492 			ncec->ncec_unsolicit_count =
4493 			    ipst->ips_ip_ndp_unsolicit_count;
4494 		}
4495 	}
4496 	ncec->ncec_rcnt = ill->ill_xmit_count;
4497 	ncec->ncec_addr = *addr;
4498 	ncec->ncec_qd_mp = NULL;
4499 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4500 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4501 	ncec->ncec_trace_disable = B_FALSE;
4502 
4503 	/*
4504 	 * ncec_lladdr holds link layer address
4505 	 */
4506 	if (hw_addr_len > 0) {
4507 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4508 		if (template == NULL) {
4509 			err = ENOMEM;
4510 			goto err_ret;
4511 		}
4512 		ncec->ncec_lladdr = template;
4513 		ncec->ncec_lladdr_length = hw_addr_len;
4514 		bzero(ncec->ncec_lladdr, hw_addr_len);
4515 	}
4516 	if ((flags & NCE_F_BCAST) != 0) {
4517 		state = ND_REACHABLE;
4518 		ASSERT(hw_addr_len > 0);
4519 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4520 		state = ND_INITIAL;
4521 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4522 		/*
4523 		 * NORESOLVER entries are always created in the REACHABLE
4524 		 * state.
4525 		 */
4526 		state = ND_REACHABLE;
4527 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4528 		    ill->ill_mactype != DL_IPV4 &&
4529 		    ill->ill_mactype != DL_6TO4) {
4530 			/*
4531 			 * We create a nce_res_mp with the IP nexthop address
4532 			 * as the destination address if the physical length
4533 			 * is exactly 4 bytes for point-to-multipoint links
4534 			 * that do their own resolution from IP to link-layer
4535 			 * address (e.g. IP over X.25).
4536 			 */
4537 			bcopy((uchar_t *)addr,
4538 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4539 		}
4540 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4541 		    ill->ill_mactype != DL_IPV6) {
4542 			/*
4543 			 * We create a nce_res_mp with the IP nexthop address
4544 			 * as the destination address if the physical legnth
4545 			 * is exactly 16 bytes for point-to-multipoint links
4546 			 * that do their own resolution from IP to link-layer
4547 			 * address.
4548 			 */
4549 			bcopy((uchar_t *)addr,
4550 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4551 		}
4552 		/*
4553 		 * Since NUD is not part of the base IPv4 protocol definition,
4554 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4555 		 * age, and are marked NCE_F_NONUD.
4556 		 */
4557 		if (!ill->ill_isv6)
4558 			ncec->ncec_flags |= NCE_F_NONUD;
4559 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4560 		state = ND_REACHABLE;
4561 	}
4562 
4563 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4564 		/*
4565 		 * We are adding an ncec with a deterministic hw_addr,
4566 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4567 		 *
4568 		 * if we are adding a unicast ncec for the local address
4569 		 * it would be REACHABLE; we would be adding a ND_STALE entry
4570 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4571 		 * addresses are added in PROBE to trigger DAD.
4572 		 */
4573 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4574 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4575 			state = ND_REACHABLE;
4576 		else if (!NCE_PUBLISH(ncec))
4577 			state = ND_STALE;
4578 		else
4579 			state = ND_PROBE;
4580 		if (hw_addr != NULL)
4581 			nce_set_ll(ncec, hw_addr);
4582 	}
4583 	/* caller overrides internally computed state */
4584 	if (nce_state != ND_UNCHANGED)
4585 		state = nce_state;
4586 
4587 	if (state == ND_PROBE)
4588 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4589 
4590 	ncec->ncec_state = state;
4591 
4592 	if (state == ND_REACHABLE) {
4593 		ncec->ncec_last = ncec->ncec_init_time =
4594 		    TICK_TO_MSEC(ddi_get_lbolt64());
4595 	} else {
4596 		ncec->ncec_last = 0;
4597 		if (state == ND_INITIAL)
4598 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4599 	}
4600 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4601 	    offsetof(ncec_cb_t, ncec_cb_node));
4602 	/*
4603 	 * have all the memory allocations out of the way before taking locks
4604 	 * and adding the nce.
4605 	 */
4606 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4607 	if (nce == NULL) {
4608 		err = ENOMEM;
4609 		goto err_ret;
4610 	}
4611 	if (ncec->ncec_lladdr != NULL ||
4612 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4613 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4614 		    ill->ill_phys_addr_length, ill->ill_sap,
4615 		    ill->ill_sap_length);
4616 		if (dlur_mp == NULL) {
4617 			err = ENOMEM;
4618 			goto err_ret;
4619 		}
4620 	}
4621 
4622 	/*
4623 	 * Atomically ensure that the ill is not CONDEMNED, before
4624 	 * adding the NCE.
4625 	 */
4626 	mutex_enter(&ill->ill_lock);
4627 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4628 		mutex_exit(&ill->ill_lock);
4629 		err = EINVAL;
4630 		goto err_ret;
4631 	}
4632 	if (!NCE_MYADDR(ncec) &&
4633 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4634 		mutex_exit(&ill->ill_lock);
4635 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4636 		err = EINVAL;
4637 		goto err_ret;
4638 	}
4639 	/*
4640 	 * Acquire the ncec_lock even before adding the ncec to the list
4641 	 * so that it cannot get deleted after the ncec is added, but
4642 	 * before we add the nce.
4643 	 */
4644 	mutex_enter(&ncec->ncec_lock);
4645 	if ((ncec->ncec_next = *ncep) != NULL)
4646 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4647 	*ncep = ncec;
4648 	ncec->ncec_ptpn = ncep;
4649 
4650 	/* Bump up the number of ncec's referencing this ill */
4651 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4652 	    (char *), "ncec", (void *), ncec);
4653 	ill->ill_ncec_cnt++;
4654 	/*
4655 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4656 	 * condemned, and we can safely add the nce.
4657 	 */
4658 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4659 	mutex_exit(&ncec->ncec_lock);
4660 	mutex_exit(&ill->ill_lock);
4661 
4662 	/* caller must trigger fastpath on *retnce */
4663 	return (0);
4664 
4665 err_ret:
4666 	if (ncec != NULL)
4667 		kmem_cache_free(ncec_cache, ncec);
4668 	if (nce != NULL)
4669 		kmem_cache_free(nce_cache, nce);
4670 	freemsg(dlur_mp);
4671 	if (template != NULL)
4672 		kmem_free(template, ill->ill_phys_addr_length);
4673 	return (err);
4674 }
4675 
4676 /*
4677  * take a ref on the nce
4678  */
4679 void
4680 nce_refhold(nce_t *nce)
4681 {
4682 	mutex_enter(&nce->nce_lock);
4683 	nce->nce_refcnt++;
4684 	ASSERT((nce)->nce_refcnt != 0);
4685 	mutex_exit(&nce->nce_lock);
4686 }
4687 
4688 /*
4689  * release a ref on the nce; In general, this
4690  * cannot be called with locks held because nce_inactive
4691  * may result in nce_inactive which will take the ill_lock,
4692  * do ipif_ill_refrele_tail etc. Thus the one exception
4693  * where this can be called with locks held is when the caller
4694  * is certain that the nce_refcnt is sufficient to prevent
4695  * the invocation of nce_inactive.
4696  */
4697 void
4698 nce_refrele(nce_t *nce)
4699 {
4700 	ASSERT((nce)->nce_refcnt != 0);
4701 	mutex_enter(&nce->nce_lock);
4702 	if (--nce->nce_refcnt == 0)
4703 		nce_inactive(nce); /* destroys the mutex */
4704 	else
4705 		mutex_exit(&nce->nce_lock);
4706 }
4707 
4708 /*
4709  * free the nce after all refs have gone away.
4710  */
4711 static void
4712 nce_inactive(nce_t *nce)
4713 {
4714 	ill_t *ill = nce->nce_ill;
4715 
4716 	ASSERT(nce->nce_refcnt == 0);
4717 
4718 	ncec_refrele_notr(nce->nce_common);
4719 	nce->nce_common = NULL;
4720 	freemsg(nce->nce_fp_mp);
4721 	freemsg(nce->nce_dlur_mp);
4722 
4723 	mutex_enter(&ill->ill_lock);
4724 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4725 	    (char *), "nce", (void *), nce);
4726 	ill->ill_nce_cnt--;
4727 	nce->nce_ill = NULL;
4728 	/*
4729 	 * If the number of ncec's associated with this ill have dropped
4730 	 * to zero, check whether we need to restart any operation that
4731 	 * is waiting for this to happen.
4732 	 */
4733 	if (ILL_DOWN_OK(ill)) {
4734 		/* ipif_ill_refrele_tail drops the ill_lock */
4735 		ipif_ill_refrele_tail(ill);
4736 	} else {
4737 		mutex_exit(&ill->ill_lock);
4738 	}
4739 
4740 	mutex_destroy(&nce->nce_lock);
4741 	kmem_cache_free(nce_cache, nce);
4742 }
4743 
4744 /*
4745  * Add an nce to the ill_nce list.
4746  */
4747 static nce_t *
4748 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4749 {
4750 	bzero(nce, sizeof (*nce));
4751 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4752 	nce->nce_common = ncec;
4753 	nce->nce_addr = ncec->ncec_addr;
4754 	nce->nce_ill = ill;
4755 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4756 	    (char *), "nce", (void *), nce);
4757 	ill->ill_nce_cnt++;
4758 
4759 	nce->nce_refcnt = 1; /* for the thread */
4760 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4761 	nce->nce_dlur_mp = dlur_mp;
4762 
4763 	/* add nce to the ill's fastpath list.  */
4764 	nce->nce_refcnt++; /* for the list */
4765 	list_insert_head(&ill->ill_nce, nce);
4766 	return (nce);
4767 }
4768 
4769 static nce_t *
4770 nce_add(ill_t *ill, ncec_t *ncec)
4771 {
4772 	nce_t	*nce;
4773 	mblk_t	*dlur_mp = NULL;
4774 
4775 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4776 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4777 
4778 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4779 	if (nce == NULL)
4780 		return (NULL);
4781 	if (ncec->ncec_lladdr != NULL ||
4782 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4783 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4784 		    ill->ill_phys_addr_length, ill->ill_sap,
4785 		    ill->ill_sap_length);
4786 		if (dlur_mp == NULL) {
4787 			kmem_cache_free(nce_cache, nce);
4788 			return (NULL);
4789 		}
4790 	}
4791 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
4792 }
4793 
4794 /*
4795  * remove the nce from the ill_faspath list
4796  */
4797 void
4798 nce_delete(nce_t *nce)
4799 {
4800 	ill_t	*ill = nce->nce_ill;
4801 
4802 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4803 
4804 	mutex_enter(&nce->nce_lock);
4805 	if (nce->nce_is_condemned) {
4806 		/*
4807 		 * some other thread has removed this nce from the ill_nce list
4808 		 */
4809 		mutex_exit(&nce->nce_lock);
4810 		return;
4811 	}
4812 	nce->nce_is_condemned = B_TRUE;
4813 	mutex_exit(&nce->nce_lock);
4814 
4815 	list_remove(&ill->ill_nce, nce);
4816 	/*
4817 	 * even though we are holding the ill_lock, it is ok to
4818 	 * call nce_refrele here because we know that we should have
4819 	 * at least 2 refs on the nce: one for the thread, and one
4820 	 * for the list. The refrele below will release the one for
4821 	 * the list.
4822 	 */
4823 	nce_refrele(nce);
4824 }
4825 
4826 nce_t *
4827 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4828 {
4829 	nce_t *nce = NULL;
4830 
4831 	ASSERT(ill != NULL);
4832 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4833 
4834 	for (nce = list_head(&ill->ill_nce); nce != NULL;
4835 	    nce = list_next(&ill->ill_nce, nce)) {
4836 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4837 			break;
4838 	}
4839 
4840 	/*
4841 	 * if we found the nce on the ill_nce list while holding
4842 	 * the ill_lock, then it cannot be condemned yet.
4843 	 */
4844 	if (nce != NULL) {
4845 		ASSERT(!nce->nce_is_condemned);
4846 		nce_refhold(nce);
4847 	}
4848 	return (nce);
4849 }
4850 
4851 /*
4852  * Walk the ill_nce list on ill. The callback function func() cannot perform
4853  * any destructive actions.
4854  */
4855 static void
4856 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4857 {
4858 	nce_t *nce = NULL, *nce_next;
4859 
4860 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4861 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4862 		nce_next = list_next(&ill->ill_nce, nce);
4863 		if (func(ill, nce, arg) != 0)
4864 			break;
4865 		nce = nce_next;
4866 	}
4867 }
4868 
/*
 * Locked wrapper around nce_walk_common(): take the ill_lock, apply func()
 * to the entries on the ill_nce list of `ill' (stopping early if func()
 * returns non-zero), then drop the lock again.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}
4876 
4877 void
4878 nce_flush(ill_t *ill, boolean_t flushall)
4879 {
4880 	nce_t *nce, *nce_next;
4881 	list_t dead;
4882 
4883 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4884 	mutex_enter(&ill->ill_lock);
4885 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4886 		nce_next = list_next(&ill->ill_nce, nce);
4887 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4888 			nce = nce_next;
4889 			continue;
4890 		}
4891 		/*
4892 		 * nce_delete requires that the caller should either not
4893 		 * be holding locks, or should hold a ref to ensure that
4894 		 * we wont hit ncec_inactive. So take a ref and clean up
4895 		 * after the list is flushed.
4896 		 */
4897 		nce_refhold(nce);
4898 		nce_delete(nce);
4899 		list_insert_tail(&dead, nce);
4900 		nce = nce_next;
4901 	}
4902 	mutex_exit(&ill->ill_lock);
4903 	while ((nce = list_head(&dead)) != NULL) {
4904 		list_remove(&dead, nce);
4905 		nce_refrele(nce);
4906 	}
4907 	ASSERT(list_is_empty(&dead));
4908 	list_destroy(&dead);
4909 }
4910 
4911 /* Return an interval that is anywhere in the [1 .. intv] range */
4912 static clock_t
4913 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4914 {
4915 	clock_t rnd, frac;
4916 
4917 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4918 	/* Note that clock_t is signed; must chop off bits */
4919 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4920 	if (initial_time) {
4921 		if (intv <= 0)
4922 			intv = 1;
4923 		else
4924 			intv = (rnd % intv) + 1;
4925 	} else {
4926 		/* Compute 'frac' as 20% of the configured interval */
4927 		if ((frac = intv / 5) <= 1)
4928 			frac = 2;
4929 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
4930 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4931 		intv = 1;
4932 	}
4933 	return (intv);
4934 }
4935 
4936 void
4937 nce_resolv_ipmp_ok(ncec_t *ncec)
4938 {
4939 	mblk_t *mp;
4940 	uint_t pkt_len;
4941 	iaflags_t ixaflags = IXAF_NO_TRACE;
4942 	nce_t *under_nce;
4943 	ill_t	*ill = ncec->ncec_ill;
4944 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4945 	ipif_t *src_ipif = NULL;
4946 	ip_stack_t *ipst = ill->ill_ipst;
4947 	ill_t *send_ill;
4948 	uint_t nprobes;
4949 
4950 	ASSERT(IS_IPMP(ill));
4951 
4952 	mutex_enter(&ncec->ncec_lock);
4953 	nprobes = ncec->ncec_nprobes;
4954 	mp = ncec->ncec_qd_mp;
4955 	ncec->ncec_qd_mp = NULL;
4956 	ncec->ncec_nprobes = 0;
4957 	mutex_exit(&ncec->ncec_lock);
4958 
4959 	while (mp != NULL) {
4960 		mblk_t *nxt_mp;
4961 
4962 		nxt_mp = mp->b_next;
4963 		mp->b_next = NULL;
4964 		if (isv6) {
4965 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4966 
4967 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4968 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4969 			    ill, ALL_ZONES, ipst);
4970 		} else {
4971 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
4972 
4973 			ixaflags |= IXAF_IS_IPV4;
4974 			pkt_len = ntohs(ipha->ipha_length);
4975 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
4976 			    ill, ALL_ZONES, ipst);
4977 		}
4978 
4979 		/*
4980 		 * find a new nce based on an under_ill. The first IPMP probe
4981 		 * packet gets queued, so we could still find a src_ipif that
4982 		 * matches an IPMP test address.
4983 		 */
4984 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
4985 			/*
4986 			 * if src_ipif is null, this could be either a
4987 			 * forwarded packet or a probe whose src got deleted.
4988 			 * We identify the former case by looking for the
4989 			 * ncec_nprobes: the first ncec_nprobes packets are
4990 			 * probes;
4991 			 */
4992 			if (src_ipif == NULL && nprobes > 0)
4993 				goto drop_pkt;
4994 
4995 			/*
4996 			 * For forwarded packets, we use the ipmp rotor
4997 			 * to find send_ill.
4998 			 */
4999 			send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
5000 			    B_TRUE);
5001 		} else {
5002 			send_ill = src_ipif->ipif_ill;
5003 			ill_refhold(send_ill);
5004 		}
5005 
5006 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5007 		    (ncec_t *), ncec, (ipif_t *),
5008 		    src_ipif, (ill_t *), send_ill);
5009 
5010 		if (send_ill == NULL) {
5011 			if (src_ipif != NULL)
5012 				ipif_refrele(src_ipif);
5013 			goto drop_pkt;
5014 		}
5015 		/* create an under_nce on send_ill */
5016 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5017 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5018 			under_nce = nce_fastpath_create(send_ill, ncec);
5019 		else
5020 			under_nce = NULL;
5021 		rw_exit(&ipst->ips_ill_g_lock);
5022 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5023 			nce_fastpath_trigger(under_nce);
5024 
5025 		ill_refrele(send_ill);
5026 		if (src_ipif != NULL)
5027 			ipif_refrele(src_ipif);
5028 
5029 		if (under_nce != NULL) {
5030 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5031 			    ALL_ZONES, 0, NULL);
5032 			nce_refrele(under_nce);
5033 			if (nprobes > 0)
5034 				nprobes--;
5035 			mp = nxt_mp;
5036 			continue;
5037 		}
5038 drop_pkt:
5039 		if (isv6) {
5040 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5041 		} else {
5042 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5043 		}
5044 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5045 		freemsg(mp);
5046 		if (nprobes > 0)
5047 			nprobes--;
5048 		mp = nxt_mp;
5049 	}
5050 	ncec_cb_dispatch(ncec); /* complete callbacks */
5051 }
5052