xref: /titanic_51/usr/src/uts/common/inet/ip/ip_ndp.c (revision f4ce81cfdad23bacfdb147be77d8d5fbe7673847)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 #include <sys/mac.h>
44 
45 #include <net/if.h>
46 #include <net/if_types.h>
47 #include <net/if_dl.h>
48 #include <net/route.h>
49 #include <netinet/in.h>
50 #include <netinet/ip6.h>
51 #include <netinet/icmp6.h>
52 
53 #include <inet/common.h>
54 #include <inet/mi.h>
55 #include <inet/mib2.h>
56 #include <inet/nd.h>
57 #include <inet/ip.h>
58 #include <inet/ip_impl.h>
59 #include <inet/ipclassifier.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_rts.h>
63 #include <inet/ip6.h>
64 #include <inet/ip_ndp.h>
65 #include <inet/sctp_ip.h>
66 #include <inet/ip_arp.h>
67 #include <inet/ip2mac_impl.h>
68 
69 #define	ANNOUNCE_INTERVAL(isv6) \
70 	(isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
71 	ipst->ips_ip_arp_publish_interval)
72 
73 #define	DEFENSE_INTERVAL(isv6) \
74 	(isv6 ? ipst->ips_ndp_defend_interval : \
75 	ipst->ips_arp_defend_interval)
76 
77 /* Non-tunable probe interval, based on link capabilities */
78 #define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
79 
80 /*
81  * The IPv4 Link Local address space is special; we do extra duplicate checking
82  * there, as the entire assignment mechanism rests on random numbers.
83  */
84 #define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)ptr)[0] == 169 && \
85 				((uchar_t *)ptr)[1] == 254)
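
/*
 * For example, IS_IPV4_LL_SPACE() is true for an address such as
 * 169.254.12.34 and false for 10.1.2.3; only the 169.254/16 prefix is
 * subject to the extra duplicate checking described above.
 */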
86 
87 /*
88  * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
89  * in to the ncec*add* functions.
90  *
91  * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
92  * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
93  * that we will respond to requests for the protocol address.
94  */
95 #define	NCE_EXTERNAL_FLAGS_MASK \
96 	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
97 	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
98 	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
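
/*
 * As an illustration (not an exhaustive list): a publisher of one of our
 * own addresses is typically added with something like
 * (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY), while a multicast
 * mapping uses (NCE_F_MCAST | NCE_F_NONUD); all of these flags fall
 * within NCE_EXTERNAL_FLAGS_MASK.
 */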
99 
100 /*
101  * Function names with nce_ prefix are static while function
102  * names with ndp_ prefix are used by the rest of IP.
103  *
104  * Lock ordering:
105  *
106  *	ndp_g_lock -> ill_lock -> ncec_lock
107  *
108  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
109  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
110  * ncec_refcnt).
111  */
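
/*
 * For example (a sketch only; see the actual callers below), a thread
 * that needs all three locks acquires and releases them in this order:
 *
 *	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
 *	mutex_enter(&ill->ill_lock);
 *	mutex_enter(&ncec->ncec_lock);
 *	...
 *	mutex_exit(&ncec->ncec_lock);
 *	mutex_exit(&ill->ill_lock);
 *	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
 */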
112 
113 static	void	nce_cleanup_list(ncec_t *ncec);
114 static	void 	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
115 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
116     ncec_t *);
117 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
118 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
119     uint16_t ncec_flags, nce_t **newnce);
120 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
121     uint16_t ncec_flags, nce_t **newnce);
122 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
123     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
124     const in6_addr_t *target, int flag);
125 static void	ncec_refhold_locked(ncec_t *);
126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
127 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
128 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
129     uint16_t, uint16_t, nce_t **);
130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
131 static nce_t *nce_add(ill_t *, ncec_t *);
132 static void nce_inactive(nce_t *);
133 extern nce_t 	*nce_lookup(ill_t *, const in6_addr_t *);
134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
135 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
136     uint16_t, uint16_t, nce_t **);
137 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
138     uint16_t, uint16_t, nce_t **);
139 static int  nce_add_v6_postprocess(nce_t *);
140 static int  nce_add_v4_postprocess(nce_t *);
141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
142 static clock_t nce_fuzz_interval(clock_t, boolean_t);
143 static void nce_resolv_ipmp_ok(ncec_t *);
144 static void nce_walk_common(ill_t *, pfi_t, void *);
145 static void nce_start_timer(ncec_t *, uint_t);
146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
147 static void nce_fastpath_trigger(nce_t *);
148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
149 
150 #ifdef DEBUG
151 static void	ncec_trace_cleanup(const ncec_t *);
152 #endif
153 
154 #define	NCE_HASH_PTR_V4(ipst, addr)					\
155 	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
156 
157 #define	NCE_HASH_PTR_V6(ipst, addr)				 \
158 	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
159 		NCE_TABLE_SIZE)]))
160 
161 extern kmem_cache_t *ncec_cache;
162 extern kmem_cache_t *nce_cache;
163 
164 /*
164  * Send out an IPv6 (unicast) or IPv4 (broadcast) DAD probe.
166  * If src_ill is not null, the ncec_addr is bound to src_ill. The
167  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
168  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
169  * IPMP cast_ill (in the IPMP case).
170  *
171  * Note that the probe interval is based on ncec->ncec_ill which
172  * may be the ipmp_ill.
173  */
174 static void
175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
176 {
177 	boolean_t dropped;
178 	uint32_t probe_interval;
179 
180 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
181 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
182 	if (ncec->ncec_ipversion == IPV6_VERSION) {
183 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
184 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
185 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
186 		probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill);
187 	} else {
188 		/* For IPv4 DAD, delay the initial probe. */
189 		if (send_probe)
190 			dropped = arp_probe(ncec);
191 		else
192 			dropped = B_TRUE;
193 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
194 		    !send_probe);
195 	}
196 	if (!dropped) {
197 		mutex_enter(&ncec->ncec_lock);
198 		ncec->ncec_pcnt--;
199 		mutex_exit(&ncec->ncec_lock);
200 	}
201 	nce_restart_timer(ncec, probe_interval);
202 }
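
/*
 * For example, per ILL_PROBE_INTERVAL() above: on a link that supports
 * link-state notifications (ill_note_link != 0) the IPv6 probes are sent
 * 150 ms apart, otherwise 1500 ms apart; the IPv4 probe interval is
 * ncec_xmit_interval, fuzzed by nce_fuzz_interval().
 */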
203 
204 /*
205  * Compute default flags to use for an advertisement of this ncec's address.
206  */
207 static int
208 nce_advert_flags(const ncec_t *ncec)
209 {
210 	int flag = 0;
211 
212 	if (ncec->ncec_flags & NCE_F_ISROUTER)
213 		flag |= NDP_ISROUTER;
214 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
215 		flag |= NDP_ORIDE;
216 
217 	return (flag);
218 }
219 
220 /*
221  * NDP Cache Entry creation routine.
222  * Mapped entries will never do NUD.
223  * This routine must always be called with ndp6->ndp_g_lock held.
224  */
225 int
226 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
227     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
228 {
229 	int		err;
230 	nce_t		*nce;
231 
232 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
233 	ASSERT(ill != NULL && ill->ill_isv6);
234 
235 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
236 	    &nce);
237 	if (err != 0)
238 		return (err);
239 	ASSERT(newnce != NULL);
240 	*newnce = nce;
241 	return (err);
242 }
243 
244 /*
245  * Post-processing routine to be executed after nce_add_v6(). This function
246  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
247  * and must be called without any locks held.
248  */
249 int
250 nce_add_v6_postprocess(nce_t *nce)
251 {
252 	ncec_t		*ncec = nce->nce_common;
253 	boolean_t	dropped = B_FALSE;
254 	uchar_t		*hw_addr = ncec->ncec_lladdr;
255 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
256 	ill_t		*ill = ncec->ncec_ill;
257 	int		err = 0;
258 	uint16_t	flags = ncec->ncec_flags;
259 	ip_stack_t	*ipst = ill->ill_ipst;
260 	boolean_t	trigger_fastpath = B_TRUE;
261 
262 	/*
263 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
264 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
265 	 * We also call nce_fastpath from nce_update if the link layer
266 	 * address of the peer changes.
267 	 */
268 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
269 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
270 		trigger_fastpath = B_FALSE;
271 
272 	if (trigger_fastpath)
273 		nce_fastpath_trigger(nce);
274 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
275 		ill_t *hwaddr_ill;
276 		/*
277 		 * Unicast entry that needs DAD.
278 		 */
279 		if (IS_IPMP(ill)) {
280 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
281 			    hw_addr, hw_addr_len);
282 		} else {
283 			hwaddr_ill = ill;
284 		}
285 		nce_dad(ncec, hwaddr_ill, B_TRUE);
286 		err = EINPROGRESS;
287 	} else if (flags & NCE_F_UNSOL_ADV) {
288 		/*
289 		 * We account for the transmit below by assigning one
290 		 * less than the ndd variable. Subsequent decrements
291 		 * are done in nce_timer.
292 		 */
293 		mutex_enter(&ncec->ncec_lock);
294 		ncec->ncec_unsolicit_count =
295 		    ipst->ips_ip_ndp_unsolicit_count - 1;
296 		mutex_exit(&ncec->ncec_lock);
297 		dropped = ndp_xmit(ill,
298 		    ND_NEIGHBOR_ADVERT,
299 		    hw_addr,
300 		    hw_addr_len,
301 		    &ncec->ncec_addr,	/* Source and target of the adv */
302 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
303 		    nce_advert_flags(ncec));
304 		mutex_enter(&ncec->ncec_lock);
305 		if (dropped)
306 			ncec->ncec_unsolicit_count++;
307 		else
308 			ncec->ncec_last_time_defended = ddi_get_lbolt();
309 		if (ncec->ncec_unsolicit_count != 0) {
310 			nce_start_timer(ncec,
311 			    ipst->ips_ip_ndp_unsolicit_interval);
312 		}
313 		mutex_exit(&ncec->ncec_lock);
314 	}
315 	return (err);
316 }
317 
318 /*
319  * Atomically lookup and add (if needed) Neighbor Cache information for
320  * an address.
321  *
322  * IPMP notes: ncec entries for non-local (i.e., !NCE_MYADDR(ncec))
323  * addresses are always added pointing at the ipmp_ill. Thus, when the ill
324  * passed to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)), two
325  * nce_t entries will be created, both pointing at the same ncec_t. The
326  * nce_t entries will have their nce_ill set to the ipmp_ill and the
327  * under_ill respectively; the ncec_t's ncec_ill points at the ipmp_ill.
328  * Local addresses are always created on the ill passed to nce_add_v6.
329  */
330 int
331 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
332     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
333 {
334 	int		err = 0;
335 	ip_stack_t	*ipst = ill->ill_ipst;
336 	nce_t		*nce, *upper_nce = NULL;
337 	ill_t		*in_ill = ill;
338 	boolean_t	need_ill_refrele = B_FALSE;
339 
340 	if (flags & NCE_F_MCAST) {
341 		/*
342 		 * hw_addr will be figured out in nce_set_multicast_v6;
343 		 * caller has to select the cast_ill
344 		 */
345 		ASSERT(hw_addr == NULL);
346 		ASSERT(!IS_IPMP(ill));
347 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
348 		return (err);
349 	}
350 	ASSERT(ill->ill_isv6);
351 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
352 		ill = ipmp_ill_hold_ipmp_ill(ill);
353 		if (ill == NULL)
354 			return (ENXIO);
355 		need_ill_refrele = B_TRUE;
356 	}
357 
358 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
359 	nce = nce_lookup_addr(ill, addr);
360 	if (nce == NULL) {
361 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
362 		    &nce);
363 	} else {
364 		err = EEXIST;
365 	}
366 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
367 	if (err == 0)
368 		err = nce_add_v6_postprocess(nce);
369 	if (in_ill != ill && nce != NULL) {
370 		nce_t *under_nce;
371 
372 		/*
373 		 * in_ill was the under_ill. Try to create the under_nce.
374 		 * Hold the ill_g_lock to prevent changes to group membership
375 		 * until we are done.
376 		 */
377 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
378 		if (IS_IN_SAME_ILLGRP(in_ill, ill)) {
379 			under_nce = nce_fastpath_create(in_ill,
380 			    nce->nce_common);
381 			upper_nce = nce;
382 			if ((nce = under_nce) == NULL)
383 				err = EINVAL;
384 		}
385 		rw_exit(&ipst->ips_ill_g_lock);
386 		if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common))
387 			nce_fastpath_trigger(under_nce);
388 	}
389 	if (nce != NULL) {
390 		if (newnce != NULL)
391 			*newnce = nce;
392 		else
393 			nce_refrele(nce);
394 	}
395 	/* nce_refrele is deferred until the lock is dropped  */
396 	if (upper_nce != NULL)
397 		nce_refrele(upper_nce);
398 	if (need_ill_refrele)
399 		ill_refrele(ill);
400 	return (err);
401 }
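
/*
 * A typical caller (sketch only, error handling elided) looks up or
 * creates the entry and drops its reference when done; both 0 and EEXIST
 * return a held nce:
 *
 *	nce_t	*nce;
 *	int	err;
 *
 *	err = nce_lookup_then_add_v6(ill, hw_addr, hw_addr_len, &v6addr,
 *	    0, ND_STALE, &nce);
 *	if (err == 0 || err == EEXIST) {
 *		... use nce (nce->nce_common->ncec_ill may be the ipmp_ill) ...
 *		nce_refrele(nce);
 *	}
 */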
402 
403 /*
404  * Remove all the CONDEMNED nces from the appropriate hash table.
405  * We create a private list of NCEs; these may have ires pointing
406  * to them, so the list is passed through to clean up dependent ires,
407  * and only then can we do ncec_refrele(), which can make the NCE inactive.
408  */
409 static void
410 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
411 {
412 	ncec_t *ncec1;
413 	ncec_t **ptpn;
414 
415 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
416 	ASSERT(ndp->ndp_g_walker == 0);
417 	for (; ncec; ncec = ncec1) {
418 		ncec1 = ncec->ncec_next;
419 		mutex_enter(&ncec->ncec_lock);
420 		if (NCE_ISCONDEMNED(ncec)) {
421 			ptpn = ncec->ncec_ptpn;
422 			ncec1 = ncec->ncec_next;
423 			if (ncec1 != NULL)
424 				ncec1->ncec_ptpn = ptpn;
425 			*ptpn = ncec1;
426 			ncec->ncec_ptpn = NULL;
427 			ncec->ncec_next = NULL;
428 			ncec->ncec_next = *free_nce_list;
429 			*free_nce_list = ncec;
430 		}
431 		mutex_exit(&ncec->ncec_lock);
432 	}
433 }
434 
435 /*
436  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
437  *    will return this NCE. Also no new timeouts will
438  *    be started (See nce_restart_timer).
439  * 2. Cancel any currently running timeouts.
440  * 3. If there is an ndp walker, return. The walker will do the cleanup.
441  *    This ensures that walkers see a consistent list of NCEs while walking.
442  * 4. Otherwise remove the NCE from the list of NCEs
443  */
444 void
445 ncec_delete(ncec_t *ncec)
446 {
447 	ncec_t	**ptpn;
448 	ncec_t	*ncec1;
449 	int	ipversion = ncec->ncec_ipversion;
450 	ndp_g_t *ndp;
451 	ip_stack_t	*ipst = ncec->ncec_ipst;
452 
453 	if (ipversion == IPV4_VERSION)
454 		ndp = ipst->ips_ndp4;
455 	else
456 		ndp = ipst->ips_ndp6;
457 
458 	/* Serialize deletes */
459 	mutex_enter(&ncec->ncec_lock);
460 	if (NCE_ISCONDEMNED(ncec)) {
461 		/* Some other thread is doing the delete */
462 		mutex_exit(&ncec->ncec_lock);
463 		return;
464 	}
465 	/*
466 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
467 	 * refcnt has to be >= 2
468 	 */
469 	ASSERT(ncec->ncec_refcnt >= 2);
470 	ncec->ncec_flags |= NCE_F_CONDEMNED;
471 	mutex_exit(&ncec->ncec_lock);
472 
473 	/* Count how many condemned ncecs for the kmem_cache callback */
474 	atomic_add_32(&ipst->ips_num_nce_condemned, 1);
475 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
476 
477 	/* Complete any waiting callbacks */
478 	ncec_cb_dispatch(ncec);
479 
480 	/*
481 	 * Cancel any running timer. Timeout can't be restarted
482 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
483 	 * Passing invalid timeout id is fine.
484 	 */
485 	if (ncec->ncec_timeout_id != 0) {
486 		(void) untimeout(ncec->ncec_timeout_id);
487 		ncec->ncec_timeout_id = 0;
488 	}
489 
490 	mutex_enter(&ndp->ndp_g_lock);
491 	if (ncec->ncec_ptpn == NULL) {
492 		/*
493 		 * The last ndp walker has already removed this ncec from
494 		 * the list after we marked the ncec CONDEMNED and before
495 		 * we grabbed the global lock.
496 		 */
497 		mutex_exit(&ndp->ndp_g_lock);
498 		return;
499 	}
500 	if (ndp->ndp_g_walker > 0) {
501 		/*
502 		 * Can't unlink. The walker will clean up
503 		 */
504 		ndp->ndp_g_walker_cleanup = B_TRUE;
505 		mutex_exit(&ndp->ndp_g_lock);
506 		return;
507 	}
508 
509 	/*
510 	 * Now remove the ncec from the list. nce_restart_timer won't restart
511 	 * the timer since it is marked CONDEMNED.
512 	 */
513 	ptpn = ncec->ncec_ptpn;
514 	ncec1 = ncec->ncec_next;
515 	if (ncec1 != NULL)
516 		ncec1->ncec_ptpn = ptpn;
517 	*ptpn = ncec1;
518 	ncec->ncec_ptpn = NULL;
519 	ncec->ncec_next = NULL;
520 	mutex_exit(&ndp->ndp_g_lock);
521 
522 	/* Removed from ncec_ptpn/ncec_next list */
523 	ncec_refrele_notr(ncec);
524 }
525 
526 void
527 ncec_inactive(ncec_t *ncec)
528 {
529 	mblk_t		**mpp;
530 	ill_t		*ill = ncec->ncec_ill;
531 	ip_stack_t	*ipst = ncec->ncec_ipst;
532 
533 	ASSERT(ncec->ncec_refcnt == 0);
534 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
535 
536 	/* Count how many condemned nces for kmem_cache callback */
537 	if (NCE_ISCONDEMNED(ncec))
538 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
539 
540 	/* Free all allocated messages */
541 	mpp = &ncec->ncec_qd_mp;
542 	while (*mpp != NULL) {
543 		mblk_t  *mp;
544 
545 		mp = *mpp;
546 		*mpp = mp->b_next;
547 
548 		inet_freemsg(mp);
549 	}
550 	/*
551 	 * must have been cleaned up in ncec_delete
552 	 */
553 	ASSERT(list_is_empty(&ncec->ncec_cb));
554 	list_destroy(&ncec->ncec_cb);
555 	/*
556 	 * free the ncec_lladdr if one was allocated in nce_add_common()
557 	 */
558 	if (ncec->ncec_lladdr_length > 0)
559 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
560 
561 #ifdef DEBUG
562 	ncec_trace_cleanup(ncec);
563 #endif
564 
565 	mutex_enter(&ill->ill_lock);
566 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
567 	    (char *), "ncec", (void *), ncec);
568 	ill->ill_ncec_cnt--;
569 	ncec->ncec_ill = NULL;
570 	/*
571 	 * If the number of ncec's associated with this ill has dropped
572 	 * to zero, check whether we need to restart any operation that
573 	 * is waiting for this to happen.
574 	 */
575 	if (ILL_DOWN_OK(ill)) {
576 		/* ipif_ill_refrele_tail drops the ill_lock */
577 		ipif_ill_refrele_tail(ill);
578 	} else {
579 		mutex_exit(&ill->ill_lock);
580 	}
581 
582 	mutex_destroy(&ncec->ncec_lock);
583 	kmem_cache_free(ncec_cache, ncec);
584 }
585 
586 /*
587  * ncec_walk routine.  Delete the ncec if it is associated with the ill
588  * that is going away.  Always called as a writer.
589  */
590 void
591 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
592 {
593 	if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
594 		ncec_delete(ncec);
595 	}
596 }
597 
598 /*
599  * Neighbor Cache cleanup logic for a list of ncec_t entries.
600  */
601 static void
602 nce_cleanup_list(ncec_t *ncec)
603 {
604 	ncec_t *ncec_next;
605 
606 	ASSERT(ncec != NULL);
607 	while (ncec != NULL) {
608 		ncec_next = ncec->ncec_next;
609 		ncec->ncec_next = NULL;
610 
611 		/*
612 		 * It is possible for the last ndp walker (this thread)
613 		 * to come here after ncec_delete has marked the ncec CONDEMNED
614 		 * and before it has removed the ncec from the fastpath list
615 		 * or called untimeout. So we need to do it here. It is safe
616 		 * for both ncec_delete and this thread to do it twice or
617 		 * even simultaneously since each of the threads has a
618 		 * reference on the ncec.
619 		 */
620 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
621 		/*
622 		 * Cancel any running timer. Timeout can't be restarted
623 		 * since CONDEMNED is set. The ncec_lock can't be
624 		 * held across untimeout though passing invalid timeout
625 		 * id is fine.
626 		 */
627 		if (ncec->ncec_timeout_id != 0) {
628 			(void) untimeout(ncec->ncec_timeout_id);
629 			ncec->ncec_timeout_id = 0;
630 		}
631 		/* Removed from ncec_ptpn/ncec_next list */
632 		ncec_refrele_notr(ncec);
633 		ncec = ncec_next;
634 	}
635 }
636 
637 /*
638  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
639  */
640 boolean_t
641 nce_restart_dad(ncec_t *ncec)
642 {
643 	boolean_t started;
644 	ill_t *ill, *hwaddr_ill;
645 
646 	if (ncec == NULL)
647 		return (B_FALSE);
648 	ill = ncec->ncec_ill;
649 	mutex_enter(&ncec->ncec_lock);
650 	if (ncec->ncec_state == ND_PROBE) {
651 		mutex_exit(&ncec->ncec_lock);
652 		started = B_TRUE;
653 	} else if (ncec->ncec_state == ND_REACHABLE) {
654 		ASSERT(ncec->ncec_lladdr != NULL);
655 		ncec->ncec_state = ND_PROBE;
656 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
657 		/*
658 		 * Slight cheat here: we don't use the initial probe delay
659 		 * for IPv4 in this obscure case.
660 		 */
661 		mutex_exit(&ncec->ncec_lock);
662 		if (IS_IPMP(ill)) {
663 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
664 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
665 		} else {
666 			hwaddr_ill = ill;
667 		}
668 		nce_dad(ncec, hwaddr_ill, B_TRUE);
669 		started = B_TRUE;
670 	} else {
671 		mutex_exit(&ncec->ncec_lock);
672 		started = B_FALSE;
673 	}
674 	return (started);
675 }
676 
677 /*
678  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
679  * If one is found, the refcnt on the ncec will be incremented.
680  */
681 ncec_t *
682 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
683 {
684 	ncec_t		*ncec;
685 	ip_stack_t	*ipst = ill->ill_ipst;
686 
687 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
688 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
689 
690 	/* Get head of v6 hash table */
691 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
692 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
693 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
694 	rw_exit(&ipst->ips_ill_g_lock);
695 	return (ncec);
696 }
697 /*
698  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
699  * If one is found, the refcnt on the ncec will be incremented.
700  */
701 ncec_t *
702 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
703 {
704 	ncec_t	*ncec = NULL;
705 	in6_addr_t addr6;
706 	ip_stack_t *ipst = ill->ill_ipst;
707 
708 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
709 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
710 
711 	/* Get head of v4 hash table */
712 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
713 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
714 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
715 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
716 	rw_exit(&ipst->ips_ill_g_lock);
717 	return (ncec);
718 }
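
/*
 * The lookup functions above return a held ncec, so a caller must
 * release it, e.g. (sketch only):
 *
 *	ncec_t	*ncec;
 *
 *	if ((ncec = ncec_lookup_illgrp_v4(ill, &addr)) != NULL) {
 *		... examine ncec ...
 *		ncec_refrele(ncec);
 *	}
 */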
719 
720 /*
721  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
722  * If an ncec is found, increment the hold count on that ncec.
723  * The caller passes in the start of the appropriate hash table, and must
724  * be holding the appropriate global lock (ndp_g_lock). In addition, since
725  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
726  * must be held as reader.
727  *
728  * This function always matches across the ipmp group.
729  */
730 ncec_t *
731 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
732 {
733 	ndp_g_t		*ndp;
734 	ip_stack_t	*ipst = ill->ill_ipst;
735 
736 	if (ill->ill_isv6)
737 		ndp = ipst->ips_ndp6;
738 	else
739 		ndp = ipst->ips_ndp4;
740 
741 	ASSERT(ill != NULL);
742 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
743 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
744 		return (NULL);
745 	for (; ncec != NULL; ncec = ncec->ncec_next) {
746 		if (ncec->ncec_ill == ill ||
747 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
748 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
749 				mutex_enter(&ncec->ncec_lock);
750 				if (!NCE_ISCONDEMNED(ncec)) {
751 					ncec_refhold_locked(ncec);
752 					mutex_exit(&ncec->ncec_lock);
753 					break;
754 				}
755 				mutex_exit(&ncec->ncec_lock);
756 			}
757 		}
758 	}
759 	return (ncec);
760 }
761 
762 /*
763  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
764  * entries for ill only, i.e., when ill is part of an ipmp group,
765  * nce_lookup_v4 will never try to match across the group.
766  */
767 nce_t *
768 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
769 {
770 	nce_t *nce;
771 	in6_addr_t addr6;
772 	ip_stack_t *ipst = ill->ill_ipst;
773 
774 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
775 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
776 	nce = nce_lookup_addr(ill, &addr6);
777 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
778 	return (nce);
779 }
780 
781 /*
782  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
783  * entries for ill only, i.e., when ill is part of an ipmp group,
784  * nce_lookup_v6 will never try to match across the group.
785  */
786 nce_t *
787 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
788 {
789 	nce_t *nce;
790 	ip_stack_t *ipst = ill->ill_ipst;
791 
792 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
793 	nce = nce_lookup_addr(ill, addr6);
794 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
795 	return (nce);
796 }
797 
798 static nce_t *
799 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
800 {
801 	nce_t *nce;
802 
803 	ASSERT(ill != NULL);
804 #ifdef DEBUG
805 	if (ill->ill_isv6)
806 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
807 	else
808 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
809 #endif
810 	mutex_enter(&ill->ill_lock);
811 	nce = nce_lookup(ill, addr);
812 	mutex_exit(&ill->ill_lock);
813 	return (nce);
814 }
815 
816 
817 /*
818  * Router turned to host.  We need to make sure that cached copies of the ncec
819  * are not used for forwarding packets if they were derived from the default
820  * route, and that the default route itself is removed, as required by
821  * section 7.2.5 of RFC 2461.
822  *
823  * Note that the ncec itself probably has valid link-layer information for the
824  * nexthop, so that there is no reason to delete the ncec, as long as the
825  * ISROUTER flag is turned off.
826  */
827 static void
828 ncec_router_to_host(ncec_t *ncec)
829 {
830 	ire_t		*ire;
831 	ip_stack_t	*ipst = ncec->ncec_ipst;
832 
833 	mutex_enter(&ncec->ncec_lock);
834 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
835 	mutex_exit(&ncec->ncec_lock);
836 
837 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
838 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
839 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
840 	if (ire != NULL) {
841 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
842 		ire_delete(ire);
843 		ire_refrele(ire);
844 	}
845 }
846 
847 /*
848  * Process passed in parameters either from an incoming packet or via
849  * user ioctl.
850  */
851 void
852 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
853 {
854 	ill_t	*ill = ncec->ncec_ill;
855 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
856 	boolean_t ll_updated = B_FALSE;
857 	boolean_t ll_changed;
858 	nce_t	*nce;
859 
860 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
861 	/*
862 	 * No updates of the link layer address or the neighbor state are
863 	 * allowed when the cache is in NONUD state.  This still
864 	 * allows for responding to reachability solicitations.
865 	 */
866 	mutex_enter(&ncec->ncec_lock);
867 	if (ncec->ncec_state == ND_INCOMPLETE) {
868 		if (hw_addr == NULL) {
869 			mutex_exit(&ncec->ncec_lock);
870 			return;
871 		}
872 		nce_set_ll(ncec, hw_addr);
873 		/*
874 		 * Update ncec state and send the queued packets
875 		 * back to ip; this time the ire will be added.
876 		 */
877 		if (flag & ND_NA_FLAG_SOLICITED) {
878 			nce_update(ncec, ND_REACHABLE, NULL);
879 		} else {
880 			nce_update(ncec, ND_STALE, NULL);
881 		}
882 		mutex_exit(&ncec->ncec_lock);
883 		nce = nce_fastpath(ncec, B_TRUE, NULL);
884 		nce_resolv_ok(ncec);
885 		if (nce != NULL)
886 			nce_refrele(nce);
887 		return;
888 	}
889 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
890 	if (!is_adv) {
891 		/* If this is a SOLICITATION request only */
892 		if (ll_changed)
893 			nce_update(ncec, ND_STALE, hw_addr);
894 		mutex_exit(&ncec->ncec_lock);
895 		ncec_cb_dispatch(ncec);
896 		return;
897 	}
898 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
899 		/* If in any other state than REACHABLE, ignore */
900 		if (ncec->ncec_state == ND_REACHABLE) {
901 			nce_update(ncec, ND_STALE, NULL);
902 		}
903 		mutex_exit(&ncec->ncec_lock);
904 		ncec_cb_dispatch(ncec);
905 		return;
906 	} else {
907 		if (ll_changed) {
908 			nce_update(ncec, ND_UNCHANGED, hw_addr);
909 			ll_updated = B_TRUE;
910 		}
911 		if (flag & ND_NA_FLAG_SOLICITED) {
912 			nce_update(ncec, ND_REACHABLE, NULL);
913 		} else {
914 			if (ll_updated) {
915 				nce_update(ncec, ND_STALE, NULL);
916 			}
917 		}
918 		mutex_exit(&ncec->ncec_lock);
919 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
920 		    NCE_F_ISROUTER)) {
921 			ncec_router_to_host(ncec);
922 		} else {
923 			ncec_cb_dispatch(ncec);
924 		}
925 	}
926 }
927 
928 /*
929  * Pass arg1 to the pfi supplied, along with each ncec in existence.
930  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
931  * walking the hash list.
932  */
933 void
934 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
935     boolean_t trace)
936 {
937 	ncec_t	*ncec;
938 	ncec_t	*ncec1;
939 	ncec_t	**ncep;
940 	ncec_t	*free_nce_list = NULL;
941 
942 	mutex_enter(&ndp->ndp_g_lock);
943 	/* Prevent ncec_delete from unlink and free of NCE */
944 	ndp->ndp_g_walker++;
945 	mutex_exit(&ndp->ndp_g_lock);
946 	for (ncep = ndp->nce_hash_tbl;
947 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
948 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
949 			ncec1 = ncec->ncec_next;
950 			if (ill == NULL || ncec->ncec_ill == ill) {
951 				if (trace) {
952 					ncec_refhold(ncec);
953 					(*pfi)(ncec, arg1);
954 					ncec_refrele(ncec);
955 				} else {
956 					ncec_refhold_notr(ncec);
957 					(*pfi)(ncec, arg1);
958 					ncec_refrele_notr(ncec);
959 				}
960 			}
961 		}
962 	}
963 	mutex_enter(&ndp->ndp_g_lock);
964 	ndp->ndp_g_walker--;
965 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
966 		/* Time to delete condemned entries */
967 		for (ncep = ndp->nce_hash_tbl;
968 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
969 			ncec = *ncep;
970 			if (ncec != NULL) {
971 				nce_remove(ndp, ncec, &free_nce_list);
972 			}
973 		}
974 		ndp->ndp_g_walker_cleanup = B_FALSE;
975 	}
976 
977 	mutex_exit(&ndp->ndp_g_lock);
978 
979 	if (free_nce_list != NULL) {
980 		nce_cleanup_list(free_nce_list);
981 	}
982 }
983 
984 /*
985  * Walk everything.
986  * Note that ill can be NULL hence can't derive the ipst from it.
987  */
988 void
989 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
990 {
991 	ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
992 	ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
993 }
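
/*
 * For example (sketch; the actual teardown lives in the ill code), an
 * ill being deleted can use the walker with ncec_delete_per_ill() as the
 * callback to delete every ncec still associated with it:
 *
 *	ncec_walk(ill, (pfi_t)ncec_delete_per_ill, (uchar_t *)ill, ipst);
 */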
994 
995 /*
996  * For each interface an entry is added for the unspecified multicast group.
997  * Here that mapping is used to form the multicast cache entry for a particular
998  * multicast destination.
999  */
1000 static int
1001 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1002     uint16_t flags, nce_t **newnce)
1003 {
1004 	uchar_t		*hw_addr;
1005 	int		err = 0;
1006 	ip_stack_t	*ipst = ill->ill_ipst;
1007 	nce_t		*nce;
1008 
1009 	ASSERT(ill != NULL);
1010 	ASSERT(ill->ill_isv6);
1011 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1012 
1013 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1014 	nce = nce_lookup_addr(ill, dst);
1015 	if (nce != NULL) {
1016 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1017 		goto done;
1018 	}
1019 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1020 		/*
1021 		 * For IRE_IF_RESOLVER a hardware mapping can be
1022 		 * generated.
1023 		 */
1024 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1025 		if (hw_addr == NULL) {
1026 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1027 			return (ENOMEM);
1028 		}
1029 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1030 	} else {
1031 		/*
1032 		 * No hw_addr is needed for IRE_IF_NORESOLVER.
1033 		 */
1034 		hw_addr = NULL;
1035 	}
1036 	ASSERT((flags & NCE_F_MCAST) != 0);
1037 	ASSERT((flags & NCE_F_NONUD) != 0);
1038 	/* nce_state will be computed by nce_add_common() */
1039 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1040 	    ND_UNCHANGED, &nce);
1041 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1042 	if (err == 0)
1043 		err = nce_add_v6_postprocess(nce);
1044 	if (hw_addr != NULL)
1045 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1046 	if (err != 0) {
1047 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1048 		return (err);
1049 	}
1050 done:
1051 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1052 	if (newnce != NULL)
1053 		*newnce = nce;
1054 	else
1055 		nce_refrele(nce);
1056 	return (0);
1057 }
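
/*
 * As an illustration of the mapping above: on an Ethernet-style
 * IRE_IF_RESOLVER link, ip_mcast_mapping() derives the hardware address
 * from the group address, so a group such as ff02::1:ff00:1 maps to the
 * multicast MAC 33:33:ff:00:00:01 (33:33 followed by the low-order 32
 * bits of the group, per RFC 2464).
 */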
1058 
1059 /*
1060  * Return the link layer address, and any flags of a ncec.
1061  */
1062 int
1063 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1064 {
1065 	ncec_t		*ncec;
1066 	in6_addr_t	*addr;
1067 	sin6_t		*sin6;
1068 
1069 	ASSERT(ill != NULL && ill->ill_isv6);
1070 	sin6 = (sin6_t *)&lnr->lnr_addr;
1071 	addr =  &sin6->sin6_addr;
1072 
1073 	/*
1074 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1075 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1076 	 * addresses for the data addresses on an IPMP interface even though
1077 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1078 	 */
1079 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1080 	if (ncec == NULL)
1081 		return (ESRCH);
1082 	/* If no link layer address is available yet, return ESRCH */
1083 	if (!NCE_ISREACHABLE(ncec)) {
1084 		ncec_refrele(ncec);
1085 		return (ESRCH);
1086 	}
1087 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1088 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1089 	    lnr->lnr_hdw_len);
1090 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1091 		lnr->lnr_flags = NDF_ISROUTER_ON;
1092 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1093 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1094 	ncec_refrele(ncec);
1095 	return (0);
1096 }
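
/*
 * ndp_query() is used to service the SIOCLIFGETND ioctl.  Roughly, a
 * user-space consumer (sketch only, assuming the usual lifreq/lif_nd_req
 * plumbing) fills in the interface name and lnr_addr and reads back the
 * link layer address:
 *
 *	struct lifreq lifr;
 *	struct sockaddr_in6 *sin6;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	sin6 = (struct sockaddr_in6 *)&lifr.lifr_nd.lnr_addr;
 *	sin6->sin6_family = AF_INET6;
 *	sin6->sin6_addr = target;
 *	if (ioctl(s6, SIOCLIFGETND, &lifr) == 0) {
 *		... lifr.lifr_nd.lnr_hdw_addr, lnr_hdw_len, lnr_flags ...
 *	}
 */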
1097 
1098 /*
1099  * Finish setting up the Enable/Disable multicast for the driver.
1100  */
1101 mblk_t *
1102 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1103     uint32_t hw_addr_offset, mblk_t *mp)
1104 {
1105 	uchar_t		*hw_addr;
1106 	ipaddr_t	v4group;
1107 	uchar_t		*addr;
1108 
1109 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1110 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1111 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1112 
1113 		ASSERT(CLASSD(v4group));
1114 		ASSERT(!(ill->ill_isv6));
1115 
1116 		addr = (uchar_t *)&v4group;
1117 	} else {
1118 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1119 		ASSERT(ill->ill_isv6);
1120 
1121 		addr = (uchar_t *)v6group;
1122 	}
1123 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1124 	if (hw_addr == NULL) {
1125 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1126 		freemsg(mp);
1127 		return (NULL);
1128 	}
1129 
1130 	ip_mcast_mapping(ill, addr, hw_addr);
1131 	return (mp);
1132 }
1133 
1134 void
1135 ip_ndp_resolve(ncec_t *ncec)
1136 {
1137 	in_addr_t	sender4 = INADDR_ANY;
1138 	in6_addr_t	sender6 = ipv6_all_zeros;
1139 	ill_t		*src_ill;
1140 	uint32_t	ms;
1141 
1142 	src_ill = nce_resolve_src(ncec, &sender6);
1143 	if (src_ill == NULL) {
1144 		/* Make sure we try again later */
1145 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1146 		nce_restart_timer(ncec, (clock_t)ms);
1147 		return;
1148 	}
1149 	if (ncec->ncec_ipversion == IPV4_VERSION)
1150 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1151 	mutex_enter(&ncec->ncec_lock);
1152 	if (ncec->ncec_ipversion == IPV6_VERSION)
1153 		ms = ndp_solicit(ncec, sender6, src_ill);
1154 	else
1155 		ms = arp_request(ncec, sender4, src_ill);
1156 	mutex_exit(&ncec->ncec_lock);
1157 	if (ms == 0) {
1158 		if (ncec->ncec_state != ND_REACHABLE) {
1159 			if (ncec->ncec_ipversion == IPV6_VERSION)
1160 				ndp_resolv_failed(ncec);
1161 			else
1162 				arp_resolv_failed(ncec);
1163 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1164 			nce_make_unreachable(ncec);
1165 			ncec_delete(ncec);
1166 		}
1167 	} else {
1168 		nce_restart_timer(ncec, (clock_t)ms);
1169 	}
1170 done:
1171 	ill_refrele(src_ill);
1172 }
1173 
1174 /*
1175  * Send an IPv6 neighbor solicitation.
1176  * Returns number of milliseconds after which we should either rexmit or abort.
1177  * Return of zero means we should abort.
1178  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1179  * The optional source address is used as a hint to ndp_solicit for
1180  * which source to use in the packet.
1181  *
1182  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1183  * the packet.
1184  */
1185 uint32_t
1186 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1187 {
1188 	in6_addr_t	dst;
1189 	boolean_t	dropped = B_FALSE;
1190 
1191 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1192 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1193 
1194 	if (ncec->ncec_rcnt == 0)
1195 		return (0);
1196 
1197 	dst = ncec->ncec_addr;
1198 	ncec->ncec_rcnt--;
1199 	mutex_exit(&ncec->ncec_lock);
1200 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1201 	    ill->ill_phys_addr_length, &src, &dst, 0);
1202 	mutex_enter(&ncec->ncec_lock);
1203 	if (dropped)
1204 		ncec->ncec_rcnt++;
1205 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1206 }
1207 
1208 /*
1209  * Attempt to recover an address on an interface that's been marked as a
1210  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1211  * no easy way to just probe the address and have the right thing happen if
1212  * it's no longer in use.  Instead, we just bring it up normally and allow the
1213  * regular interface start-up logic to probe for a remaining duplicate and take
1214  * us back down if necessary.
1215  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1216  * ip_ndp_excl.
1217  */
1218 /* ARGSUSED */
1219 void
1220 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1221 {
1222 	ill_t	*ill = rq->q_ptr;
1223 	ipif_t	*ipif;
1224 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1225 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1226 	boolean_t addr_equal;
1227 
1228 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1229 		/*
1230 		 * We do not support recovery of proxy ARP'd interfaces,
1231 		 * because the system lacks a complete proxy ARP mechanism.
1232 		 */
1233 		if (ill->ill_isv6) {
1234 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1235 			    addr6);
1236 		} else {
1237 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1238 		}
1239 
1240 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1241 			continue;
1242 
1243 		/*
1244 		 * If we have already recovered or if the interface is going
1245 		 * away, then ignore.
1246 		 */
1247 		mutex_enter(&ill->ill_lock);
1248 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1249 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1250 			mutex_exit(&ill->ill_lock);
1251 			continue;
1252 		}
1253 
1254 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1255 		ill->ill_ipif_dup_count--;
1256 		mutex_exit(&ill->ill_lock);
1257 		ipif->ipif_was_dup = B_TRUE;
1258 
1259 		if (ill->ill_isv6) {
1260 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1261 			(void) ipif_up_done_v6(ipif);
1262 		} else {
1263 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1264 			    EINPROGRESS);
1265 			(void) ipif_up_done(ipif);
1266 		}
1267 	}
1268 	freeb(mp);
1269 }
1270 
1271 /*
1272  *
1273  * Attempt to recover an address that's been shut down as a duplicate.
1274  * As long as someone else holds the address, the interface will stay down.
1275  * When that conflict goes away, the interface is brought back up.  This is
1276  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1277  * server will recover from a failure.
1278  *
1279  * For DHCP and temporary addresses, recovery is not done in the kernel.
1280  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1281  *
1282  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1283  */
1284 void
1285 ipif_dup_recovery(void *arg)
1286 {
1287 	ipif_t *ipif = arg;
1288 
1289 	ipif->ipif_recovery_id = 0;
1290 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1291 		return;
1292 
1293 	/*
1294 	 * No lock, because this is just an optimization.
1295 	 */
1296 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1297 		return;
1298 
1299 	/* If the link is down, we'll retry this later */
1300 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1301 		return;
1302 
1303 	ipif_do_recovery(ipif);
1304 }
1305 
1306 /*
1307  * Perform interface recovery by forcing the duplicate interfaces up and
1308  * allowing the system to determine which ones should stay up.
1309  *
1310  * Called both by recovery timer expiry and link-up notification.
1311  */
1312 void
1313 ipif_do_recovery(ipif_t *ipif)
1314 {
1315 	ill_t *ill = ipif->ipif_ill;
1316 	mblk_t *mp;
1317 	ip_stack_t *ipst = ill->ill_ipst;
1318 	size_t mp_size;
1319 
1320 	if (ipif->ipif_isv6)
1321 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1322 	else
1323 		mp_size = sizeof (ipif->ipif_lcl_addr);
1324 	mp = allocb(mp_size, BPRI_MED);
1325 	if (mp == NULL) {
1326 		mutex_enter(&ill->ill_lock);
1327 		if (ipst->ips_ip_dup_recovery > 0 &&
1328 		    ipif->ipif_recovery_id == 0 &&
1329 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1330 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1331 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1332 		}
1333 		mutex_exit(&ill->ill_lock);
1334 	} else {
1335 		/*
1336 		 * A recovery timer may still be running if we got here from
1337 		 * ill_restart_dad(); cancel that timer.
1338 		 */
1339 		if (ipif->ipif_recovery_id != 0)
1340 			(void) untimeout(ipif->ipif_recovery_id);
1341 		ipif->ipif_recovery_id = 0;
1342 
1343 		if (ipif->ipif_isv6) {
1344 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1345 			    sizeof (ipif->ipif_v6lcl_addr));
1346 		} else  {
1347 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1348 			    sizeof (ipif->ipif_lcl_addr));
1349 		}
1350 		ill_refhold(ill);
1351 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1352 		    B_FALSE);
1353 	}
1354 }
1355 
1356 /*
1357  * Find the MAC and IP addresses in an NA/NS message.
1358  */
1359 static void
1360 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1361     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1362 {
1363 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1364 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1365 	uchar_t *addr;
1366 	int alen;
1367 
1368 	/* icmp_inbound_v6 ensures this */
1369 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1370 
1371 	addr = ira->ira_l2src;
1372 	alen = ill->ill_phys_addr_length;
1373 	if (alen > 0) {
1374 		*haddr = addr;
1375 		*haddrlenp = alen;
1376 	} else {
1377 		*haddr = NULL;
1378 		*haddrlenp = 0;
1379 	}
1380 
1381 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1382 	*targp = ns->nd_ns_target;
1383 }
1384 
1385 /*
1386  * This is for exclusive changes due to NDP duplicate address detection
1387  * failure.
1388  */
1389 /* ARGSUSED */
1390 static void
1391 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1392 {
1393 	ill_t	*ill = rq->q_ptr;
1394 	ipif_t	*ipif;
1395 	uchar_t	*haddr;
1396 	uint_t	haddrlen;
1397 	ip_stack_t *ipst = ill->ill_ipst;
1398 	in6_addr_t targ;
1399 	ip_recv_attr_t iras;
1400 	mblk_t	*attrmp;
1401 
1402 	attrmp = mp;
1403 	mp = mp->b_cont;
1404 	attrmp->b_cont = NULL;
1405 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1406 		/* The ill or ip_stack_t disappeared on us */
1407 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1408 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1409 		freemsg(mp);
1410 		ira_cleanup(&iras, B_TRUE);
1411 		return;
1412 	}
1413 
1414 	ASSERT(ill == iras.ira_rill);
1415 
1416 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1417 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1418 		/*
1419 		 * Ignore conflicts generated by misbehaving switches that
1420 		 * just reflect our own messages back to us.  For IPMP, we may
1421 		 * see reflections across any ill in the illgrp.
1422 		 *
1423 		 * RFC2462 and revisions tried to detect both the case
1424 		 * when a statically configured IPv6 address is a duplicate,
1425 		 * and the case when the L2 address itself is a duplicate. The
1426 		 * latter is important because, with stateless address autoconf,
1427 		 * if the L2 address is a duplicate, the resulting IPv6
1428 		 * address(es) would also be duplicates. We rely on DAD of the
1429 		 * IPv6 address itself to detect the latter case.
1430 		 */
1431 		/* For an under ill, ill_grp can change; check it under the lock */
1432 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1433 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1434 		    IS_UNDER_IPMP(ill) &&
1435 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1436 		    haddrlen) != NULL) {
1437 			rw_exit(&ipst->ips_ill_g_lock);
1438 			goto ignore_conflict;
1439 		}
1440 		rw_exit(&ipst->ips_ill_g_lock);
1441 	}
1442 
1443 	/*
1444 	 * Look up the appropriate ipif.
1445 	 */
1446 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1447 	if (ipif == NULL)
1448 		goto ignore_conflict;
1449 
1450 	/* Reload the ill to match the ipif */
1451 	ill = ipif->ipif_ill;
1452 
1453 	/* If it's already duplicate or ineligible, then don't do anything. */
1454 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1455 		ipif_refrele(ipif);
1456 		goto ignore_conflict;
1457 	}
1458 
1459 	/*
1460 	 * If this is a failure during duplicate recovery, then don't
1461 	 * complain.  It may take a long time to recover.
1462 	 */
1463 	if (!ipif->ipif_was_dup) {
1464 		char ibuf[LIFNAMSIZ];
1465 		char hbuf[MAC_STR_LEN];
1466 		char sbuf[INET6_ADDRSTRLEN];
1467 
1468 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1469 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1470 		    " disabled", ibuf,
1471 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1472 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1473 	}
1474 	mutex_enter(&ill->ill_lock);
1475 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1476 	ipif->ipif_flags |= IPIF_DUPLICATE;
1477 	ill->ill_ipif_dup_count++;
1478 	mutex_exit(&ill->ill_lock);
1479 	(void) ipif_down(ipif, NULL, NULL);
1480 	(void) ipif_down_tail(ipif);
1481 	mutex_enter(&ill->ill_lock);
1482 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1483 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1484 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1485 	    ipst->ips_ip_dup_recovery > 0) {
1486 		ASSERT(ipif->ipif_recovery_id == 0);
1487 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1488 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1489 	}
1490 	mutex_exit(&ill->ill_lock);
1491 	ipif_refrele(ipif);
1492 
1493 ignore_conflict:
1494 	freemsg(mp);
1495 	ira_cleanup(&iras, B_TRUE);
1496 }
1497 
1498 /*
1499  * Handle failure by tearing down the ipifs with the specified address.  Note
1500  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1501  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1502  * we start a timer on the ipif.
1503  * Caller has to free mp.
1504  */
1505 static void
1506 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1507 {
1508 	const uchar_t	*haddr;
1509 	ill_t		*ill = ira->ira_rill;
1510 
1511 	/*
1512 	 * Ignore conflicts generated by misbehaving switches that just
1513 	 * reflect our own messages back to us.
1514 	 */
1515 
1516 	/* icmp_inbound_v6 ensures this */
1517 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1518 	haddr = ira->ira_l2src;
1519 	if (haddr != NULL &&
1520 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1521 		return;
1522 	}
1523 
1524 	if ((mp = copymsg(mp)) != NULL) {
1525 		mblk_t	*attrmp;
1526 
1527 		attrmp = ip_recv_attr_to_mblk(ira);
1528 		if (attrmp == NULL) {
1529 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1530 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1531 			freemsg(mp);
1532 		} else {
1533 			ASSERT(attrmp->b_cont == NULL);
1534 			attrmp->b_cont = mp;
1535 			mp = attrmp;
1536 			ill_refhold(ill);
1537 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1538 			    B_FALSE);
1539 		}
1540 	}
1541 }
1542 
1543 /*
1544  * Handle a discovered conflict: some other system is advertising that it owns
1545  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1546  * interface.
1547  *
1548  * Handles both IPv4 and IPv6
1549  */
1550 boolean_t
1551 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1552 {
1553 	ipif_t		*ipif;
1554 	clock_t		now;
1555 	uint_t		maxdefense;
1556 	uint_t		defs;
1557 	ill_t		*ill = ira->ira_ill;
1558 	ip_stack_t	*ipst = ill->ill_ipst;
1559 	uint32_t	elapsed;
1560 	boolean_t	isv6 = ill->ill_isv6;
1561 	ipaddr_t	ncec_addr;
1562 
1563 	if (isv6) {
1564 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1565 		    ipst);
1566 	} else {
1567 		if (arp_no_defense) {
1568 			/*
1569 			 * Yes, there is a conflict, but no, we do not
1570 			 * defend ourselves.
1571 			 */
1572 			return (B_TRUE);
1573 		}
1574 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1575 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1576 		    ipst);
1577 	}
1578 	if (ipif == NULL)
1579 		return (B_FALSE);
1580 
1581 	/*
1582 	 * First, figure out if this address is disposable.
1583 	 */
1584 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1585 		maxdefense = ipst->ips_ip_max_temp_defend;
1586 	else
1587 		maxdefense = ipst->ips_ip_max_defend;
1588 
1589 	/*
1590 	 * Now figure out how many times we've defended ourselves.  Ignore
1591 	 * defenses that happened long in the past.
1592 	 */
1593 	now = ddi_get_lbolt();
1594 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1595 	mutex_enter(&ncec->ncec_lock);
1596 	if ((defs = ncec->ncec_defense_count) > 0 &&
1597 	    elapsed > ipst->ips_ip_defend_interval) {
1598 		/*
1599 		 * ip_defend_interval has elapsed.
1600 		 * reset the defense count.
1601 		 */
1602 		ncec->ncec_defense_count = defs = 0;
1603 	}
1604 	ncec->ncec_defense_count++;
1605 	ncec->ncec_last_time_defended = now;
1606 	mutex_exit(&ncec->ncec_lock);
1607 	ipif_refrele(ipif);
1608 
1609 	/*
1610 	 * If we've defended ourselves too many times already, then give up and
1611 	 * tear down the interface(s) using this address.
1612 	 * Otherwise, caller has to defend by sending out an announce.
1613 	 */
1614 	if (defs >= maxdefense) {
1615 		if (isv6)
1616 			ndp_failure(mp, ira);
1617 		else
1618 			arp_failure(mp, ira);
1619 	} else {
1620 		return (B_TRUE); /* caller must defend this address */
1621 	}
1622 	return (B_FALSE);
1623 }
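
/*
 * To make the accounting above concrete: if maxdefense were 3, the first
 * three conflicts seen within ip_defend_interval are defended (B_TRUE is
 * returned and the caller announces the address); on the fourth conflict
 * defs reaches maxdefense, so the address is torn down via ndp_failure()
 * or arp_failure() and B_FALSE is returned.
 */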
1624 
1625 /*
1626  * Handle reception of Neighbor Solicitation messages.
1627  */
1628 static void
1629 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1630 {
1631 	ill_t		*ill = ira->ira_ill, *under_ill;
1632 	nd_neighbor_solicit_t *ns;
1633 	uint32_t	hlen = ill->ill_phys_addr_length;
1634 	uchar_t		*haddr = NULL;
1635 	icmp6_t		*icmp_nd;
1636 	ip6_t		*ip6h;
1637 	ncec_t		*our_ncec = NULL;
1638 	in6_addr_t	target;
1639 	in6_addr_t	src;
1640 	int		len;
1641 	int		flag = 0;
1642 	nd_opt_hdr_t	*opt = NULL;
1643 	boolean_t	bad_solicit = B_FALSE;
1644 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1645 	boolean_t	need_ill_refrele = B_FALSE;
1646 
1647 	ip6h = (ip6_t *)mp->b_rptr;
1648 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1649 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1650 	src = ip6h->ip6_src;
1651 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1652 	target = ns->nd_ns_target;
1653 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1654 		if (ip_debug > 2) {
1655 			/* ip1dbg */
1656 			pr_addr_dbg("ndp_input_solicit: Target is"
1657 			    " multicast! %s\n", AF_INET6, &target);
1658 		}
1659 		bad_solicit = B_TRUE;
1660 		goto done;
1661 	}
1662 	if (len > sizeof (nd_neighbor_solicit_t)) {
1663 		/* Options present */
1664 		opt = (nd_opt_hdr_t *)&ns[1];
1665 		len -= sizeof (nd_neighbor_solicit_t);
1666 		if (!ndp_verify_optlen(opt, len)) {
1667 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1668 			bad_solicit = B_TRUE;
1669 			goto done;
1670 		}
1671 	}
1672 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1673 		/* Check to see if this is a valid DAD solicitation */
1674 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1675 			if (ip_debug > 2) {
1676 				/* ip1dbg */
1677 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1678 				    "Destination is not solicited node "
1679 				    "multicast %s\n", AF_INET6,
1680 				    &ip6h->ip6_dst);
1681 			}
1682 			bad_solicit = B_TRUE;
1683 			goto done;
1684 		}
1685 	}
1686 
1687 	/*
1688 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1689 	 * received this packet if it's multicast) is not the ill tied to
1690 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1691 	 * to ensure we find the associated NCE.
1692 	 */
1693 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1694 	/*
1695 	 * If this is a valid Solicitation for an address we are publishing,
1696 	 * then a PUBLISH entry should exist in the cache
1697 	 */
1698 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1699 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1700 		    "ifname=%s ", ill->ill_name));
1701 		if (ip_debug > 2) {
1702 			/* ip1dbg */
1703 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1704 		}
1705 		if (our_ncec == NULL)
1706 			bad_solicit = B_TRUE;
1707 		goto done;
1708 	}
1709 
1710 	/* At this point we should have a verified NS per spec */
1711 	if (opt != NULL) {
1712 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1713 		if (opt != NULL) {
1714 			haddr = (uchar_t *)&opt[1];
1715 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1716 			    hlen == 0) {
1717 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1718 				bad_solicit = B_TRUE;
1719 				goto done;
1720 			}
1721 		}
1722 	}
1723 
1724 	/* If sending directly to peer, set the unicast flag */
1725 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1726 		flag |= NDP_UNICAST;
1727 
1728 	/*
1729 	 * Create/update the entry for the soliciting node on the ipmp_ill,
1730 	 * or respond to outstanding queries; don't do either if
1731 	 * the source is the unspecified address.
1732 	 */
1733 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1734 		int	err;
1735 		nce_t	*nnce;
1736 
1737 		ASSERT(ill->ill_isv6);
1738 		/*
1739 		 * Regular solicitations *must* include the Source Link-Layer
1740 		 * Address option.  Ignore messages that do not.
1741 		 */
1742 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1743 			ip1dbg(("ndp_input_solicit: source link-layer address "
1744 			    "option missing with a specified source.\n"));
1745 			bad_solicit = B_TRUE;
1746 			goto done;
1747 		}
1748 
1749 		/*
1750 		 * This is a regular solicitation.  If we're still in the
1751 		 * process of verifying the address, then don't respond at all
1752 		 * and don't keep track of the sender.
1753 		 */
1754 		if (our_ncec->ncec_state == ND_PROBE)
1755 			goto done;
1756 
1757 		/*
1758 		 * If the solicitation doesn't have sender hardware address
1759 		 * (legal for unicast solicitation), then process without
1760 		 * installing the return NCE.  Either we already know it, or
1761 		 * we'll be forced to look it up when (and if) we reply to the
1762 		 * packet.
1763 		 */
1764 		if (haddr == NULL)
1765 			goto no_source;
1766 
1767 		under_ill = ill;
1768 		if (IS_UNDER_IPMP(under_ill)) {
1769 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
1770 			if (ill == NULL)
1771 				ill = under_ill;
1772 			else
1773 				need_ill_refrele = B_TRUE;
1774 		}
1775 		err = nce_lookup_then_add_v6(ill,
1776 		    haddr, hlen,
1777 		    &src,	/* Soliciting nodes address */
1778 		    0,
1779 		    ND_STALE,
1780 		    &nnce);
1781 
1782 		if (need_ill_refrele) {
1783 			ill_refrele(ill);
1784 			ill = under_ill;
1785 			need_ill_refrele =  B_FALSE;
1786 		}
1787 		switch (err) {
1788 		case 0:
1789 			/* done with this entry */
1790 			nce_refrele(nnce);
1791 			break;
1792 		case EEXIST:
1793 			/*
1794 			 * B_FALSE indicates this is not an advertisement.
1795 			 */
1796 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1797 			nce_refrele(nnce);
1798 			break;
1799 		default:
1800 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1801 			    err));
1802 			goto done;
1803 		}
1804 no_source:
1805 		flag |= NDP_SOLICITED;
1806 	} else {
1807 		/*
1808 		 * No source link layer address option should be present in a
1809 		 * valid DAD request.
1810 		 */
1811 		if (haddr != NULL) {
1812 			ip1dbg(("ndp_input_solicit: source link-layer address "
1813 			    "option present with an unspecified source.\n"));
1814 			bad_solicit = B_TRUE;
1815 			goto done;
1816 		}
1817 		if (our_ncec->ncec_state == ND_PROBE) {
1818 			/*
1819 			 * Internally looped-back probes will have
1820 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1821 			 * transmissions.
1822 			 */
1823 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1824 				/*
1825 				 * If someone else is probing our address, then
1826 				 * we've crossed wires.  Declare failure.
1827 				 */
1828 				ndp_failure(mp, ira);
1829 			}
1830 			goto done;
1831 		}
1832 		/*
1833 		 * This is a DAD probe.  Multicast the advertisement to the
1834 		 * all-nodes address.
1835 		 */
1836 		src = ipv6_all_hosts_mcast;
1837 	}
1838 	flag |= nce_advert_flags(our_ncec);
1839 	(void) ndp_xmit(ill,
1840 	    ND_NEIGHBOR_ADVERT,
1841 	    our_ncec->ncec_lladdr,
1842 	    our_ncec->ncec_lladdr_length,
1843 	    &target,	/* Source and target of the advertisement pkt */
1844 	    &src,	/* IP Destination (source of original pkt) */
1845 	    flag);
1846 done:
1847 	if (bad_solicit)
1848 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1849 	if (our_ncec != NULL)
1850 		ncec_refrele(our_ncec);
1851 }
1852 
1853 /*
1854  * Handle reception of Neighbor Advertisement messages
1855  */
1856 void
1857 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1858 {
1859 	ill_t		*ill = ira->ira_ill;
1860 	nd_neighbor_advert_t *na;
1861 	uint32_t	hlen = ill->ill_phys_addr_length;
1862 	uchar_t		*haddr = NULL;
1863 	icmp6_t		*icmp_nd;
1864 	ip6_t		*ip6h;
1865 	ncec_t		*dst_ncec = NULL;
1866 	in6_addr_t	target;
1867 	nd_opt_hdr_t	*opt = NULL;
1868 	int		len;
1869 	ip_stack_t	*ipst = ill->ill_ipst;
1870 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1871 
1872 	ip6h = (ip6_t *)mp->b_rptr;
1873 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1874 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1875 	na = (nd_neighbor_advert_t *)icmp_nd;
1876 
1877 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1878 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1879 		ip1dbg(("ndp_input_advert: Destination is multicast but the "
1880 		    "solicited flag is not zero\n"));
1881 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1882 		return;
1883 	}
1884 	target = na->nd_na_target;
1885 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1886 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
1887 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1888 		return;
1889 	}
1890 	if (len > sizeof (nd_neighbor_advert_t)) {
1891 		opt = (nd_opt_hdr_t *)&na[1];
1892 		if (!ndp_verify_optlen(opt,
1893 		    len - sizeof (nd_neighbor_advert_t))) {
1894 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1895 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1896 			return;
1897 		}
1898 		/* At this point we have a verified NA per spec */
1899 		len -= sizeof (nd_neighbor_advert_t);
1900 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1901 		if (opt != NULL) {
1902 			haddr = (uchar_t *)&opt[1];
1903 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1904 			    hlen == 0) {
1905 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1906 				BUMP_MIB(mib,
1907 				    ipv6IfIcmpInBadNeighborAdvertisements);
1908 				return;
1909 			}
1910 		}
1911 	}
1912 
1913 	/*
1914 	 * NOTE: we match across the illgrp since we need to do DAD for all of
1915 	 * our local addresses, and those are spread across all the active
1916 	 * ills in the group.
1917 	 */
1918 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1919 		return;
1920 
1921 	if (NCE_PUBLISH(dst_ncec)) {
1922 		/*
1923 		 * Someone just advertised an address that we publish. First,
1924 		 * check if it was us -- if so, we can safely ignore it.
1925 		 * We don't get the haddr from ira_l2src because, in the
1926 		 * case that the packet originated from us on an IPMP group,
1927 		 * the ira_l2src would be the link-layer address of the
1928 		 * cast_ill used to send the packet, which may not be the same
1929 		 * as the dst_ncec->ncec_lladdr of the address.
1930 		 */
1931 		if (haddr != NULL) {
1932 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1933 				goto out;
1934 
1935 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1936 				goto out;   /* from us -- no conflict */
1937 
1938 			/*
1939 			 * If we're in an IPMP group, check if this is an echo
1940 			 * from another ill in the group.  Use the double-
1941 			 * checked locking pattern to avoid grabbing
1942 			 * ill_g_lock in the non-IPMP case.
1943 			 */
1944 			if (IS_UNDER_IPMP(ill)) {
1945 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1946 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1947 				    ill->ill_grp, haddr, hlen) != NULL) {
1948 					rw_exit(&ipst->ips_ill_g_lock);
1949 					goto out;
1950 				}
1951 				rw_exit(&ipst->ips_ill_g_lock);
1952 			}
1953 		}
1954 
1955 		/*
1956 		 * This appears to be a real conflict.  If we're trying to
1957 		 * configure this NCE (ND_PROBE), then shut it down.
1958 		 * Otherwise, handle the discovered conflict.
1959 		 */
1960 		if (dst_ncec->ncec_state == ND_PROBE) {
1961 			ndp_failure(mp, ira);
1962 		} else {
1963 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
1964 				char hbuf[MAC_STR_LEN];
1965 				char sbuf[INET6_ADDRSTRLEN];
1966 
1967 				cmn_err(CE_WARN,
1968 				    "node '%s' is using %s on %s",
1969 				    inet_ntop(AF_INET6, &target, sbuf,
1970 				    sizeof (sbuf)),
1971 				    haddr == NULL ? "<none>" :
1972 				    mac_colon_addr(haddr, hlen, hbuf,
1973 				    sizeof (hbuf)), ill->ill_name);
1974 				/*
1975 				 * RFC 4862, Section 5.4.4 does not mandate
1976 				 * any specific behavior when an NA matches
1977 				 * a non-tentative address assigned to the
1978 				 * receiver. We make the choice of defending
1979 				 * our address, based on the assumption that
1980 				 * the sender has not detected the Duplicate.
1981 				 *
1982 				 * ncec_last_time_defended has been adjusted
1983 				 * in ip_nce_conflict()
1984 				 */
1985 				(void) ndp_announce(dst_ncec);
1986 			}
1987 		}
1988 	} else {
1989 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
1990 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
1991 
1992 		/* B_TRUE indicates this an advertisement */
1993 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
1994 	}
1995 out:
1996 	ncec_refrele(dst_ncec);
1997 }
1998 
1999 /*
2000  * Process NDP neighbor solicitation/advertisement messages.
2001  * The checksum has already been verified before reaching here.
2002  * Information about the datalink header is contained in ira_l2src, but
2003  * that should be ignored for loopback packets.
2004  */
2005 void
2006 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2007 {
2008 	ill_t		*ill = ira->ira_rill;
2009 	icmp6_t		*icmp_nd;
2010 	ip6_t		*ip6h;
2011 	int		len;
2012 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2013 	ill_t		*orig_ill = NULL;
2014 
2015 	/*
2016 	 * Since ira_ill is where the IRE_LOCAL was hosted, we use ira_rill
2017 	 * and make it the IPMP upper ill to avoid being confused by a packet
2018 	 * addressed to a unicast address on a different ill.
2019 	 */
2020 	if (IS_UNDER_IPMP(ill)) {
2021 		orig_ill = ill;
2022 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2023 		if (ill == NULL) {
2024 			ill = orig_ill;
2025 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2026 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2027 			    mp, ill);
2028 			freemsg(mp);
2029 			return;
2030 		}
2031 		ASSERT(ill != orig_ill);
2032 		orig_ill = ira->ira_ill;
2033 		ira->ira_ill = ill;
2034 		mib = ill->ill_icmp6_mib;
2035 	}
2036 	if (!pullupmsg(mp, -1)) {
2037 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2038 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2039 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2040 		goto done;
2041 	}
2042 	ip6h = (ip6_t *)mp->b_rptr;
2043 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2044 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2045 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2046 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2047 		goto done;
2048 	}
2049 	/*
2050 	 * NDP does not accept any extension headers between the
2051 	 * IP header and the ICMP header since e.g. a routing
2052 	 * header could be dangerous.
2053 	 * This assumes that any AH or ESP headers are removed
2054 	 * by ip prior to passing the packet to ndp_input.
2055 	 */
2056 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2057 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2058 		    ip6h->ip6_nxt));
2059 		ip_drop_input("Wrong next header", mp, ill);
2060 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2061 		goto done;
2062 	}
2063 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2064 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2065 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2066 	if (icmp_nd->icmp6_code != 0) {
2067 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2068 		ip_drop_input("code non-zero", mp, ill);
2069 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2070 		goto done;
2071 	}
2072 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2073 	/*
2074 	 * Make sure packet length is large enough for either
2075 	 * a NS or a NA icmp packet.
2076 	 */
2077 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2078 		ip1dbg(("ndp_input: packet too short\n"));
2079 		ip_drop_input("packet too short", mp, ill);
2080 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2081 		goto done;
2082 	}
2083 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2084 		ndp_input_solicit(mp, ira);
2085 	} else {
2086 		ndp_input_advert(mp, ira);
2087 	}
2088 done:
2089 	freemsg(mp);
2090 	if (orig_ill != NULL) {
2091 		ill_refrele(ill);
2092 		ira->ira_ill = orig_ill;
2093 	}
2094 }
2095 
2096 /*
2097  * ndp_xmit is called to form and transmit a ND solicitation or
2098  * advertisement ICMP packet.
2099  *
2100  * If the source address is unspecified and this isn't a probe (used for
2101  * duplicate address detection), an appropriate source address and link layer
2102  * address will be chosen here.  The link layer address option is included if
2103  * the source is specified (i.e., all non-probe packets), and omitted (per the
2104  * specification) otherwise.
2105  *
2106  * It returns B_FALSE if the packet was handed to ip_output_simple(),
2107  * and B_TRUE if the packet could not be allocated and was dropped.
2108  */
2109 static boolean_t
2110 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2111     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2112 {
2113 	uint32_t	len;
2114 	icmp6_t 	*icmp6;
2115 	mblk_t		*mp;
2116 	ip6_t		*ip6h;
2117 	nd_opt_hdr_t	*opt;
2118 	uint_t		plen;
2119 	zoneid_t	zoneid = GLOBAL_ZONEID;
2120 	ill_t		*hwaddr_ill = ill;
2121 	ip_xmit_attr_t	ixas;
2122 	ip_stack_t	*ipst = ill->ill_ipst;
2123 	boolean_t	need_refrele = B_FALSE;
2124 	boolean_t	probe = B_FALSE;
2125 
2126 	if (IS_UNDER_IPMP(ill)) {
2127 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2128 		/*
2129 		 * We send non-probe packets on the upper IPMP interface.
2130 		 * ip_output_simple() will use cast_ill for sending any
2131 		 * multicast packets. Note that we can't follow the same
2132 		 * logic for probe packets because all interfaces in the ipmp
2133 		 * group may have failed, so that we really want to only try
2134 		 * to send the ND packet on the ill corresponding to the src
2135 		 * address.
2136 		 */
2137 		if (!probe) {
2138 			ill = ipmp_ill_hold_ipmp_ill(ill);
2139 			if (ill != NULL)
2140 				need_refrele = B_TRUE;
2141 			else
2142 				ill = hwaddr_ill;
2143 		}
2144 	}
2145 
2146 	/*
2147 	 * If we have an unspecified source (sender) address, select a
2148 	 * proper source address for the solicitation here so
2149 	 * that we can initialize the h/w address correctly.
2150 	 *
2151 	 * If the sender is specified then we use this address in order
2152 	 * to look up the zoneid before calling ip_output_simple(). This is to
2153 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2154 	 * by IP (we cannot guarantee that the global zone has an interface
2155 	 * route to the destination).
2156 	 *
2157 	 * Note that the NA never comes here with the unspecified source
2158 	 * address.
2159 	 */
2160 
2161 	/*
2162 	 * Probes will have unspec src at this point.
2163 	 */
2164 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2165 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2166 		/*
2167 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2168 		 * ALL_ZONES if it cannot find a matching ipif for the address
2169 		 * we are trying to use. In this case we err on the side of
2170 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2171 		 */
2172 		if (zoneid == ALL_ZONES)
2173 			zoneid = GLOBAL_ZONEID;
2174 	}
2175 
2176 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2177 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2178 	mp = allocb(len,  BPRI_LO);
2179 	if (mp == NULL) {
2180 		if (need_refrele)
2181 			ill_refrele(ill);
2182 		return (B_TRUE);
2183 	}
2184 
2185 	bzero((char *)mp->b_rptr, len);
2186 	mp->b_wptr = mp->b_rptr + len;
2187 
2188 	bzero(&ixas, sizeof (ixas));
2189 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM;
2190 
2191 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2192 	ixas.ixa_ipst = ipst;
2193 	ixas.ixa_cred = kcred;
2194 	ixas.ixa_cpid = NOPID;
2195 	ixas.ixa_tsl = NULL;
2196 	ixas.ixa_zoneid = zoneid;
2197 
2198 	ip6h = (ip6_t *)mp->b_rptr;
2199 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2200 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2201 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2202 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2203 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2204 	ip6h->ip6_dst = *target;
2205 	icmp6 = (icmp6_t *)&ip6h[1];
2206 
2207 	if (hw_addr_len != 0) {
2208 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2209 		    sizeof (nd_neighbor_advert_t));
2210 	} else {
2211 		opt = NULL;
2212 	}
2213 	if (operation == ND_NEIGHBOR_SOLICIT) {
2214 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2215 
2216 		if (opt != NULL && !(flag & NDP_PROBE)) {
2217 			/*
2218 			 * Note that, per RFC 4862, we don't send out an SLLA
2219 			 * for ND probes, even though we do send out the src
2220 			 * haddr for IPv4 DAD probes, and even though both IPv4
2221 			 * and IPv6 probes go out with the unspecified/INADDR_ANY
2222 			 * src IP addr.
2223 			 */
2224 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2225 		}
2226 		ip6h->ip6_src = *sender;
2227 		ns->nd_ns_target = *target;
2228 		if (!(flag & NDP_UNICAST)) {
2229 			/* Form multicast address of the target */
2230 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2231 			ip6h->ip6_dst.s6_addr32[3] |=
2232 			    ns->nd_ns_target.s6_addr32[3];
2233 		}
2234 	} else {
2235 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2236 
2237 		ASSERT(!(flag & NDP_PROBE));
2238 		if (opt != NULL)
2239 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2240 		ip6h->ip6_src = *sender;
2241 		na->nd_na_target = *sender;
2242 		if (flag & NDP_ISROUTER)
2243 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2244 		if (flag & NDP_SOLICITED)
2245 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2246 		if (flag & NDP_ORIDE)
2247 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2248 	}
2249 
2250 	if (!(flag & NDP_PROBE)) {
2251 		if (hw_addr != NULL && opt != NULL) {
2252 			/* Fill in link layer address and option len */
2253 			opt->nd_opt_len = (uint8_t)plen;
2254 			bcopy(hw_addr, &opt[1], hw_addr_len);
2255 		}
2256 	}
2257 	if (opt != NULL && opt->nd_opt_type == 0) {
2258 		/* If there's no link layer address option, then strip it. */
2259 		len -= plen * 8;
2260 		mp->b_wptr = mp->b_rptr + len;
2261 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2262 	}
2263 
2264 	icmp6->icmp6_type = (uint8_t)operation;
2265 	icmp6->icmp6_code = 0;
2266 	/*
2267 	 * Prepare for checksum by putting icmp length in the icmp
2268 	 * checksum field. The checksum is calculated in ip_output.c.
2269 	 */
2270 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2271 
2272 	(void) ip_output_simple(mp, &ixas);
2273 	ixa_cleanup(&ixas);
2274 	if (need_refrele)
2275 		ill_refrele(ill);
2276 	return (B_FALSE);
2277 }
2278 
2279 /*
2280  * Used to set ND_UNREACHABLE before ncec_delete sets NCE_F_CONDEMNED.
2281  * The datapath uses this as an indication that there
2282  * is a problem (as opposed to a NCE that was just
2283  * reclaimed due to lack of memory).
2284  * Note that static ARP entries never become unreachable.
2285  */
2286 void
2287 nce_make_unreachable(ncec_t *ncec)
2288 {
2289 	mutex_enter(&ncec->ncec_lock);
2290 	ncec->ncec_state = ND_UNREACHABLE;
2291 	mutex_exit(&ncec->ncec_lock);
2292 }
2293 
2294 /*
2295  * NCE retransmit timer. Common to IPv4 and IPv6.
2296  * This timer goes off when:
2297  * a. It is time to retransmit a resolution for resolver.
2298  * b. It is time to send reachability probes.
2299  */
2300 void
2301 nce_timer(void *arg)
2302 {
2303 	ncec_t		*ncec = arg;
2304 	ill_t		*ill = ncec->ncec_ill, *src_ill;
2305 	char		addrbuf[INET6_ADDRSTRLEN];
2306 	boolean_t	dropped = B_FALSE;
2307 	ip_stack_t	*ipst = ncec->ncec_ipst;
2308 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2309 	in_addr_t	sender4 = INADDR_ANY;
2310 	in6_addr_t	sender6 = ipv6_all_zeros;
2311 
2312 	/*
2313 	 * The timer has to be cancelled by ncec_delete before doing the final
2314 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2315 	 * until it clears the timeout_id. Before clearing the timeout_id
2316 	 * bump up the refcnt so that we can continue to use the ncec.
2317 	 */
2318 	ASSERT(ncec != NULL);
2319 	mutex_enter(&ncec->ncec_lock);
2320 	ncec_refhold_locked(ncec);
2321 	ncec->ncec_timeout_id = 0;
2322 	mutex_exit(&ncec->ncec_lock);
2323 
2324 	src_ill = nce_resolve_src(ncec, &sender6);
2325 	/* if we could not find a sender address, return */
2326 	if (src_ill == NULL) {
2327 		if (!isv6) {
2328 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2329 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2330 			    &sender4, addrbuf, sizeof (addrbuf))));
2331 		} else {
2332 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2333 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2334 		}
2335 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2336 		ncec_refrele(ncec);
2337 		return;
2338 	}
2339 	if (!isv6)
2340 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2341 
2342 	mutex_enter(&ncec->ncec_lock);
2343 	/*
2344 	 * Check the reachability state.
2345 	 */
2346 	switch (ncec->ncec_state) {
2347 	case ND_DELAY:
2348 		ASSERT(ncec->ncec_lladdr != NULL);
2349 		ncec->ncec_state = ND_PROBE;
2350 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2351 		if (isv6) {
2352 			mutex_exit(&ncec->ncec_lock);
2353 			(void) ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2354 			    src_ill->ill_phys_addr,
2355 			    src_ill->ill_phys_addr_length,
2356 			    &sender6, &ncec->ncec_addr,
2357 			    NDP_UNICAST);
2358 		} else {
2359 			(void) arp_request(ncec, sender4, src_ill);
2360 			mutex_exit(&ncec->ncec_lock);
2361 		}
2362 		if (ip_debug > 3) {
2363 			/* ip2dbg */
2364 			pr_addr_dbg("nce_timer: state for %s changed "
2365 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2366 		}
2367 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2368 		break;
2369 	case ND_PROBE:
2370 		/* must be retransmit timer */
2371 		ASSERT(ncec->ncec_pcnt >= -1);
2372 		if (ncec->ncec_pcnt > 0) {
2373 			/*
2374 			 * As per RFC2461, the ncec gets deleted after
2375 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2376 			 * Note that the first unicast solicitation is sent
2377 			 * during the DELAY state.
2378 			 */
2379 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2380 			    ncec->ncec_pcnt,
2381 			    inet_ntop((isv6? AF_INET6 : AF_INET),
2382 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2383 			if (NCE_PUBLISH(ncec)) {
2384 				mutex_exit(&ncec->ncec_lock);
2385 				/*
2386 				 * send out a probe; note that src_ill
2387 				 * is ignored by nce_dad() for all
2388 				 * DAD message types other than IPv6
2389 				 * unicast probes
2390 				 */
2391 				nce_dad(ncec, src_ill, B_TRUE);
2392 			} else {
2393 				ASSERT(src_ill != NULL);
2394 				ncec->ncec_pcnt--;
2395 				if (isv6) {
2396 					mutex_exit(&ncec->ncec_lock);
2397 					(void) ndp_xmit(src_ill,
2398 					    ND_NEIGHBOR_SOLICIT,
2399 					    src_ill->ill_phys_addr,
2400 					    src_ill->ill_phys_addr_length,
2401 					    &sender6, &ncec->ncec_addr,
2402 					    NDP_UNICAST);
2403 				} else {
2404 					/*
2405 					 * since the nce is REACHABLE,
2406 					 * the ARP request will be sent out
2407 					 * as a link-layer unicast.
2408 					 */
2409 					(void) arp_request(ncec, sender4,
2410 					    src_ill);
2411 					mutex_exit(&ncec->ncec_lock);
2412 				}
2413 				nce_restart_timer(ncec,
2414 				    ill->ill_reachable_retrans_time);
2415 			}
2416 		} else if (ncec->ncec_pcnt < 0) {
2417 			/* No hope, delete the ncec */
2418 			/* Tell datapath it went bad */
2419 			ncec->ncec_state = ND_UNREACHABLE;
2420 			mutex_exit(&ncec->ncec_lock);
2421 			if (ip_debug > 2) {
2422 				/* ip1dbg */
2423 				pr_addr_dbg("nce_timer: Delete NCE for"
2424 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
2425 				    &ncec->ncec_addr);
2426 			}
2427 			/* If this is a static entry, we can't delete it. */
2428 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2429 				ncec_delete(ncec);
2430 
2431 		} else if (!NCE_PUBLISH(ncec)) {
2432 			/*
2433 			 * Probe count is 0 for a dynamic entry (one that we
2434 			 * ourselves are not publishing). We should never get
2435 			 * here if NONUD was requested, hence the ASSERT below.
2436 			 */
2437 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2438 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2439 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
2440 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2441 			ncec->ncec_pcnt--;
2442 			mutex_exit(&ncec->ncec_lock);
2443 			/* Wait one interval before killing */
2444 			nce_restart_timer(ncec,
2445 			    ill->ill_reachable_retrans_time);
2446 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2447 			ipif_t *ipif;
2448 			ipaddr_t ncec_addr;
2449 
2450 			/*
2451 			 * We're done probing, and we can now declare this
2452 			 * address to be usable.  Let IP know that it's ok to
2453 			 * use.
2454 			 */
2455 			ncec->ncec_state = ND_REACHABLE;
2456 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2457 			mutex_exit(&ncec->ncec_lock);
2458 			if (isv6) {
2459 				ipif = ipif_lookup_addr_exact_v6(
2460 				    &ncec->ncec_addr, ill, ipst);
2461 			} else {
2462 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2463 				    ncec_addr);
2464 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2465 				    ipst);
2466 			}
2467 			if (ipif != NULL) {
2468 				if (ipif->ipif_was_dup) {
2469 					char ibuf[LIFNAMSIZ + 10];
2470 					char sbuf[INET6_ADDRSTRLEN];
2471 
2472 					ipif->ipif_was_dup = B_FALSE;
2473 					(void) inet_ntop(AF_INET6,
2474 					    &ipif->ipif_v6lcl_addr,
2475 					    sbuf, sizeof (sbuf));
2476 					ipif_get_name(ipif, ibuf,
2477 					    sizeof (ibuf));
2478 					cmn_err(CE_NOTE, "recovered address "
2479 					    "%s on %s", sbuf, ibuf);
2480 				}
2481 				if ((ipif->ipif_flags & IPIF_UP) &&
2482 				    !ipif->ipif_addr_ready)
2483 					ipif_up_notify(ipif);
2484 				ipif->ipif_addr_ready = 1;
2485 				ipif_refrele(ipif);
2486 			}
2487 			if (!isv6 && arp_no_defense)
2488 				break;
2489 			/* Begin defending our new address */
2490 			if (ncec->ncec_unsolicit_count > 0) {
2491 				ncec->ncec_unsolicit_count--;
2492 				if (isv6) {
2493 					dropped = ndp_announce(ncec);
2494 				} else {
2495 					dropped = arp_announce(ncec);
2496 				}
2497 
2498 				if (dropped)
2499 					ncec->ncec_unsolicit_count++;
2500 				else
2501 					ncec->ncec_last_time_defended =
2502 					    ddi_get_lbolt();
2503 			}
2504 			if (ncec->ncec_unsolicit_count > 0) {
2505 				nce_restart_timer(ncec,
2506 				    ANNOUNCE_INTERVAL(isv6));
2507 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
2508 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2509 			}
2510 		} else {
2511 			/*
2512 			 * This is an address we're probing to be our own, but
2513 			 * the ill is down.  Wait until it comes back before
2514 			 * doing anything, but switch to reachable state so
2515 			 * that the restart will work.
2516 			 */
2517 			ncec->ncec_state = ND_REACHABLE;
2518 			mutex_exit(&ncec->ncec_lock);
2519 		}
2520 		break;
2521 	case ND_INCOMPLETE: {
2522 		mblk_t	*mp, *nextmp;
2523 		mblk_t	**prevmpp;
2524 
2525 		/*
2526 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2527 		 * for any IPMP probe packets, and toss them.  IPMP probe
2528 		 * packets will always be at the head of ncec_qd_mp, so that
2529 		 * we can stop at the first queued ND packet that is
2530 		 * not a probe packet.
2531 		 */
2532 		prevmpp = &ncec->ncec_qd_mp;
2533 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2534 			nextmp = mp->b_next;
2535 
2536 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2537 				inet_freemsg(mp);
2538 				ncec->ncec_nprobes--;
2539 				*prevmpp = nextmp;
2540 			} else {
2541 				prevmpp = &mp->b_next;
2542 			}
2543 		}
2544 
2545 		/*
2546 		 * Must be resolver's retransmit timer.
2547 		 */
2548 		mutex_exit(&ncec->ncec_lock);
2549 		ip_ndp_resolve(ncec);
2550 		break;
2551 	}
2552 	case ND_REACHABLE:
2553 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2554 		    ncec->ncec_unsolicit_count != 0) ||
2555 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2556 			if (ncec->ncec_unsolicit_count > 0) {
2557 				ncec->ncec_unsolicit_count--;
2558 				mutex_exit(&ncec->ncec_lock);
2559 				/*
2560 				 * When we get to zero announcements left,
2561 				 * switch to address defense
2562 				 */
2563 			} else {
2564 				boolean_t rate_limit;
2565 
2566 				mutex_exit(&ncec->ncec_lock);
2567 				rate_limit = ill_defend_rate_limit(ill, ncec);
2568 				if (rate_limit) {
2569 					nce_restart_timer(ncec,
2570 					    DEFENSE_INTERVAL(isv6));
2571 					break;
2572 				}
2573 			}
2574 			if (isv6) {
2575 				dropped = ndp_announce(ncec);
2576 			} else {
2577 				dropped = arp_announce(ncec);
2578 			}
2579 			mutex_enter(&ncec->ncec_lock);
2580 			if (dropped) {
2581 				ncec->ncec_unsolicit_count++;
2582 			} else {
2583 				ncec->ncec_last_time_defended =
2584 				    ddi_get_lbolt();
2585 			}
2586 			mutex_exit(&ncec->ncec_lock);
2587 			if (ncec->ncec_unsolicit_count != 0) {
2588 				nce_restart_timer(ncec,
2589 				    ANNOUNCE_INTERVAL(isv6));
2590 			} else {
2591 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2592 			}
2593 		} else {
2594 			mutex_exit(&ncec->ncec_lock);
2595 		}
2596 		break;
2597 	default:
2598 		mutex_exit(&ncec->ncec_lock);
2599 		break;
2600 	}
2601 done:
2602 	ncec_refrele(ncec);
2603 	ill_refrele(src_ill);
2604 }
2605 
2606 /*
2607  * Set a link layer address from the ll_addr passed in.
2608  * Copy SAP from ill.
2609  */
2610 static void
2611 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2612 {
2613 	ill_t	*ill = ncec->ncec_ill;
2614 
2615 	ASSERT(ll_addr != NULL);
2616 	if (ill->ill_phys_addr_length > 0) {
2617 		/*
2618 		 * The bcopy() below used to be called for the physical address
2619 		 * length rather than the link layer address length. For
2620 		 * ethernet and many other media, the phys_addr and lla are
2621 		 * identical.
2622 		 *
2623 		 * The phys_addr and lla may not be the same for devices that
2624 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2625 		 * no known instances of these.
2626 		 *
2627 		 * For PPP or other interfaces with a zero length
2628 		 * physical address, don't do anything here.
2629 		 * The bcopy() with a zero phys_addr length was previously
2630 		 * a no-op for interfaces with a zero-length physical address.
2631 		 * Using the lla for them would change the way they operate.
2632 		 * Doing nothing in such cases preserves expected behavior.
2633 		 */
2634 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2635 	}
2636 }
2637 
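/*
 * Compare ll_addr against the link layer address of ncec.  Returns B_TRUE
 * if a non-NULL ll_addr was supplied and it differs from ncec_lladdr;
 * returns B_FALSE if the addresses match or if ll_addr is NULL.
 */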
2638 boolean_t
2639 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2640     uint32_t ll_addr_len)
2641 {
2642 	ASSERT(ncec->ncec_lladdr != NULL);
2643 	if (ll_addr == NULL)
2644 		return (B_FALSE);
2645 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2646 		return (B_TRUE);
2647 	return (B_FALSE);
2648 }
2649 
2650 /*
2651  * Updates the link layer address or the reachability state of
2652  * a cache entry.  Reset probe counter if needed.
2653  */
2654 void
2655 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2656 {
2657 	ill_t	*ill = ncec->ncec_ill;
2658 	boolean_t need_stop_timer = B_FALSE;
2659 	boolean_t need_fastpath_update = B_FALSE;
2660 	nce_t	*nce = NULL;
2661 	timeout_id_t tid;
2662 
2663 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2664 	/*
2665 	 * If this interface does not do NUD, there is no point
2666 	 * in allowing an update to the cache entry, although
2667 	 * we will still respond to NS.
2668 	 * The only time we accept an update for a resolver when
2669 	 * NUD is turned off is when it has just been created.
2670 	 * Non-Resolvers will always be created as REACHABLE.
2671 	 */
2672 	if (new_state != ND_UNCHANGED) {
2673 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2674 		    (ncec->ncec_state != ND_INCOMPLETE))
2675 			return;
2676 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2677 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2678 		need_stop_timer = B_TRUE;
2679 		if (new_state == ND_REACHABLE)
2680 			ncec->ncec_last = TICK_TO_MSEC(lbolt64);
2681 		else {
2682 			/* We force NUD in this case */
2683 			ncec->ncec_last = 0;
2684 		}
2685 		ncec->ncec_state = new_state;
2686 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2687 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2688 		    new_state == ND_INCOMPLETE);
2689 	}
2690 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2691 		tid = ncec->ncec_timeout_id;
2692 		ncec->ncec_timeout_id = 0;
2693 	}
2694 	/*
2695 	 * Re-trigger the fastpath probe and
2696 	 * overwrite the DL_UNITDATA_REQ data, noting that we'll lose
2697 	 * whatever packet happens to be in transmission at the time.
2698 	 */
2699 	if (new_ll_addr != NULL) {
2700 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2701 		    ill->ill_phys_addr_length);
2702 		need_fastpath_update = B_TRUE;
2703 	}
2704 	mutex_exit(&ncec->ncec_lock);
2705 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2706 		if (tid != 0)
2707 			(void) untimeout(tid);
2708 	}
2709 	if (need_fastpath_update) {
2710 		/*
2711 		 * Delete any existing dlur_mp and fp_mp information.
2712 		 * For IPMP interfaces, all underlying ill's must be checked
2713 		 * and purged.
2714 		 */
2715 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2716 		/*
2717 		 * add the new dlur_mp and fp_mp
2718 		 */
2719 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2720 		if (nce != NULL)
2721 			nce_refrele(nce);
2722 	}
2723 	mutex_enter(&ncec->ncec_lock);
2724 }
2725 
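/*
 * Common queuing logic shared by nce_queue_mp(): drop packets from the
 * head of ncec_qd_mp once the queue exceeds ill_max_buf, then insert the
 * new packet at the head (IPMP probes, tracked via ncec_nprobes) or at
 * the tail of the queue.
 */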
2726 static void
2727 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2728 {
2729 	uint_t	count = 0;
2730 	mblk_t  **mpp, *tmp;
2731 
2732 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2733 
2734 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2735 		if (++count > ncec->ncec_ill->ill_max_buf) {
2736 			tmp = ncec->ncec_qd_mp->b_next;
2737 			ncec->ncec_qd_mp->b_next = NULL;
2738 			/*
2739 			 * if we never create data addrs on the under_ill
2740 			 * does this matter?
2741 			 */
2742 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2743 			    ipIfStatsOutDiscards);
2744 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2745 			    ncec->ncec_ill);
2746 			freemsg(ncec->ncec_qd_mp);
2747 			ncec->ncec_qd_mp = tmp;
2748 		}
2749 	}
2750 
2751 	if (head_insert) {
2752 		ncec->ncec_nprobes++;
2753 		mp->b_next = ncec->ncec_qd_mp;
2754 		ncec->ncec_qd_mp = mp;
2755 	} else {
2756 		*mpp = mp;
2757 	}
2758 }
2759 
2760 /*
2761  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2762  * queued at the head or tail of the queue based on the input argument
2763  * 'head_insert'. The caller should specify this argument as B_TRUE if this
2764  * packet is an IPMP probe packet, in which case the following happens:
2765  *
2766  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
2767  *	(non-ipmp_probe) load-spreading case where the source address of the ND
2768  *	packet is not tied to ncec_ill. If the ill bound to the source address
2769  *	cannot receive, the response to the ND packet will not be received.
2770  *	However, if ND packets for ncec_ill's probes are queued behind that ND
2771  *	packet, those probes will also fail to be sent, and thus in.mpathd will
2772  *	erroneously conclude that ncec_ill has also failed.
2773  *
2774  *   2. Drop the ipmp_probe packet in nce_timer() if the ND did not succeed on
2775  *	the first attempt.  This ensures that ND problems do not manifest as
2776  *	probe RTT spikes.
2777  *
2778  * We achieve this by inserting ipmp_probe() packets at the head of the
2779  * nce_queue.
2780  *
2781  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2782  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2783  */
2784 void
2785 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2786 {
2787 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2788 	nce_queue_mp_common(ncec, mp, head_insert);
2789 }
2790 
2791 /*
2792  * Called when address resolution failed due to a timeout.
2793  * Send an ICMP unreachable in response to all queued packets.
2794  */
2795 void
2796 ndp_resolv_failed(ncec_t *ncec)
2797 {
2798 	mblk_t	*mp, *nxt_mp;
2799 	char	buf[INET6_ADDRSTRLEN];
2800 	ill_t *ill = ncec->ncec_ill;
2801 	ip_recv_attr_t	iras;
2802 
2803 	bzero(&iras, sizeof (iras));
2804 	iras.ira_flags = 0;
2805 	/*
2806 	 * We are setting the ira_rill to the ipmp_ill (instead of
2807 	 * the actual ill on which the packet was received), but this
2808 	 * is ok because we don't actually need the real ira_rill
2809 	 * to send the icmp unreachable to the sender.
2810 	 */
2811 	iras.ira_ill = iras.ira_rill = ill;
2812 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2813 	iras.ira_rifindex = iras.ira_ruifindex;
2814 
2815 	ip1dbg(("ndp_resolv_failed: dst %s\n",
2816 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2817 	mutex_enter(&ncec->ncec_lock);
2818 	mp = ncec->ncec_qd_mp;
2819 	ncec->ncec_qd_mp = NULL;
2820 	ncec->ncec_nprobes = 0;
2821 	mutex_exit(&ncec->ncec_lock);
2822 	while (mp != NULL) {
2823 		nxt_mp = mp->b_next;
2824 		mp->b_next = NULL;
2825 
2826 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2827 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2828 		    mp, ill);
2829 		icmp_unreachable_v6(mp,
2830 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2831 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2832 		mp = nxt_mp;
2833 	}
2834 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2835 }
2836 
2837 /*
2838  * Handle the completion of NDP and ARP resolution.
2839  */
2840 void
2841 nce_resolv_ok(ncec_t *ncec)
2842 {
2843 	mblk_t *mp;
2844 	uint_t pkt_len;
2845 	iaflags_t ixaflags = IXAF_NO_TRACE;
2846 	nce_t *nce;
2847 	ill_t	*ill = ncec->ncec_ill;
2848 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2849 	ip_stack_t *ipst = ill->ill_ipst;
2850 
2851 	if (IS_IPMP(ncec->ncec_ill)) {
2852 		nce_resolv_ipmp_ok(ncec);
2853 		return;
2854 	}
2855 	/* non IPMP case */
2856 
2857 	mutex_enter(&ncec->ncec_lock);
2858 	ASSERT(ncec->ncec_nprobes == 0);
2859 	mp = ncec->ncec_qd_mp;
2860 	ncec->ncec_qd_mp = NULL;
2861 	mutex_exit(&ncec->ncec_lock);
2862 
2863 	while (mp != NULL) {
2864 		mblk_t *nxt_mp;
2865 
2866 		if (ill->ill_isv6) {
2867 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2868 
2869 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2870 		} else {
2871 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
2872 
2873 			ixaflags |= IXAF_IS_IPV4;
2874 			pkt_len = ntohs(ipha->ipha_length);
2875 		}
2876 		nxt_mp = mp->b_next;
2877 		mp->b_next = NULL;
2878 		/*
2879 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2880 		 * longer available, but it's ok to drop this flag because TCP
2881 		 * has its own flow control, so TCP packets are not likely
2882 		 * to get here when device flow control is in effect.
2883 		 */
2884 		mutex_enter(&ill->ill_lock);
2885 		nce = nce_lookup(ill, &ncec->ncec_addr);
2886 		mutex_exit(&ill->ill_lock);
2887 
2888 		if (nce == NULL) {
2889 			if (isv6) {
2890 				BUMP_MIB(&ipst->ips_ip6_mib,
2891 				    ipIfStatsOutDiscards);
2892 			} else {
2893 				BUMP_MIB(&ipst->ips_ip_mib,
2894 				    ipIfStatsOutDiscards);
2895 			}
2896 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2897 			    mp, NULL);
2898 			freemsg(mp);
2899 		} else {
2900 			/*
2901 			 * We don't know the zoneid, but
2902 			 * ip_xmit does not care since IXAF_NO_TRACE
2903 			 * is set. (We traced the packet the first
2904 			 * time through ip_xmit.)
2905 			 */
2906 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2907 			    ALL_ZONES, 0, NULL);
2908 			nce_refrele(nce);
2909 		}
2910 		mp = nxt_mp;
2911 	}
2912 
2913 	ncec_cb_dispatch(ncec); /* complete callbacks */
2914 }
2915 
2916 /*
2917  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2918  * and the corresponding attributes.
2919  * Disallow states other than ND_REACHABLE or ND_STALE.
2920  */
2921 int
2922 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2923 {
2924 	sin6_t		*sin6;
2925 	in6_addr_t	*addr;
2926 	ncec_t		*ncec;
2927 	nce_t		*nce;
2928 	int		err = 0;
2929 	uint16_t	new_flags = 0;
2930 	uint16_t	old_flags = 0;
2931 	int		inflags = lnr->lnr_flags;
2932 	ip_stack_t	*ipst = ill->ill_ipst;
2933 	boolean_t	do_postprocess = B_FALSE;
2934 
2935 	ASSERT(ill->ill_isv6);
2936 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2937 	    (lnr->lnr_state_create != ND_STALE))
2938 		return (EINVAL);
2939 
2940 	sin6 = (sin6_t *)&lnr->lnr_addr;
2941 	addr = &sin6->sin6_addr;
2942 
2943 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2944 	ASSERT(!IS_UNDER_IPMP(ill));
2945 	nce = nce_lookup_addr(ill, addr);
2946 	if (nce != NULL)
2947 		new_flags = nce->nce_common->ncec_flags;
2948 
2949 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2950 	case NDF_ISROUTER_ON:
2951 		new_flags |= NCE_F_ISROUTER;
2952 		break;
2953 	case NDF_ISROUTER_OFF:
2954 		new_flags &= ~NCE_F_ISROUTER;
2955 		break;
2956 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2957 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2958 		if (nce != NULL)
2959 			nce_refrele(nce);
2960 		return (EINVAL);
2961 	}
2962 
2963 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2964 	case NDF_ANYCAST_ON:
2965 		new_flags |= NCE_F_ANYCAST;
2966 		break;
2967 	case NDF_ANYCAST_OFF:
2968 		new_flags &= ~NCE_F_ANYCAST;
2969 		break;
2970 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2971 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2972 		if (nce != NULL)
2973 			nce_refrele(nce);
2974 		return (EINVAL);
2975 	}
2976 
2977 	if (nce == NULL) {
2978 		err = nce_add_v6(ill,
2979 		    (uchar_t *)lnr->lnr_hdw_addr,
2980 		    ill->ill_phys_addr_length,
2981 		    addr,
2982 		    new_flags,
2983 		    lnr->lnr_state_create,
2984 		    &nce);
2985 		if (err != 0) {
2986 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2987 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
2988 			return (err);
2989 		} else {
2990 			do_postprocess = B_TRUE;
2991 		}
2992 	}
2993 	ncec = nce->nce_common;
2994 	old_flags = ncec->ncec_flags;
2995 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
2996 		ncec_router_to_host(ncec);
2997 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2998 		if (do_postprocess)
2999 			err = nce_add_v6_postprocess(nce);
3000 		nce_refrele(nce);
3001 		return (0);
3002 	}
3003 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3004 
3005 	if (do_postprocess)
3006 		err = nce_add_v6_postprocess(nce);
3007 	/*
3008 	 * err cannot be anything other than 0 because we don't support
3009 	 * proxy arp of static addresses.
3010 	 */
3011 	ASSERT(err == 0);
3012 
3013 	mutex_enter(&ncec->ncec_lock);
3014 	ncec->ncec_flags = new_flags;
3015 	mutex_exit(&ncec->ncec_lock);
3016 	/*
3017 	 * Note that we ignore the state at this point, which
3018 	 * should be either STALE or REACHABLE.  Instead we let
3019 	 * the link layer address passed in determine the state,
3020 	 * much as we do for incoming packets.
3021 	 */
3022 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3023 	nce_refrele(nce);
3024 	return (0);
3025 }
3026 
3027 /*
3028  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3029  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3030  * be held to ensure that they are in the same group.
3031  */
3032 static nce_t *
3033 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3034 {
3035 
3036 	nce_t *nce;
3037 
3038 	nce = nce_ill_lookup_then_add(ill, ncec);
3039 
3040 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3041 		return (nce);
3042 
3043 	/*
3044 	 * hold the ncec_lock to synchronize with nce_update() so that,
3045 	 * at the end of this function, the contents of nce_dlur_mp are
3046 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3047 	 * packet may have been sent out with a mangled address, which would
3048 	 * only be a transient condition.
3049 	 */
3050 	mutex_enter(&ncec->ncec_lock);
3051 	if (ncec->ncec_lladdr != NULL) {
3052 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3053 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3054 	} else {
3055 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3056 		    ill->ill_sap_length);
3057 	}
3058 	mutex_exit(&ncec->ncec_lock);
3059 	return (nce);
3060 }
3061 
3062 /*
3063  * We make nce_fp_mp have an M_DATA prepend.
3064  * The caller ensures there is a hold on the ncec for this function.
3065  * Note that since ill_fastpath_probe() copies the mblk there is
3066  * no need to hold the nce or ncec beyond this function.
3067  *
3068  * If the caller has passed a non-null ncec_nce to nce_fastpath(), that
3069  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3070  * and will be returned back by this function, so that no extra nce_refrele
3071  * is required for the caller. The calls from nce_add_common() use this
3072  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3073  * nce_refrele of the returned nce (when it is non-null).
3074  */
3075 nce_t *
3076 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3077 {
3078 	nce_t *nce;
3079 	ill_t *ill = ncec->ncec_ill;
3080 
3081 	ASSERT(ill != NULL);
3082 
3083 	if (IS_IPMP(ill) && trigger_fp_req) {
3084 		trigger_fp_req = B_FALSE;
3085 		ipmp_ncec_fastpath(ncec, ill);
3087 	}
3088 	/*
3089 	 * If the caller already has the nce corresponding to the ill, use
3090 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3091 	 * nce_add_common() fall in the former category, and have just done
3092 	 * the nce lookup/add that can be reused.
3093 	 */
3094 	if (ncec_nce == NULL)
3095 		nce = nce_fastpath_create(ill, ncec);
3096 	else
3097 		nce = ncec_nce;
3098 
3099 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3100 		return (nce);
3101 
3102 	if (trigger_fp_req)
3103 		nce_fastpath_trigger(nce);
3104 	return (nce);
3105 }
3106 
3107 /*
3108  * Trigger fastpath on nce. No locks may be held.
3109  */
3110 static void
3111 nce_fastpath_trigger(nce_t *nce)
3112 {
3113 	int res;
3114 	ill_t *ill = nce->nce_ill;
3115 	ncec_t *ncec = nce->nce_common;
3116 
3117 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3118 	/*
3119 	 * EAGAIN is an indication of a transient error, e.g. an
3120 	 * allocation failure, so leave the ncec in the list; it
3121 	 * will be updated when another probe happens for another ire.
3122 	 * If not, it will be taken out of the list when the ire is
3123 	 * deleted.
3124 	 */
3125 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3126 		nce_fastpath_list_delete(ill, ncec, NULL);
3127 }
3128 
3129 /*
3130  * Add ncec to the nce fastpath list on ill.
3131  */
3132 static nce_t *
3133 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3134 {
3135 	nce_t *nce = NULL;
3136 
3137 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3138 	/*
3139 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3140 	 * down, before adding the NCE.
3141 	 */
3142 	if (ill->ill_state_flags & ILL_CONDEMNED)
3143 		return (NULL);
3144 	mutex_enter(&ncec->ncec_lock);
3145 	/*
3146 	 * If the ncec has not been deleted and
3147 	 * is not already in the list, add it.
3148 	 */
3149 	if (!NCE_ISCONDEMNED(ncec)) {
3150 		nce = nce_lookup(ill, &ncec->ncec_addr);
3151 		if (nce != NULL)
3152 			goto done;
3153 		nce = nce_add(ill, ncec);
3154 	}
3155 done:
3156 	mutex_exit(&ncec->ncec_lock);
3157 	return (nce);
3158 }
3159 
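/*
 * As above, but grab ill_lock on behalf of the caller.
 */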
3160 nce_t *
3161 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3162 {
3163 	nce_t *nce;
3164 
3165 	mutex_enter(&ill->ill_lock);
3166 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
3167 	mutex_exit(&ill->ill_lock);
3168 	return (nce);
3169 }
3170 
3171 
3172 /*
3173  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3174  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3175  * entry after all locks have been dropped.
3176  */
3177 void
3178 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3179 {
3180 	nce_t *nce;
3181 
3182 	ASSERT(ill != NULL);
3183 
3184 	/* first clean out any nce pointers in the under_ills */
3185 	if (IS_IPMP(ill))
3186 		ipmp_ncec_flush_nce(ncec);
3187 
3188 	/* now the ill itself */
3189 	mutex_enter(&ill->ill_lock);
3190 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3191 	    nce = list_next(&ill->ill_nce, nce)) {
3192 		if (nce->nce_common == ncec) {
3193 			nce_refhold(nce);
3194 			nce_delete(nce);
3195 			break;
3196 		}
3197 	}
3198 	mutex_exit(&ill->ill_lock);
3199 	if (nce != NULL) {
3200 		if (dead == NULL)
3201 			nce_refrele(nce);
3202 		else
3203 			list_insert_tail(dead, nce);
3204 	}
3205 }
3206 
3207 /*
3208  * when the fastpath response does not fit in the datab
3209  * associated with the existing nce_fp_mp, we delete and
3210  * add the nce to retrigger fastpath based on the information
3211  * in the ncec_t.
3212  */
3213 static nce_t *
3214 nce_delete_then_add(nce_t *nce)
3215 {
3216 	ill_t		*ill = nce->nce_ill;
3217 	nce_t		*newnce = NULL;
3218 
3219 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3220 	    (void *)nce, ill->ill_name));
3221 	mutex_enter(&ill->ill_lock);
3222 	mutex_enter(&nce->nce_common->ncec_lock);
3223 	nce_delete(nce);
3224 	/*
3225 	 * Make sure that ncec is not condemned before adding. We hold the
3226 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3227 	 * ipmp_ncec_flush_nce()
3228 	 */
3229 	if (!NCE_ISCONDEMNED(nce->nce_common))
3230 		newnce = nce_add(ill, nce->nce_common);
3231 	mutex_exit(&nce->nce_common->ncec_lock);
3232 	mutex_exit(&ill->ill_lock);
3233 	nce_refrele(nce);
3234 	return (newnce); /* could be null if nomem */
3235 }
3236 
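/* Walker argument/result for nce_fastpath_match_dlur(). */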
3237 typedef struct nce_fp_match_s {
3238 	nce_t	*nce_fp_match_res;
3239 	mblk_t	*nce_fp_match_ack_mp;
3240 } nce_fp_match_t;
3241 
3242 /* ARGSUSED */
3243 static int
3244 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3245 {
3246 	nce_fp_match_t	*nce_fp_marg = arg;
3247 	ncec_t		*ncec = nce->nce_common;
3248 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3249 	uchar_t	*mp_rptr, *ud_mp_rptr;
3250 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3251 	ptrdiff_t	cmplen;
3252 
3253 	/*
3254 	 * mp is the mp associated with the fastpath ack.
3255 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3256 	 * under consideration. If the contents match, then the
3257 	 * fastpath ack is used to update the nce.
3258 	 */
3259 	if (ud_mp == NULL)
3260 		return (0); /* MH_WALK_CONTINUE */
3261 	mp_rptr = mp->b_rptr;
3262 	cmplen = mp->b_wptr - mp_rptr;
3263 	ASSERT(cmplen >= 0);
3264 
3265 	ud_mp_rptr = ud_mp->b_rptr;
3266 	/*
3267 	 * The ncec is locked here to prevent any other threads from accessing
3268 	 * and changing nce_dlur_mp when the address becomes resolved to an
3269 	 * lla while we're in the middle of looking at and comparing the
3270 	 * hardware address (lla). It is also locked to prevent multiple
3271 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3272 	 * time.
3273 	 */
3274 	mutex_enter(&ncec->ncec_lock);
3275 	if (ud_mp->b_wptr - ud_mp_rptr == cmplen &&
3276 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3277 		nce_fp_marg->nce_fp_match_res = nce;
3278 		mutex_exit(&ncec->ncec_lock);
3279 		nce_refhold(nce);
3280 		return (1); /* MH_WALK_TERMINATE */
3281 	}
3282 	mutex_exit(&ncec->ncec_lock);
3283 	return (0); /* MH_WALK_CONTINUE */
3284 }
3285 
3286 /*
3287  * Find the nce on ill whose nce_dlur_mp matches the DL_UNITDATA_REQ
3288  * carried in the fastpath ack mp, and install mp->b_cont (the
3289  * fastpath header) as its nce_fp_mp.
3290  *
3291  * The walk terminates at the first matching nce.
3292  */
3293 void
3294 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3295 {
3296 	nce_fp_match_t nce_fp_marg;
3297 	nce_t *nce;
3298 	mblk_t *nce_fp_mp, *fp_mp;
3299 
3300 	nce_fp_marg.nce_fp_match_res = NULL;
3301 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3302 
3303 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3304 
3305 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3306 		return;
3307 
3308 	mutex_enter(&nce->nce_lock);
3309 	nce_fp_mp = nce->nce_fp_mp;
3310 
3311 	if (nce_fp_mp != NULL) {
3312 		fp_mp = mp->b_cont;
3313 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3314 		    nce_fp_mp->b_datap->db_lim) {
3315 			mutex_exit(&nce->nce_lock);
3316 			nce = nce_delete_then_add(nce);
3317 			if (nce == NULL) {
3318 				return;
3319 			}
3320 			mutex_enter(&nce->nce_lock);
3321 			nce_fp_mp = nce->nce_fp_mp;
3322 		}
3323 	}
3324 
3325 	/* Matched - install mp as the fastpath mp */
3326 	if (nce_fp_mp == NULL) {
3327 		fp_mp = dupb(mp->b_cont);
3328 		nce->nce_fp_mp = fp_mp;
3329 	} else {
3330 		fp_mp = mp->b_cont;
3331 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3332 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3333 		    + MBLKL(fp_mp);
3334 	}
3335 	mutex_exit(&nce->nce_lock);
3336 	nce_refrele(nce);
3337 }
3338 
3339 /*
3340  * Return a pointer to a given option in the packet.
3341  * Assumes that the option part of the packet has already been validated.
3342  */
3343 nd_opt_hdr_t *
3344 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3345 {
3346 	while (optlen > 0) {
3347 		if (opt->nd_opt_type == opt_type)
3348 			return (opt);
3349 		optlen -= 8 * opt->nd_opt_len;
3350 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3351 	}
3352 	return (NULL);
3353 }
3354 
3355 /*
3356  * Verify that all option lengths present are > 0; also check that
3357  * the option lengths and packet length are consistent.
3358  */
3359 boolean_t
3360 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3361 {
3362 	ASSERT(opt != NULL);
3363 	while (optlen > 0) {
3364 		if (opt->nd_opt_len == 0)
3365 			return (B_FALSE);
3366 		optlen -= 8 * opt->nd_opt_len;
3367 		if (optlen < 0)
3368 			return (B_FALSE);
3369 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3370 	}
3371 	return (B_TRUE);
3372 }
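
/*
 * Illustrative sketch only (not called anywhere): how the two helpers
 * above are typically combined by the NS/NA input paths.  Here 'opt' is
 * assumed to point just past the fixed ND header and 'optlen' is the
 * number of option bytes remaining; nd_opt_len is in units of 8 octets.
 *
 *	if (optlen > 0 && ndp_verify_optlen(opt, optlen)) {
 *		nd_opt_hdr_t *slla;
 *
 *		slla = ndp_get_option(opt, optlen, ND_OPT_SOURCE_LINKADDR);
 *		if (slla != NULL) {
 *			uchar_t *haddr = (uchar_t *)&slla[1];
 *			uint_t hlen = slla->nd_opt_len * 8 - sizeof (*slla);
 *			...
 *		}
 *	}
 */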
3373 
3374 /*
3375  * ncec_walk function.
3376  * Free a fraction of the NCE cache entries.
3377  *
3378  * A possible optimization here would be to use ncec_last where possible, and
3379  * delete the least-frequently used entry, which would require more complex
3380  * computation as we walk through the ncec's (e.g., track ncec entries by
3381  * order of ncec_last and/or maintain state)
3382  */
3383 static void
3384 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3385 {
3386 	ip_stack_t	*ipst = ncec->ncec_ipst;
3387 	uint_t		fraction = *(uint_t *)arg;
3388 	uint_t		rand;
3389 
3390 	if ((ncec->ncec_flags &
3391 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3392 		return;
3393 	}
3394 
3395 	rand = (uint_t)lbolt +
3396 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3397 	if ((rand/fraction)*fraction == rand) {
3398 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3399 		ncec_delete(ncec);
3400 	}
3401 }
3402 
3403 /*
3404  * kmem_cache callback to free up memory.
3405  *
3406  * For now we just delete a fixed fraction.
3407  */
3408 static void
3409 ip_nce_reclaim_stack(ip_stack_t *ipst)
3410 {
3411 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3412 
3413 	IP_STAT(ipst, ip_nce_reclaim_calls);
3414 
3415 	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3416 
3417 	/*
3418 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3419 	 * Get them to update any stale references to drop any refholds they
3420 	 * have.
3421 	 */
3422 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3423 }
3424 
3425 /*
3426  * Called by the memory allocator subsystem directly, when the system
3427  * is running low on memory.
3428  */
3429 /* ARGSUSED */
3430 void
3431 ip_nce_reclaim(void *args)
3432 {
3433 	netstack_handle_t nh;
3434 	netstack_t *ns;
3435 
3436 	netstack_next_init(&nh);
3437 	while ((ns = netstack_next(&nh)) != NULL) {
3438 		ip_nce_reclaim_stack(ns->netstack_ip);
3439 		netstack_rele(ns);
3440 	}
3441 	netstack_next_fini(&nh);
3442 }
3443 
3444 #ifdef DEBUG
3445 void
3446 ncec_trace_ref(ncec_t *ncec)
3447 {
3448 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3449 
3450 	if (ncec->ncec_trace_disable)
3451 		return;
3452 
3453 	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3454 		ncec->ncec_trace_disable = B_TRUE;
3455 		ncec_trace_cleanup(ncec);
3456 	}
3457 }
3458 
3459 void
3460 ncec_untrace_ref(ncec_t *ncec)
3461 {
3462 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3463 
3464 	if (!ncec->ncec_trace_disable)
3465 		th_trace_unref(ncec);
3466 }
3467 
3468 static void
3469 ncec_trace_cleanup(const ncec_t *ncec)
3470 {
3471 	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3472 }
3473 #endif
3474 
3475 /*
3476  * Called when address resolution fails due to a timeout.
3477  * Send an ICMP unreachable in response to all queued packets.
3478  */
3479 void
3480 arp_resolv_failed(ncec_t *ncec)
3481 {
3482 	mblk_t	*mp, *nxt_mp;
3483 	char	buf[INET6_ADDRSTRLEN];
3484 	struct in_addr ipv4addr;
3485 	ill_t *ill = ncec->ncec_ill;
3486 	ip_stack_t *ipst = ncec->ncec_ipst;
3487 	ip_recv_attr_t	iras;
3488 
3489 	bzero(&iras, sizeof (iras));
3490 	iras.ira_flags = IRAF_IS_IPV4;
3491 	/*
3492 	 * We are setting the ira_rill to the ipmp_ill (instead of
3493 	 * the actual ill on which the packet was received), but this
3494 	 * is ok because we don't actually need the real ira_rill
3495 	 * to send the icmp unreachable to the sender.
3496 	 */
3497 	iras.ira_ill = iras.ira_rill = ill;
3498 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3499 	iras.ira_rifindex = iras.ira_ruifindex;
3500 
3501 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3502 	ip3dbg(("arp_resolv_failed: dst %s\n",
3503 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3504 	mutex_enter(&ncec->ncec_lock);
3505 	mp = ncec->ncec_qd_mp;
3506 	ncec->ncec_qd_mp = NULL;
3507 	ncec->ncec_nprobes = 0;
3508 	mutex_exit(&ncec->ncec_lock);
3509 	while (mp != NULL) {
3510 		nxt_mp = mp->b_next;
3511 		mp->b_next = NULL;
3512 
3513 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3514 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3515 		    mp, ill);
3516 		if (ipst->ips_ip_arp_icmp_error) {
3517 			ip3dbg(("arp_resolv_failed: "
3518 			    "Calling icmp_unreachable\n"));
3519 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3520 		} else {
3521 			freemsg(mp);
3522 		}
3523 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3524 		mp = nxt_mp;
3525 	}
3526 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3527 }
3528 
3529 /*
3530  * if ill is an under_ill, translate it to the ipmp_ill and add the
3531  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3532  * one on the underlying in_ill) will be created for the
3533  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3534  */
3535 int
3536 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3537     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3538 {
3539 	int	err;
3540 	in6_addr_t addr6;
3541 	ip_stack_t *ipst = ill->ill_ipst;
3542 	nce_t	*nce, *upper_nce = NULL;
3543 	ill_t	*in_ill = ill, *under = NULL;
3544 	boolean_t need_ill_refrele = B_FALSE;
3545 
3546 	if (flags & NCE_F_MCAST) {
3547 		/*
3548 		 * hw_addr will be figured out in nce_set_multicast_v4;
3549 		 * caller needs to pass in the cast_ill for ipmp
3550 		 */
3551 		ASSERT(hw_addr == NULL);
3552 		ASSERT(!IS_IPMP(ill));
3553 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3554 		return (err);
3555 	}
3556 
3557 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3558 		ill = ipmp_ill_hold_ipmp_ill(ill);
3559 		if (ill == NULL)
3560 			return (ENXIO);
3561 		need_ill_refrele = B_TRUE;
3562 	}
3563 	if ((flags & NCE_F_BCAST) != 0) {
3564 		/*
3565 		 * IPv4 broadcast ncec: compute the hwaddr.
3566 		 */
3567 		if (IS_IPMP(ill)) {
3568 			under = ipmp_ill_get_xmit_ill(ill, B_FALSE);
3569 			if (under == NULL)  {
3570 				if (need_ill_refrele)
3571 					ill_refrele(ill);
3572 				return (ENETDOWN);
3573 			}
3574 			hw_addr = under->ill_bcast_mp->b_rptr +
3575 			    NCE_LL_ADDR_OFFSET(under);
3576 			hw_addr_len = under->ill_phys_addr_length;
3577 		} else {
3578 			hw_addr = ill->ill_bcast_mp->b_rptr +
3579 			    NCE_LL_ADDR_OFFSET(ill);
3580 			hw_addr_len = ill->ill_phys_addr_length;
3581 		}
3582 	}
3583 
3584 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3585 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3586 	nce = nce_lookup_addr(ill, &addr6);
3587 	if (nce == NULL) {
3588 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3589 		    state, &nce);
3590 	} else {
3591 		err = EEXIST;
3592 	}
3593 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3594 	if (err == 0)
3595 		err = nce_add_v4_postprocess(nce);
3596 
3597 	if (in_ill != ill && nce != NULL) {
3598 		nce_t *under_nce;
3599 		nce_t *under_nce = NULL;
3600 		/*
3601 		 * in_ill was the under_ill. Try to create the under_nce.
3602 		 * Hold the ill_g_lock to prevent changes to group membership
3603 		 * until we are done.
3604 		 */
3605 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3606 		if (IS_IN_SAME_ILLGRP(in_ill, ill)) {
3607 			under_nce = nce_fastpath_create(in_ill,
3608 			    nce->nce_common);
3609 			upper_nce = nce;
3610 			if ((nce = under_nce) == NULL)
3611 				err = EINVAL;
3612 		}
3613 		rw_exit(&ipst->ips_ill_g_lock);
3614 		if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common))
3615 			nce_fastpath_trigger(under_nce);
3616 	}
3617 	if (nce != NULL) {
3618 		if (newnce != NULL)
3619 			*newnce = nce;
3620 		else
3621 			nce_refrele(nce);
3622 	}
3623 
3624 	if (under != NULL)
3625 		ill_refrele(under);
3626 
3627 	if (upper_nce != NULL)
3628 		nce_refrele(upper_nce);
3629 
3630 	if (need_ill_refrele)
3631 		ill_refrele(ill);
3632 
3633 	return (err);
3634 }
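
/*
 * Illustrative usage sketch (hypothetical caller, not part of the build):
 * a resolver-interface caller that just needs an ncec for an IPv4 next hop
 * could do something like
 *
 *	nce_t *nce;
 *	int err;
 *
 *	err = nce_lookup_then_add_v4(ill, NULL, 0, &nexthop, 0,
 *	    ND_UNCHANGED, &nce);
 *	if (err == 0 || err == EEXIST) {
 *		... use nce / nce->nce_common ...
 *		nce_refrele(nce);
 *	}
 *
 * On success (or EEXIST) the returned nce is held and must be released
 * with nce_refrele() when the caller is done with it.
 */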
3635 
3636 /*
3637  * NDP Cache Entry creation routine for IPv4.
3638  * Mapped entries are handled in arp.
3639  * This routine must always be called with ndp4->ndp_g_lock held.
3640  * Prior to return, ncec_refcnt is incremented.
3641  *
3642  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3643  * IPMP notes: ncec's for non-local (i.e., !NCE_MYADDR(ncec)) addresses
3644  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3645  * entries will be created, both pointing at the same ncec_t. The nce_t
3646  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3647  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3648  * Local addresses are always created on the ill passed to nce_add_v4.
3649  */
3650 int
3651 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3652     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3653 {
3654 	int		err;
3655 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3656 	struct in6_addr	addr6;
3657 	nce_t		*nce;
3658 
3659 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3660 	ASSERT(!ill->ill_isv6);
3661 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3662 
3663 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3664 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3665 	    &nce);
3666 	ASSERT(newnce != NULL);
3667 	*newnce = nce;
3668 	return (err);
3669 }
3670 
3671 /*
3672  * Post-processing routine to be executed after nce_add_v4(). This function
3673  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3674  * and must be called without any locks held.
3675  *
3676  * Always returns 0, but we return an int to keep this symmetric with the
3677  * IPv6 counterpart.
3678  */
3679 int
3680 nce_add_v4_postprocess(nce_t *nce)
3681 {
3682 	ncec_t		*ncec = nce->nce_common;
3683 	uint16_t	flags = ncec->ncec_flags;
3684 	boolean_t	ndp_need_dad = B_FALSE;
3685 	boolean_t	dropped;
3686 	clock_t		delay;
3687 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3688 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3689 	boolean_t	trigger_fastpath = B_TRUE;
3690 
3691 	/*
3692 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3693 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3694 	 * We call nce_fastpath from nce_update if the link layer address of
3695 	 * the peer changes.
3696 	 */
3697 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3698 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3699 		trigger_fastpath = B_FALSE;
3700 
3701 	if (trigger_fastpath)
3702 		nce_fastpath_trigger(nce);
3703 
3704 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3705 		/*
3706 		 * Either the caller (by passing in ND_PROBE)
3707 		 * or nce_add_common() (by the internally computed state
3708 		 * based on ncec_addr and ill_net_type) has determined
3709 		 * that this unicast entry needs DAD. Trigger DAD.
3710 		 */
3711 		ndp_need_dad = B_TRUE;
3712 	} else if (flags & NCE_F_UNSOL_ADV) {
3713 		/*
3714 		 * We account for the transmit below by assigning one
3715 		 * less than the ndd variable. Subsequent decrements
3716 		 * are done in nce_timer.
3717 		 */
3718 		mutex_enter(&ncec->ncec_lock);
3719 		ncec->ncec_unsolicit_count =
3720 		    ipst->ips_ip_arp_publish_count - 1;
3721 		mutex_exit(&ncec->ncec_lock);
3722 		dropped = arp_announce(ncec);
3723 		mutex_enter(&ncec->ncec_lock);
3724 		if (dropped)
3725 			ncec->ncec_unsolicit_count++;
3726 		else
3727 			ncec->ncec_last_time_defended = ddi_get_lbolt();
3728 		if (ncec->ncec_unsolicit_count != 0) {
3729 			nce_start_timer(ncec,
3730 			    ipst->ips_ip_arp_publish_interval);
3731 		}
3732 		mutex_exit(&ncec->ncec_lock);
3733 	}
3734 
3735 	/*
3736 	 * If ncec_xmit_interval is 0, user has configured us to send the first
3737 	 * probe right away.  Do so, and set up for the subsequent probes.
3738 	 */
3739 	if (ndp_need_dad) {
3740 		mutex_enter(&ncec->ncec_lock);
3741 		if (ncec->ncec_pcnt == 0) {
3742 			/*
3743 			 * DAD probes and announce can be
3744 			 * administratively disabled by setting the
3745 			 * probe_count to zero. Restart the timer in
3746 			 * this case to mark the ipif as ready.
3747 			 */
3748 			ncec->ncec_unsolicit_count = 0;
3749 			mutex_exit(&ncec->ncec_lock);
3750 			nce_restart_timer(ncec, 0);
3751 		} else {
3752 			mutex_exit(&ncec->ncec_lock);
3753 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3754 			    ipst->ips_arp_probe_delay :
3755 			    ipst->ips_arp_fastprobe_delay);
3756 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3757 		}
3758 	}
3759 	return (0);
3760 }
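
/*
 * Calling pattern for the two-phase add (as used by nce_lookup_then_add_v4()
 * and nce_set_multicast_v4() in this file): nce_add_v4() runs under
 * ndp_g_lock, and the postprocessing step runs after the lock is dropped:
 *
 *	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
 *	err = nce_add_v4(ill, hw_addr, hw_addr_len, &addr, flags,
 *	    ND_UNCHANGED, &nce);
 *	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
 *	if (err == 0)
 *		err = nce_add_v4_postprocess(nce);
 */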
3761 
3762 /*
3763  * ncec_walk routine to update all entries that have a given destination or
3764  * gateway address and cached link layer (MAC) address.  This is used when ARP
3765  * informs us that a network-to-link-layer mapping may have changed.
3766  */
3767 void
3768 nce_update_hw_changed(ncec_t *ncec, void *arg)
3769 {
3770 	nce_hw_map_t *hwm = arg;
3771 	ipaddr_t ncec_addr;
3772 
3773 	if (ncec->ncec_state != ND_REACHABLE)
3774 		return;
3775 
3776 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3777 	if (ncec_addr != hwm->hwm_addr)
3778 		return;
3779 
3780 	mutex_enter(&ncec->ncec_lock);
3781 	if (hwm->hwm_flags != 0)
3782 		ncec->ncec_flags = hwm->hwm_flags;
3783 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3784 	mutex_exit(&ncec->ncec_lock);
3785 }
3786 
3787 void
3788 ncec_refhold(ncec_t *ncec)
3789 {
3790 	mutex_enter(&(ncec)->ncec_lock);
3791 	(ncec)->ncec_refcnt++;
3792 	ASSERT((ncec)->ncec_refcnt != 0);
3793 #ifdef DEBUG
3794 	ncec_trace_ref(ncec);
3795 #endif
3796 	mutex_exit(&(ncec)->ncec_lock);
3797 }
3798 
3799 void
3800 ncec_refhold_notr(ncec_t *ncec)
3801 {
3802 	mutex_enter(&(ncec)->ncec_lock);
3803 	(ncec)->ncec_refcnt++;
3804 	ASSERT((ncec)->ncec_refcnt != 0);
3805 	mutex_exit(&(ncec)->ncec_lock);
3806 }
3807 
3808 static void
3809 ncec_refhold_locked(ncec_t *ncec)
3810 {
3811 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3812 	(ncec)->ncec_refcnt++;
3813 #ifdef DEBUG
3814 	ncec_trace_ref(ncec);
3815 #endif
3816 }
3817 
3818 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3819 void
3820 ncec_refrele(ncec_t *ncec)
3821 {
3822 	mutex_enter(&(ncec)->ncec_lock);
3823 #ifdef DEBUG
3824 	ncec_untrace_ref(ncec);
3825 #endif
3826 	ASSERT((ncec)->ncec_refcnt != 0);
3827 	if (--(ncec)->ncec_refcnt == 0) {
3828 		ncec_inactive(ncec);
3829 	} else {
3830 		mutex_exit(&(ncec)->ncec_lock);
3831 	}
3832 }
3833 
3834 void
3835 ncec_refrele_notr(ncec_t *ncec)
3836 {
3837 	mutex_enter(&(ncec)->ncec_lock);
3838 	ASSERT((ncec)->ncec_refcnt != 0);
3839 	if (--(ncec)->ncec_refcnt == 0) {
3840 		ncec_inactive(ncec);
3841 	} else {
3842 		mutex_exit(&(ncec)->ncec_lock);
3843 	}
3844 }
3845 
3846 /*
3847  * Common to IPv4 and IPv6.
3848  */
3849 void
3850 nce_restart_timer(ncec_t *ncec, uint_t ms)
3851 {
3852 	timeout_id_t tid;
3853 
3854 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3855 
3856 	/* First cancel any running timer */
3857 	mutex_enter(&ncec->ncec_lock);
3858 	tid = ncec->ncec_timeout_id;
3859 	ncec->ncec_timeout_id = 0;
3860 	if (tid != 0) {
3861 		mutex_exit(&ncec->ncec_lock);
3862 		(void) untimeout(tid);
3863 		mutex_enter(&ncec->ncec_lock);
3864 	}
3865 
3866 	/* Restart timer */
3867 	nce_start_timer(ncec, ms);
3868 	mutex_exit(&ncec->ncec_lock);
3869 }
3870 
3871 static void
3872 nce_start_timer(ncec_t *ncec, uint_t ms)
3873 {
3874 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3875 	/*
3876 	 * Don't start the timer if the ncec has been deleted, or if the timer
3877 	 * is already running
3878 	 */
3879 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
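		/* A sub-tick delay is rounded up to one tick. */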
3880 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3881 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3882 	}
3883 }
3884 
3885 int
3886 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3887     uint16_t flags, nce_t **newnce)
3888 {
3889 	uchar_t		*hw_addr;
3890 	int		err = 0;
3891 	ip_stack_t	*ipst = ill->ill_ipst;
3892 	in6_addr_t	dst6;
3893 	nce_t		*nce;
3894 
3895 	ASSERT(!ill->ill_isv6);
3896 
3897 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3898 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3899 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3900 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3901 		goto done;
3902 	}
3903 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
3904 		/*
3905 		 * For IRE_IF_RESOLVER a hardware mapping can be
3906 		 * generated; for IRE_IF_NORESOLVER, the resolution cookie
3907 		 * in the ill is copied in nce_add_v4().
3908 		 */
3909 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3910 		if (hw_addr == NULL) {
3911 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3912 			return (ENOMEM);
3913 		}
3914 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3915 	} else {
3916 		/*
3917 		 * IRE_IF_NORESOLVER type simply copies the resolution
3918 		 * cookie passed in.  So no hw_addr is needed.
3919 		 */
3920 		hw_addr = NULL;
3921 	}
3922 	ASSERT(flags & NCE_F_MCAST);
3923 	ASSERT(flags & NCE_F_NONUD);
3924 	/* nce_state will be computed by nce_add_common() */
3925 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3926 	    ND_UNCHANGED, &nce);
3927 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3928 	if (err == 0)
3929 		err = nce_add_v4_postprocess(nce);
3930 	if (hw_addr != NULL)
3931 		kmem_free(hw_addr, ill->ill_phys_addr_length);
3932 	if (err != 0) {
3933 		ip1dbg(("nce_set_multicast_v4: create failed %d\n", err));
3934 		return (err);
3935 	}
3936 done:
3937 	if (newnce != NULL)
3938 		*newnce = nce;
3939 	else
3940 		nce_refrele(nce);
3941 	return (0);
3942 }
3943 
3944 /*
3945  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3946  * This is used when scanning for "old" (least recently defended) NCEs.  We
3947  * batches at a time.
3948  */
3949 #define	NCE_RESCHED_LIST_LEN	8
3950 
3951 typedef struct {
3952 	ill_t	*ncert_ill;
3953 	uint_t	ncert_num;
3954 	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
3955 } nce_resched_t;
3956 
3957 /*
3958  * Pick the longest waiting NCEs for defense.
3959  */
3960 /* ARGSUSED */
3961 static int
3962 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
3963 {
3964 	nce_resched_t *ncert = arg;
3965 	ncec_t **ncecs;
3966 	ncec_t **ncec_max;
3967 	ncec_t *ncec_temp;
3968 	ncec_t *ncec = nce->nce_common;
3969 
3970 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
3971 	/*
3972 	 * Only reachable entries that are ready for announcement are eligible.
3973 	 */
3974 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
3975 		return (0);
3976 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
3977 		ncec_refhold(ncec);
3978 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
3979 	} else {
3980 		ncecs = ncert->ncert_nces;
3981 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
3982 		ncec_refhold(ncec);
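		/*
		 * The list is full: bubble the candidate through it.  Each
		 * slot whose entry was defended more recently than the
		 * current candidate is swapped, so the slots retain the
		 * longest-waiting entries and the single most recently
		 * defended ncec falls out and is released below.
		 */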
3983 		for (; ncecs < ncec_max; ncecs++) {
3984 			ASSERT(ncec != NULL);
3985 			if ((*ncecs)->ncec_last_time_defended >
3986 			    ncec->ncec_last_time_defended) {
3987 				ncec_temp = *ncecs;
3988 				*ncecs = ncec;
3989 				ncec = ncec_temp;
3990 			}
3991 		}
3992 		ncec_refrele(ncec);
3993 	}
3994 	return (0);
3995 }
3996 
3997 /*
3998  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
3999  * doesn't happen very often (if at all), and thus it needn't be highly
4000  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4001  * outer loop is bounded by a constant rather than by the length of the list.)
4002  */
4003 static void
4004 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4005 {
4006 	ncec_t		*ncec;
4007 	ip_stack_t	*ipst = ill->ill_ipst;
4008 	uint_t		i, defend_rate;
4009 
4010 	i = ill->ill_defend_count;
4011 	ill->ill_defend_count = 0;
4012 	if (ill->ill_isv6)
4013 		defend_rate = ipst->ips_ndp_defend_rate;
4014 	else
4015 		defend_rate = ipst->ips_arp_defend_rate;
4016 	/* If none could be sitting around, then don't reschedule */
4017 	if (i < defend_rate) {
4018 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4019 		return;
4020 	}
4021 	ncert->ncert_ill = ill;
4022 	while (ill->ill_defend_count < defend_rate) {
4023 		nce_walk_common(ill, ncec_reschedule, ncert);
4024 		for (i = 0; i < ncert->ncert_num; i++) {
4025 
4026 			ncec = ncert->ncert_nces[i];
4027 			mutex_enter(&ncec->ncec_lock);
4028 			ncec->ncec_flags |= NCE_F_DELAYED;
4029 			mutex_exit(&ncec->ncec_lock);
4030 			/*
4031 			 * we plan to schedule this ncec, so incr the
4032 			 * defend_count in anticipation.
4033 			 */
4034 			if (++ill->ill_defend_count >= defend_rate)
4035 				break;
4036 		}
4037 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4038 			break;
4039 	}
4040 }
4041 
4042 /*
4043  * Check if the current rate-limiting parameters permit the sending
4044  * of another address defense announcement for both IPv4 and IPv6.
4045  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4046  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4047  * determines how many address defense announcements are permitted
4048  * in any `defend_period' interval.
4049  */
4050 static boolean_t
4051 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4052 {
4053 	clock_t		now = ddi_get_lbolt();
4054 	ip_stack_t	*ipst = ill->ill_ipst;
4055 	clock_t		start = ill->ill_defend_start;
4056 	uint32_t	elapsed, defend_period, defend_rate;
4057 	nce_resched_t	ncert;
4058 	boolean_t	ret;
4059 	int		i;
4060 
4061 	if (ill->ill_isv6) {
4062 		defend_period = ipst->ips_ndp_defend_period;
4063 		defend_rate = ipst->ips_ndp_defend_rate;
4064 	} else {
4065 		defend_period = ipst->ips_arp_defend_period;
4066 		defend_rate = ipst->ips_arp_defend_rate;
4067 	}
4068 	if (defend_rate == 0)
4069 		return (B_TRUE);
4070 	bzero(&ncert, sizeof (ncert));
4071 	mutex_enter(&ill->ill_lock);
4072 	if (start > 0) {
4073 		elapsed = now - start;
4074 		if (elapsed > SEC_TO_TICK(defend_period)) {
4075 			ill->ill_defend_start = now;
4076 			/*
4077 			 * nce_ill_reschedule will attempt to
4078 			 * prevent starvation by rescheduling the
4079 			 * oldest entries, which are marked with
4080 			 * the NCE_F_DELAYED flag.
4081 			 */
4082 			nce_ill_reschedule(ill, &ncert);
4083 		}
4084 	} else {
4085 		ill->ill_defend_start = now;
4086 	}
4087 	ASSERT(ill->ill_defend_count <= defend_rate);
4088 	mutex_enter(&ncec->ncec_lock);
4089 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4090 		/*
4091 		 * This ncec was rescheduled as one of the really old
4092 		 * entries needing on-going defense. The
4093 		 * ill_defend_count was already incremented in
4094 		 * nce_ill_reschedule. Go ahead and send the announce.
4095 		 */
4096 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4097 		mutex_exit(&ncec->ncec_lock);
4098 		ret = B_FALSE;
4099 		goto done;
4100 	}
4101 	mutex_exit(&ncec->ncec_lock);
4102 	if (ill->ill_defend_count < defend_rate)
4103 		ill->ill_defend_count++;
4104 	if (ill->ill_defend_count == defend_rate) {
4105 		/*
4106 		 * we are no longer allowed to send unbidden defense
4107 		 * messages. Wait for rescheduling.
4108 		 */
4109 		ret = B_TRUE;
4110 	} else {
4111 		ret = B_FALSE;
4112 	}
4113 done:
4114 	mutex_exit(&ill->ill_lock);
4115 	/*
4116 	 * After all the locks have been dropped we can restart the nce timer,
4117 	 * and refrele the delayed ncecs.
4118 	 */
4119 	for (i = 0; i < ncert.ncert_num; i++) {
4120 		clock_t	xmit_interval;
4121 		ncec_t	*tmp;
4122 
4123 		tmp = ncert.ncert_nces[i];
4124 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4125 		    B_FALSE);
4126 		nce_restart_timer(tmp, xmit_interval);
4127 		ncec_refrele(tmp);
4128 	}
4129 	return (ret);
4130 }
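
/*
 * Worked example for the rate limiter above (hypothetical tunable values):
 * with arp_defend_rate set to 5 and arp_defend_period set to 300 seconds,
 * unsolicited defense announcements are permitted until ill_defend_count
 * reaches 5 within a 300 second window; after that ill_defend_rate_limit()
 * returns B_TRUE until the window expires, at which point
 * nce_ill_reschedule() resets the count and gives precedence to the
 * longest-waiting (NCE_F_DELAYED) entries.  A defend_rate of zero disables
 * defense announcements altogether.
 */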
4131 
4132 boolean_t
4133 ndp_announce(ncec_t *ncec)
4134 {
4135 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4136 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4137 	    nce_advert_flags(ncec)));
4138 }
4139 
4140 ill_t *
4141 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4142 {
4143 	mblk_t		*mp;
4144 	in6_addr_t	src6;
4145 	ipaddr_t	src4;
4146 	ill_t		*ill = ncec->ncec_ill;
4147 	ill_t		*src_ill = NULL;
4148 	ipif_t		*ipif = NULL;
4149 	boolean_t	is_myaddr = NCE_MYADDR(ncec);
4150 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4151 
4152 	ASSERT(src != NULL);
4153 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4154 	src6 = *src;
4155 	if (is_myaddr) {
4156 		src6 = ncec->ncec_addr;
4157 		if (!isv6)
4158 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4159 	} else {
4160 		/*
4161 		 * try to find one from the outgoing packet.
4162 		 */
4163 		mutex_enter(&ncec->ncec_lock);
4164 		mp = ncec->ncec_qd_mp;
4165 		if (mp != NULL) {
4166 			if (isv6) {
4167 				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
4168 
4169 				src6 = ip6h->ip6_src;
4170 			} else {
4171 				ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4172 
4173 				src4 = ipha->ipha_src;
4174 				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4175 			}
4176 		}
4177 		mutex_exit(&ncec->ncec_lock);
4178 	}
4179 
4180 	/*
4181 	 * For outgoing packets, if the src of outgoing packet is one
4182 	 * of the assigned interface addresses use it, otherwise we
4183 	 * will pick the source address below.
4184 	 * For local addresses (is_myaddr) doing DAD, NDP announce
4185 	 * messages are mcast. So we use the (IPMP) cast_ill or the
4186 	 * (non-IPMP) ncec_ill for these message types. The only case
4187 	 * of unicast DAD messages is for IPv6 ND probes, for which
4188 	 * we find the ipif_bound_ill corresponding to the ncec_addr.
4189 	 */
4190 	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4191 		if (isv6) {
4192 			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4193 			    ill->ill_ipst);
4194 		} else {
4195 			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4196 			    ill->ill_ipst);
4197 		}
4198 
4199 		/*
4200 		 * If no relevant ipif can be found, then it's not one of our
4201 		 * addresses.  Reset to :: and try to find a src for the NS or
4202 		 * ARP request using ipif_select_source_v[4,6]  below.
4203 		 * If an ipif can be found, but it's not yet done with
4204 		 * DAD verification, and we are not being invoked for
4205 		 * DAD (i.e., !is_myaddr), then just postpone this
4206 		 * transmission until later.
4207 		 */
4208 		if (ipif == NULL) {
4209 			src6 = ipv6_all_zeros;
4210 			src4 = INADDR_ANY;
4211 		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
4212 			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4213 			    ncec_t *, ncec, ipif_t *, ipif);
4214 			ipif_refrele(ipif);
4215 			return (NULL);
4216 		}
4217 	}
4218 
4219 	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4220 		/*
4221 		 * Pick a source address for this solicitation, but
4222 		 * restrict the selection to addresses assigned to the
4223 		 * output interface.  We do this because the destination will
4224 		 * create a neighbor cache entry for the source address of
4225 		 * this packet, so the source address had better be a valid
4226 		 * neighbor.
4227 		 */
4228 		if (isv6) {
4229 			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4230 			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4231 			    B_FALSE, NULL);
4232 		} else {
4233 			ipaddr_t nce_addr;
4234 
4235 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4236 			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4237 			    B_FALSE, NULL);
4238 		}
4239 		if (ipif == NULL && IS_IPMP(ill)) {
4240 			ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);
4241 
4242 			if (send_ill != NULL) {
4243 				if (isv6) {
4244 					ipif = ipif_select_source_v6(send_ill,
4245 					    &ncec->ncec_addr, B_TRUE,
4246 					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4247 					    B_FALSE, NULL);
4248 				} else {
4249 					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4250 					    src4);
4251 					ipif = ipif_select_source_v4(send_ill,
4252 					    src4, ALL_ZONES, B_TRUE, NULL);
4253 				}
4254 				ill_refrele(send_ill);
4255 			}
4256 		}
4257 
4258 		if (ipif == NULL) {
4259 			char buf[INET6_ADDRSTRLEN];
4260 
4261 			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4262 			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
4263 			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4264 			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4265 			return (NULL);
4266 		}
4267 		src6 = ipif->ipif_v6lcl_addr;
4268 	}
4269 	*src = src6;
4270 	if (ipif != NULL) {
4271 		src_ill = ipif->ipif_ill;
4272 		if (IS_IPMP(src_ill))
4273 			src_ill = ipmp_ipif_hold_bound_ill(ipif);
4274 		else
4275 			ill_refhold(src_ill);
4276 		ipif_refrele(ipif);
4277 		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4278 		    ill_t *, src_ill);
4279 	}
4280 	return (src_ill);
4281 }
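
/*
 * Illustrative usage sketch (hypothetical caller, not part of the build):
 * callers that need a source address for a probe or solicitation pass in
 * an unspecified address and get back a held ill on success:
 *
 *	in6_addr_t src = ipv6_all_zeros;
 *	ill_t *src_ill;
 *
 *	src_ill = nce_resolve_src(ncec, &src);
 *	if (src_ill == NULL) {
 *		... no usable source yet (e.g. DAD still in progress);
 *		... postpone the transmission and retry later
 *	} else {
 *		... transmit using `src' and src_ill, then drop the hold:
 *		ill_refrele(src_ill);
 *	}
 */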
4282 
4283 void
4284 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4285     uchar_t *hwaddr, int hwaddr_len, int flags)
4286 {
4287 	ill_t	*ill;
4288 	ncec_t	*ncec;
4289 	nce_t	*nce;
4290 	uint16_t new_state;
4291 
4292 	ill = (ipif ? ipif->ipif_ill : NULL);
4293 	if (ill != NULL) {
4294 		/*
4295 		 * only one ncec is possible
4296 		 */
4297 		nce = nce_lookup_v4(ill, addr);
4298 		if (nce != NULL) {
4299 			ncec = nce->nce_common;
4300 			mutex_enter(&ncec->ncec_lock);
4301 			if (NCE_ISREACHABLE(ncec))
4302 				new_state = ND_UNCHANGED;
4303 			else
4304 				new_state = ND_STALE;
4305 			ncec->ncec_flags = flags;
4306 			nce_update(ncec, new_state, hwaddr);
4307 			mutex_exit(&ncec->ncec_lock);
4308 			nce_refrele(nce);
4309 			return;
4310 		}
4311 	} else {
4312 		/*
4313 		 * ill is wildcard; clean up all ncec's and ire's
4314 		 * that match on addr.
4315 		 */
4316 		nce_hw_map_t hwm;
4317 
4318 		hwm.hwm_addr = *addr;
4319 		hwm.hwm_hwlen = hwaddr_len;
4320 		hwm.hwm_hwaddr = hwaddr;
4321 		hwm.hwm_flags = flags;
4322 
4323 		ncec_walk_common(ipst->ips_ndp4, NULL,
4324 		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4325 	}
4326 }
4327 
4328 /*
4329  * Common function to add ncec entries.
4330  * we always add the ncec with ncec_ill == ill, and always create
4331  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4332  * ncec is !reachable.
4333  *
4334  * When the caller passes in an nce_state of ND_UNCHANGED,
4335  * nce_add_common() will determine the state of the created nce based
4336  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4337  * be created with state set to the passed in nce_state.
4338  */
4339 static int
4340 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4341     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4342 {
4343 	static	ncec_t		nce_nil;
4344 	uchar_t			*template = NULL;
4345 	int			err;
4346 	ncec_t			*ncec;
4347 	ncec_t			**ncep;
4348 	ip_stack_t		*ipst = ill->ill_ipst;
4349 	uint16_t		state;
4350 	boolean_t		fastprobe = B_FALSE;
4351 	struct ndp_g_s		*ndp;
4352 	nce_t			*nce = NULL;
4353 	mblk_t			*dlur_mp = NULL;
4354 
4355 	if (ill->ill_isv6)
4356 		ndp = ill->ill_ipst->ips_ndp6;
4357 	else
4358 		ndp = ill->ill_ipst->ips_ndp4;
4359 
4360 	*retnce = NULL;
4361 
4362 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4363 
4364 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4365 		ip0dbg(("nce_add_common: no addr\n"));
4366 		return (EINVAL);
4367 	}
4368 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4369 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4370 		return (EINVAL);
4371 	}
4372 
4373 	if (ill->ill_isv6) {
4374 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4375 	} else {
4376 		ipaddr_t v4addr;
4377 
4378 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4379 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4380 	}
4381 
4382 	/*
4383 	 * The caller has ensured that there is no nce on ill, but there could
4384 	 * still be an nce_common_t for the address, so that we find existing
4385 	 * ncec_t structures first, and atomically add a new nce_t if
4386 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4387 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4388 	 * compare for matches across the illgrp because this function is
4389 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4390 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4391 	 * appropriate.
4392 	 */
4393 	ncec = *ncep;
4394 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4395 		if (ncec->ncec_ill == ill) {
4396 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4397 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4398 				if (*retnce != NULL)
4399 					break;
4400 			}
4401 		}
4402 	}
4403 	if (*retnce != NULL) {
4404 		/*
4405 		 * We should never find *retnce to be MYADDR, since the caller
4406 		 * may then incorrectly restart a DAD timer that's already
4407 		 * running.
4408 		 */
4409 		ASSERT(!NCE_MYADDR(ncec));
4410 		/* caller must trigger fastpath on nce */
4411 		return (0);
4412 	}
4413 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4414 	if (ncec == NULL)
4415 		return (ENOMEM);
4416 	*ncec = nce_nil;
4417 	ncec->ncec_ill = ill;
4418 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4419 	ncec->ncec_flags = flags;
4420 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4421 
4422 	if (!ill->ill_isv6) {
4423 		ipaddr_t addr4;
4424 
4425 		/*
4426 		 * DAD probe interval and probe count are set based on
4427 		 * fast/slow probe settings. If the underlying link doesn't
4428 		 * have reliably up/down notifications or if we're working
4429 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
4430 		 * don't use the fast timers.  Otherwise, use them.
4431 		 */
4432 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4433 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4434 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4))
4435 			fastprobe = B_TRUE;
4436 		if (fastprobe) {
4437 			ncec->ncec_xmit_interval =
4438 			    ipst->ips_arp_fastprobe_interval;
4439 			ncec->ncec_pcnt =
4440 			    ipst->ips_arp_fastprobe_count;
4441 			ncec->ncec_flags |= NCE_F_FAST;
4442 		} else {
4443 			ncec->ncec_xmit_interval =
4444 			    ipst->ips_arp_probe_interval;
4445 			ncec->ncec_pcnt =
4446 			    ipst->ips_arp_probe_count;
4447 		}
4448 		if (NCE_PUBLISH(ncec)) {
4449 			ncec->ncec_unsolicit_count =
4450 			    ipst->ips_ip_arp_publish_count;
4451 		}
4452 	} else {
4453 		/*
4454 		 * probe interval is constant: ILL_PROBE_INTERVAL
4455 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4456 		 */
4457 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4458 		if (NCE_PUBLISH(ncec)) {
4459 			ncec->ncec_unsolicit_count =
4460 			    ipst->ips_ip_ndp_unsolicit_count;
4461 		}
4462 	}
4463 	ncec->ncec_rcnt = ill->ill_xmit_count;
4464 	ncec->ncec_addr = *addr;
4465 	ncec->ncec_qd_mp = NULL;
4466 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4467 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4468 	ncec->ncec_trace_disable = B_FALSE;
4469 
4470 	/*
4471 	 * ncec_lladdr holds link layer address
4472 	 */
4473 	if (hw_addr_len > 0) {
4474 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4475 		if (template == NULL) {
4476 			err = ENOMEM;
4477 			goto err_ret;
4478 		}
4479 		ncec->ncec_lladdr = template;
4480 		ncec->ncec_lladdr_length = hw_addr_len;
4481 		bzero(ncec->ncec_lladdr, hw_addr_len);
4482 	}
4483 	if ((flags & NCE_F_BCAST) != 0) {
4484 		state = ND_REACHABLE;
4485 		ASSERT(hw_addr_len > 0);
4486 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4487 		state = ND_INITIAL;
4488 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4489 		/*
4490 		 * NORESOLVER entries are always created in the REACHABLE
4491 		 * state.
4492 		 */
4493 		state = ND_REACHABLE;
4494 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4495 		    ill->ill_mactype != DL_IPV4 &&
4496 		    ill->ill_mactype != DL_6TO4) {
4497 			/*
4498 			 * We create a nce_res_mp with the IP nexthop address
4499 			 * as the destination address if the physical length
4500 			 * is exactly 4 bytes for point-to-multipoint links
4501 			 * that do their own resolution from IP to link-layer
4502 			 * address (e.g. IP over X.25).
4503 			 */
4504 			bcopy((uchar_t *)addr,
4505 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4506 		}
4507 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4508 		    ill->ill_mactype != DL_IPV6) {
4509 			/*
4510 			 * We create a nce_res_mp with the IP nexthop address
4511 			 * as the destination address if the physical length
4512 			 * is exactly 16 bytes for point-to-multipoint links
4513 			 * that do their own resolution from IP to link-layer
4514 			 * address.
4515 			 */
4516 			bcopy((uchar_t *)addr,
4517 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4518 		}
4519 		/*
4520 		 * Since NUD is not part of the base IPv4 protocol definition,
4521 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4522 		 * age, and are marked NCE_F_NONUD.
4523 		 */
4524 		if (!ill->ill_isv6)
4525 			ncec->ncec_flags |= NCE_F_NONUD;
4526 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4527 		state = ND_REACHABLE;
4528 	}
4529 
4530 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4531 		/*
4532 		 * We are adding an ncec with a deterministic hw_addr,
4533 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4534 		 *
4535 		 * if we are adding a unicast ncec for the local address
4536 		 * it would be REACHABLE; we would be adding an ND_STALE entry
4537 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4538 		 * addresses are added in PROBE to trigger DAD.
4539 		 */
4540 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4541 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4542 			state = ND_REACHABLE;
4543 		else if (!NCE_PUBLISH(ncec))
4544 			state = ND_STALE;
4545 		else
4546 			state = ND_PROBE;
4547 		if (hw_addr != NULL)
4548 			nce_set_ll(ncec, hw_addr);
4549 	}
4550 	/* caller overrides internally computed state */
4551 	if (nce_state != ND_UNCHANGED)
4552 		state = nce_state;
4553 
4554 	if (state == ND_PROBE)
4555 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4556 
4557 	ncec->ncec_state = state;
4558 
4559 	if (state == ND_REACHABLE) {
4560 		ncec->ncec_last = TICK_TO_MSEC(lbolt64);
4561 		ncec->ncec_init_time = TICK_TO_MSEC(lbolt64);
4562 	} else {
4563 		ncec->ncec_last = 0;
4564 		if (state == ND_INITIAL)
4565 			ncec->ncec_init_time = TICK_TO_MSEC(lbolt64);
4566 	}
4567 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4568 	    offsetof(ncec_cb_t, ncec_cb_node));
4569 	/*
4570 	 * have all the memory allocations out of the way before taking locks
4571 	 * and adding the nce.
4572 	 */
4573 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4574 	if (nce == NULL) {
4575 		err = ENOMEM;
4576 		goto err_ret;
4577 	}
4578 	if (ncec->ncec_lladdr != NULL ||
4579 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4580 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4581 		    ill->ill_phys_addr_length, ill->ill_sap,
4582 		    ill->ill_sap_length);
4583 		if (dlur_mp == NULL) {
4584 			err = ENOMEM;
4585 			goto err_ret;
4586 		}
4587 	}
4588 
4589 	/*
4590 	 * Atomically ensure that the ill is not CONDEMNED, before
4591 	 * adding the NCE.
4592 	 */
4593 	mutex_enter(&ill->ill_lock);
4594 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4595 		mutex_exit(&ill->ill_lock);
4596 		err = EINVAL;
4597 		goto err_ret;
4598 	}
4599 	if (!NCE_MYADDR(ncec) &&
4600 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4601 		mutex_exit(&ill->ill_lock);
4602 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4603 		err = EINVAL;
4604 		goto err_ret;
4605 	}
4606 	/*
4607 	 * Acquire the ncec_lock even before adding the ncec to the list
4608 	 * so that it cannot get deleted after the ncec is added, but
4609 	 * before we add the nce.
4610 	 */
4611 	mutex_enter(&ncec->ncec_lock);
4612 	if ((ncec->ncec_next = *ncep) != NULL)
4613 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4614 	*ncep = ncec;
4615 	ncec->ncec_ptpn = ncep;
4616 
4617 	/* Bump up the number of ncec's referencing this ill */
4618 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4619 	    (char *), "ncec", (void *), ncec);
4620 	ill->ill_ncec_cnt++;
4621 	/*
4622 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4623 	 * condemned, and we can safely add the nce.
4624 	 */
4625 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4626 	mutex_exit(&ncec->ncec_lock);
4627 	mutex_exit(&ill->ill_lock);
4628 
4629 	/* caller must trigger fastpath on *retnce */
4630 	return (0);
4631 
4632 err_ret:
4633 	if (ncec != NULL)
4634 		kmem_cache_free(ncec_cache, ncec);
4635 	if (nce != NULL)
4636 		kmem_cache_free(nce_cache, nce);
4637 	freemsg(dlur_mp);
4638 	if (template != NULL)
4639 		kmem_free(template, ill->ill_phys_addr_length);
4640 	return (err);
4641 }
4642 
4643 /*
4644  * take a ref on the nce
4645  */
4646 void
4647 nce_refhold(nce_t *nce)
4648 {
4649 	mutex_enter(&nce->nce_lock);
4650 	nce->nce_refcnt++;
4651 	ASSERT((nce)->nce_refcnt != 0);
4652 	mutex_exit(&nce->nce_lock);
4653 }
4654 
4655 /*
4656  * release a ref on the nce; In general, this
4657  * cannot be called with locks held because dropping the last ref
4658  * may result in nce_inactive, which will take the ill_lock,
4659  * do ipif_ill_refrele_tail etc. Thus the one exception
4660  * where this can be called with locks held is when the caller
4661  * is certain that the nce_refcnt is sufficient to prevent
4662  * the invocation of nce_inactive.
4663  */
4664 void
4665 nce_refrele(nce_t *nce)
4666 {
4667 	ASSERT((nce)->nce_refcnt != 0);
4668 	mutex_enter(&nce->nce_lock);
4669 	if (--nce->nce_refcnt == 0)
4670 		nce_inactive(nce); /* destroys the mutex */
4671 	else
4672 		mutex_exit(&nce->nce_lock);
4673 }
4674 
4675 /*
4676  * free the nce after all refs have gone away.
4677  */
4678 static void
4679 nce_inactive(nce_t *nce)
4680 {
4681 	ill_t *ill = nce->nce_ill;
4682 
4683 	ASSERT(nce->nce_refcnt == 0);
4684 
4685 	ncec_refrele_notr(nce->nce_common);
4686 	nce->nce_common = NULL;
4687 	freemsg(nce->nce_fp_mp);
4688 	freemsg(nce->nce_dlur_mp);
4689 
4690 	mutex_enter(&ill->ill_lock);
4691 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4692 	    (char *), "nce", (void *), nce);
4693 	ill->ill_nce_cnt--;
4694 	nce->nce_ill = NULL;
4695 	/*
4696 	 * If the number of ncec's associated with this ill has dropped
4697 	 * to zero, check whether we need to restart any operation that
4698 	 * is waiting for this to happen.
4699 	 */
4700 	if (ILL_DOWN_OK(ill)) {
4701 		/* ipif_ill_refrele_tail drops the ill_lock */
4702 		ipif_ill_refrele_tail(ill);
4703 	} else {
4704 		mutex_exit(&ill->ill_lock);
4705 	}
4706 
4707 	mutex_destroy(&nce->nce_lock);
4708 	kmem_cache_free(nce_cache, nce);
4709 }
4710 
4711 /*
4712  * Add an nce to the ill_nce list.
4713  */
4714 static nce_t *
4715 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4716 {
4717 	bzero(nce, sizeof (*nce));
4718 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4719 	nce->nce_common = ncec;
4720 	nce->nce_addr = ncec->ncec_addr;
4721 	nce->nce_ill = ill;
4722 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4723 	    (char *), "nce", (void *), nce);
4724 	ill->ill_nce_cnt++;
4725 
4726 	nce->nce_refcnt = 1; /* for the thread */
4727 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4728 	nce->nce_dlur_mp = dlur_mp;
4729 
4730 	/* add nce to the ill's fastpath list.  */
4731 	nce->nce_refcnt++; /* for the list */
4732 	list_insert_head(&ill->ill_nce, nce);
4733 	return (nce);
4734 }
4735 
4736 static nce_t *
4737 nce_add(ill_t *ill, ncec_t *ncec)
4738 {
4739 	nce_t	*nce;
4740 	mblk_t	*dlur_mp = NULL;
4741 
4742 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4743 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4744 
4745 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4746 	if (nce == NULL)
4747 		return (NULL);
4748 	if (ncec->ncec_lladdr != NULL ||
4749 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4750 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4751 		    ill->ill_phys_addr_length, ill->ill_sap,
4752 		    ill->ill_sap_length);
4753 		if (dlur_mp == NULL) {
4754 			kmem_cache_free(nce_cache, nce);
4755 			return (NULL);
4756 		}
4757 	}
4758 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
4759 }
4760 
4761 /*
4762  * remove the nce from the ill_faspath list
4763  * remove the nce from the ill's fastpath (ill_nce) list
4764 void
4765 nce_delete(nce_t *nce)
4766 {
4767 	ill_t	*ill = nce->nce_ill;
4768 
4769 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4770 
4771 	mutex_enter(&nce->nce_lock);
4772 	if (nce->nce_is_condemned) {
4773 		/*
4774 		 * some other thread has removed this nce from the ill_nce list
4775 		 */
4776 		mutex_exit(&nce->nce_lock);
4777 		return;
4778 	}
4779 	nce->nce_is_condemned = B_TRUE;
4780 	mutex_exit(&nce->nce_lock);
4781 
4782 	list_remove(&ill->ill_nce, nce);
4783 	/*
4784 	 * even though we are holding the ill_lock, it is ok to
4785 	 * call nce_refrele here because we know that we should have
4786 	 * at least 2 refs on the nce: one for the thread, and one
4787 	 * for the list. The refrele below will release the one for
4788 	 * the list.
4789 	 */
4790 	nce_refrele(nce);
4791 }
4792 
4793 nce_t *
4794 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4795 {
4796 	nce_t *nce = NULL;
4797 
4798 	ASSERT(ill != NULL);
4799 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4800 
4801 	for (nce = list_head(&ill->ill_nce); nce != NULL;
4802 	    nce = list_next(&ill->ill_nce, nce)) {
4803 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4804 			break;
4805 	}
4806 
4807 	/*
4808 	 * if we found the nce on the ill_nce list while holding
4809 	 * the ill_lock, then it cannot be condemned yet.
4810 	 */
4811 	if (nce != NULL) {
4812 		ASSERT(!nce->nce_is_condemned);
4813 		nce_refhold(nce);
4814 	}
4815 	return (nce);
4816 }
4817 
4818 /*
4819  * Walk the ill_nce list on ill. The callback function func() cannot perform
4820  * any destructive actions.
4821  */
4822 static void
4823 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4824 {
4825 	nce_t *nce = NULL, *nce_next;
4826 
4827 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4828 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4829 		nce_next = list_next(&ill->ill_nce, nce);
4830 		if (func(ill, nce, arg) != 0)
4831 			break;
4832 		nce = nce_next;
4833 	}
4834 }
4835 
4836 void
4837 nce_walk(ill_t *ill, pfi_t func, void *arg)
4838 {
4839 	mutex_enter(&ill->ill_lock);
4840 	nce_walk_common(ill, func, arg);
4841 	mutex_exit(&ill->ill_lock);
4842 }
4843 
4844 void
4845 nce_flush(ill_t *ill, boolean_t flushall)
4846 {
4847 	nce_t *nce, *nce_next;
4848 	list_t dead;
4849 
4850 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4851 	mutex_enter(&ill->ill_lock);
4852 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4853 		nce_next = list_next(&ill->ill_nce, nce);
4854 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4855 			nce = nce_next;
4856 			continue;
4857 		}
4858 		/*
4859 		 * nce_delete requires that the caller should either not
4860 		 * be holding locks, or should hold a ref to ensure that
4861 		 * we won't hit nce_inactive. So take a ref and clean up
4862 		 * after the list is flushed.
4863 		 */
4864 		nce_refhold(nce);
4865 		nce_delete(nce);
4866 		list_insert_tail(&dead, nce);
4867 		nce = nce_next;
4868 	}
4869 	mutex_exit(&ill->ill_lock);
4870 	while ((nce = list_head(&dead)) != NULL) {
4871 		list_remove(&dead, nce);
4872 		nce_refrele(nce);
4873 	}
4874 	ASSERT(list_is_empty(&dead));
4875 	list_destroy(&dead);
4876 }
4877 
4878 /* Return an interval that is anywhere in the [1 .. intv] range */
4879 static clock_t
4880 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4881 {
4882 	clock_t rnd, frac;
4883 
4884 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4885 	/* Note that clock_t is signed; must chop off bits */
4886 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4887 	if (initial_time) {
4888 		if (intv <= 0)
4889 			intv = 1;
4890 		else
4891 			intv = (rnd % intv) + 1;
4892 	} else {
4893 		/* Compute 'frac' as 20% of the configured interval */
4894 		if ((frac = intv / 5) <= 1)
4895 			frac = 2;
4896 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
4897 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4898 			intv = 1;
4899 	}
4900 	return (intv);
4901 }
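
/*
 * Worked example: with a configured interval of 1000 ticks,
 * nce_fuzz_interval(1000, B_TRUE) returns a value uniformly distributed in
 * [1 .. 1000], while nce_fuzz_interval(1000, B_FALSE) uses frac = 200 and
 * returns a value in [800 .. 1200], i.e. the configured interval +/- 20%.
 */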
4902 
4903 void
4904 nce_resolv_ipmp_ok(ncec_t *ncec)
4905 {
4906 	mblk_t *mp;
4907 	uint_t pkt_len;
4908 	iaflags_t ixaflags = IXAF_NO_TRACE;
4909 	nce_t *under_nce;
4910 	ill_t	*ill = ncec->ncec_ill;
4911 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4912 	ipif_t *src_ipif = NULL;
4913 	ip_stack_t *ipst = ill->ill_ipst;
4914 	ill_t *send_ill;
4915 	uint_t nprobes;
4916 
4917 	ASSERT(IS_IPMP(ill));
4918 
4919 	mutex_enter(&ncec->ncec_lock);
4920 	nprobes = ncec->ncec_nprobes;
4921 	mp = ncec->ncec_qd_mp;
4922 	ncec->ncec_qd_mp = NULL;
4923 	ncec->ncec_nprobes = 0;
4924 	mutex_exit(&ncec->ncec_lock);
4925 
4926 	while (mp != NULL) {
4927 		mblk_t *nxt_mp;
4928 
4929 		nxt_mp = mp->b_next;
4930 		mp->b_next = NULL;
4931 		if (isv6) {
4932 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4933 
4934 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4935 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4936 			    ill, ALL_ZONES, ipst);
4937 		} else {
4938 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
4939 
4940 			ixaflags |= IXAF_IS_IPV4;
4941 			pkt_len = ntohs(ipha->ipha_length);
4942 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
4943 			    ill, ALL_ZONES, ipst);
4944 		}
4945 
4946 		/*
4947 		 * find a new nce based on an under_ill. The first IPMP probe
4948 		 * packet gets queued, so we could still find a src_ipif that
4949 		 * matches an IPMP test address.
4950 		 */
4951 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
4952 			/*
4953 			 * if src_ipif is null, this could be either a
4954 			 * forwarded packet or a probe whose src got deleted.
4955 			 * We identify the former case by looking at
4956 			 * ncec_nprobes: the first ncec_nprobes packets are
4957 			 * probes.
4958 			 */
4959 			if (src_ipif == NULL && nprobes > 0)
4960 				goto drop_pkt;
4961 
4962 			/*
4963 			 * For forwarded packets, we use the ipmp rotor
4964 			 * to find send_ill.
4965 			 */
4966 			send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
4967 			    B_TRUE);
4968 		} else {
4969 			send_ill = src_ipif->ipif_ill;
4970 			ill_refhold(send_ill);
4971 		}
4972 
4973 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
4974 		    (ncec_t *), ncec, (ipif_t *),
4975 		    src_ipif, (ill_t *), send_ill);
4976 
4977 		if (send_ill == NULL) {
4978 			if (src_ipif != NULL)
4979 				ipif_refrele(src_ipif);
4980 			goto drop_pkt;
4981 		}
4982 		/* create an under_nce on send_ill */
4983 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4984 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
4985 			under_nce = nce_fastpath_create(send_ill, ncec);
4986 		else
4987 			under_nce = NULL;
4988 		rw_exit(&ipst->ips_ill_g_lock);
4989 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
4990 			nce_fastpath_trigger(under_nce);
4991 
4992 		ill_refrele(send_ill);
4993 		if (src_ipif != NULL)
4994 			ipif_refrele(src_ipif);
4995 
4996 		if (under_nce != NULL) {
4997 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
4998 			    ALL_ZONES, 0, NULL);
4999 			nce_refrele(under_nce);
5000 			if (nprobes > 0)
5001 				nprobes--;
5002 			mp = nxt_mp;
5003 			continue;
5004 		}
5005 drop_pkt:
5006 		if (isv6) {
5007 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5008 		} else {
5009 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5010 		}
5011 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5012 		freemsg(mp);
5013 		if (nprobes > 0)
5014 			nprobes--;
5015 		mp = nxt_mp;
5016 	}
5017 	ncec_cb_dispatch(ncec); /* complete callbacks */
5018 }
5019