xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ndp.c (revision e9af4bc0b1cc30cea75d6ad4aa2fde97d985e9be)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/stropts.h>
29 #include <sys/strsun.h>
30 #include <sys/sysmacros.h>
31 #include <sys/errno.h>
32 #include <sys/dlpi.h>
33 #include <sys/socket.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/cmn_err.h>
37 #include <sys/debug.h>
38 #include <sys/vtrace.h>
39 #include <sys/kmem.h>
40 #include <sys/zone.h>
41 #include <sys/ethernet.h>
42 #include <sys/sdt.h>
43 #include <sys/mac.h>
44 
45 #include <net/if.h>
46 #include <net/if_types.h>
47 #include <net/if_dl.h>
48 #include <net/route.h>
49 #include <netinet/in.h>
50 #include <netinet/ip6.h>
51 #include <netinet/icmp6.h>
52 
53 #include <inet/common.h>
54 #include <inet/mi.h>
55 #include <inet/mib2.h>
56 #include <inet/nd.h>
57 #include <inet/ip.h>
58 #include <inet/ip_impl.h>
59 #include <inet/ipclassifier.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_rts.h>
63 #include <inet/ip6.h>
64 #include <inet/ip_ndp.h>
65 #include <inet/sctp_ip.h>
66 #include <inet/ip_arp.h>
67 #include <inet/ip2mac_impl.h>
68 
/*
 * Select the announcement-interval tunable for an address family:
 * ips_ip_ndp_unsolicit_interval when isv6 is true, otherwise
 * ips_ip_arp_publish_interval.  The expansion reads a variable named
 * `ipst', which must be in scope wherever the macro is used.  The
 * argument is parenthesized so that an arbitrary expression can be
 * passed without changing how the conditional parses.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)
72 
/*
 * Select the defense-interval tunable for an address family:
 * ips_ndp_defend_interval when isv6 is true, otherwise
 * ips_arp_defend_interval.  The expansion reads a variable named `ipst',
 * which must be in scope wherever the macro is used.  The argument is
 * parenthesized so that an arbitrary expression can be passed without
 * changing how the conditional parses.
 */
#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)
76 
/*
 * The probe interval is not tunable; it is derived from the link's
 * capabilities: 150 when ill_note_link is set, 1500 otherwise.
 */
#define	ILL_PROBE_INTERVAL(ill)	\
	(!(ill)->ill_note_link ? 1500 : 150)
79 
/*
 * The IPv4 Link Local address space is special; we do extra duplicate
 * checking there, as the entire assignment mechanism rests on random
 * numbers.
 *
 * True when the first two bytes at ptr are 169 and 254.  The argument is
 * parenthesized before the cast so that pointer expressions such as
 * `p + 1' are evaluated in their own type first.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
				((uchar_t *)(ptr))[1] == 254)
86 
/*
 * NCE_EXTERNAL_FLAGS_MASK is the set of ncec_flags that callers may hand
 * to the ncec*add* functions.
 *
 * Of these, NCE_F_AUTHORITY means that incoming adverts for the mapping
 * are ignored (DAD is still performed for it), and NCE_F_PUBLISH means
 * that requests for the protocol address will be answered.
 */
#define	NCE_EXTERNAL_FLAGS_MASK		\
	(NCE_F_MYADDR |			\
	NCE_F_ISROUTER |		\
	NCE_F_NONUD |			\
	NCE_F_ANYCAST |			\
	NCE_F_UNSOL_ADV |		\
	NCE_F_BCAST |			\
	NCE_F_MCAST |			\
	NCE_F_AUTHORITY |		\
	NCE_F_PUBLISH |			\
	NCE_F_STATIC)
99 
100 /*
101  * Lock ordering:
102  *
103  *	ndp_g_lock -> ill_lock -> ncec_lock
104  *
105  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
106  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
107  * ncec_refcnt).
108  */
109 
110 static	void	nce_cleanup_list(ncec_t *ncec);
111 static	void 	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
112 static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
113     ncec_t *);
114 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
115 static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
116     uint16_t ncec_flags, nce_t **newnce);
117 static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
118     uint16_t ncec_flags, nce_t **newnce);
119 static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
120     uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
121     const in6_addr_t *target, int flag);
122 static void	ncec_refhold_locked(ncec_t *);
123 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
124 static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
125 static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
126     uint16_t, uint16_t, nce_t **);
127 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
128 static nce_t *nce_add(ill_t *, ncec_t *);
129 static void nce_inactive(nce_t *);
130 extern nce_t 	*nce_lookup(ill_t *, const in6_addr_t *);
131 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
132 static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
133     uint16_t, uint16_t, nce_t **);
134 static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
135     uint16_t, uint16_t, nce_t **);
136 static int  nce_add_v6_postprocess(nce_t *);
137 static int  nce_add_v4_postprocess(nce_t *);
138 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
139 static clock_t nce_fuzz_interval(clock_t, boolean_t);
140 static void nce_resolv_ipmp_ok(ncec_t *);
141 static void nce_walk_common(ill_t *, pfi_t, void *);
142 static void nce_start_timer(ncec_t *, uint_t);
143 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
144 static void nce_fastpath_trigger(nce_t *);
145 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
146 
147 #ifdef DEBUG
148 static void	ncec_trace_cleanup(const ncec_t *);
149 #endif
150 
/*
 * Address of the hash-bucket head that addr maps to, in the stack's IPv4
 * table (ips_ndp4) or IPv6 table (ips_ndp6) respectively.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[				\
	    IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

#define	NCE_HASH_PTR_V6(ipst, addr)					\
	(&((ipst)->ips_ndp6->nce_hash_tbl[				\
	    NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
157 
158 extern kmem_cache_t *ncec_cache;
159 extern kmem_cache_t *nce_cache;
160 
161 /*
162  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
163  * If src_ill is not null, the ncec_addr is bound to src_ill. The
164  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
165  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
166  * IPMP cast_ill (in the IPMP case).
167  *
168  * Note that the probe interval is based on ncec->ncec_ill which
169  * may be the ipmp_ill.
170  */
171 static void
172 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
173 {
174 	boolean_t dropped;
175 	uint32_t probe_interval;
176 
177 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
178 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
179 	if (ncec->ncec_ipversion == IPV6_VERSION) {
180 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
181 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
182 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
183 		probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill);
184 	} else {
185 		/* IPv4 DAD delay the initial probe. */
186 		if (send_probe)
187 			dropped = arp_probe(ncec);
188 		else
189 			dropped = B_TRUE;
190 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
191 		    !send_probe);
192 	}
193 	if (!dropped) {
194 		mutex_enter(&ncec->ncec_lock);
195 		ncec->ncec_pcnt--;
196 		mutex_exit(&ncec->ncec_lock);
197 	}
198 	nce_restart_timer(ncec, probe_interval);
199 }
200 
201 /*
202  * Compute default flags to use for an advertisement of this ncec's address.
203  */
204 static int
205 nce_advert_flags(const ncec_t *ncec)
206 {
207 	int flag = 0;
208 
209 	if (ncec->ncec_flags & NCE_F_ISROUTER)
210 		flag |= NDP_ISROUTER;
211 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
212 		flag |= NDP_ORIDE;
213 
214 	return (flag);
215 }
216 
217 /*
218  * NDP Cache Entry creation routine.
219  * This routine must always be called with ndp6->ndp_g_lock held.
220  */
221 int
222 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
223     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
224 {
225 	int		err;
226 	nce_t		*nce;
227 
228 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
229 	ASSERT(ill != NULL && ill->ill_isv6);
230 
231 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
232 	    &nce);
233 	if (err != 0)
234 		return (err);
235 	ASSERT(newnce != NULL);
236 	*newnce = nce;
237 	return (err);
238 }
239 
240 /*
241  * Post-processing routine to be executed after nce_add_v6(). This function
242  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
243  * and must be called without any locks held.
244  */
245 int
246 nce_add_v6_postprocess(nce_t *nce)
247 {
248 	ncec_t		*ncec = nce->nce_common;
249 	boolean_t	dropped = B_FALSE;
250 	uchar_t		*hw_addr = ncec->ncec_lladdr;
251 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
252 	ill_t		*ill = ncec->ncec_ill;
253 	int		err = 0;
254 	uint16_t	flags = ncec->ncec_flags;
255 	ip_stack_t	*ipst = ill->ill_ipst;
256 	boolean_t	trigger_fastpath = B_TRUE;
257 
258 	/*
259 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
260 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
261 	 * We call nce_fastpath from nce_update if the link layer address of
262 	 * the peer changes from nce_update
263 	 */
264 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
265 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
266 		trigger_fastpath = B_FALSE;
267 
268 	if (trigger_fastpath)
269 		nce_fastpath_trigger(nce);
270 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
271 		ill_t *hwaddr_ill;
272 		/*
273 		 * Unicast entry that needs DAD.
274 		 */
275 		if (IS_IPMP(ill)) {
276 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
277 			    hw_addr, hw_addr_len);
278 		} else {
279 			hwaddr_ill = ill;
280 		}
281 		nce_dad(ncec, hwaddr_ill, B_TRUE);
282 		err = EINPROGRESS;
283 	} else if (flags & NCE_F_UNSOL_ADV) {
284 		/*
285 		 * We account for the transmit below by assigning one
286 		 * less than the ndd variable. Subsequent decrements
287 		 * are done in nce_timer.
288 		 */
289 		mutex_enter(&ncec->ncec_lock);
290 		ncec->ncec_unsolicit_count =
291 		    ipst->ips_ip_ndp_unsolicit_count - 1;
292 		mutex_exit(&ncec->ncec_lock);
293 		dropped = ndp_xmit(ill,
294 		    ND_NEIGHBOR_ADVERT,
295 		    hw_addr,
296 		    hw_addr_len,
297 		    &ncec->ncec_addr,	/* Source and target of the adv */
298 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
299 		    nce_advert_flags(ncec));
300 		mutex_enter(&ncec->ncec_lock);
301 		if (dropped)
302 			ncec->ncec_unsolicit_count++;
303 		else
304 			ncec->ncec_last_time_defended = ddi_get_lbolt();
305 		if (ncec->ncec_unsolicit_count != 0) {
306 			nce_start_timer(ncec,
307 			    ipst->ips_ip_ndp_unsolicit_interval);
308 		}
309 		mutex_exit(&ncec->ncec_lock);
310 	}
311 	return (err);
312 }
313 
314 /*
315  * Atomically lookup and add (if needed) Neighbor Cache information for
316  * an address.
317  *
318  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
319  * are always added pointing at the ipmp_ill. Thus, when the ill passed
320  * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
321  * entries will be created, both pointing at the same ncec_t. The nce_t
322  * entries will have their nce_ill set to the ipmp_ill and the under_ill
323  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
324  * Local addresses are always created on the ill passed to nce_add_v6.
325  */
326 int
327 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
328     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
329 {
330 	int		err = 0;
331 	ip_stack_t	*ipst = ill->ill_ipst;
332 	nce_t		*nce, *upper_nce = NULL;
333 	ill_t		*in_ill = ill;
334 	boolean_t	need_ill_refrele = B_FALSE;
335 
336 	if (flags & NCE_F_MCAST) {
337 		/*
338 		 * hw_addr will be figured out in nce_set_multicast_v6;
339 		 * caller has to select the cast_ill
340 		 */
341 		ASSERT(hw_addr == NULL);
342 		ASSERT(!IS_IPMP(ill));
343 		err = nce_set_multicast_v6(ill, addr, flags, newnce);
344 		return (err);
345 	}
346 	ASSERT(ill->ill_isv6);
347 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
348 		ill = ipmp_ill_hold_ipmp_ill(ill);
349 		if (ill == NULL)
350 			return (ENXIO);
351 		need_ill_refrele = B_TRUE;
352 	}
353 
354 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
355 	nce = nce_lookup_addr(ill, addr);
356 	if (nce == NULL) {
357 		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
358 		    &nce);
359 	} else {
360 		err = EEXIST;
361 	}
362 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
363 	if (err == 0)
364 		err = nce_add_v6_postprocess(nce);
365 	if (in_ill != ill && nce != NULL) {
366 		nce_t *under_nce = NULL;
367 
368 		/*
369 		 * in_ill was the under_ill. Try to create the under_nce.
370 		 * Hold the ill_g_lock to prevent changes to group membership
371 		 * until we are done.
372 		 */
373 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
374 		if (IS_IN_SAME_ILLGRP(in_ill, ill)) {
375 			under_nce = nce_fastpath_create(in_ill,
376 			    nce->nce_common);
377 			upper_nce = nce;
378 			if ((nce = under_nce) == NULL)
379 				err = EINVAL;
380 		}
381 		rw_exit(&ipst->ips_ill_g_lock);
382 		if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common))
383 			nce_fastpath_trigger(under_nce);
384 	}
385 	if (nce != NULL) {
386 		if (newnce != NULL)
387 			*newnce = nce;
388 		else
389 			nce_refrele(nce);
390 	}
391 	/* nce_refrele is deferred until the lock is dropped  */
392 	if (upper_nce != NULL)
393 		nce_refrele(upper_nce);
394 	if (need_ill_refrele)
395 		ill_refrele(ill);
396 	return (err);
397 }
398 
399 /*
400  * Remove all the CONDEMNED nces from the appropriate hash table.
401  * We create a private list of NCEs, these may have ires pointing
402  * to them, so the list will be passed through to clean up dependent
403  * ires and only then we can do ncec_refrele() which can make NCE inactive.
404  */
405 static void
406 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
407 {
408 	ncec_t *ncec1;
409 	ncec_t **ptpn;
410 
411 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
412 	ASSERT(ndp->ndp_g_walker == 0);
413 	for (; ncec; ncec = ncec1) {
414 		ncec1 = ncec->ncec_next;
415 		mutex_enter(&ncec->ncec_lock);
416 		if (NCE_ISCONDEMNED(ncec)) {
417 			ptpn = ncec->ncec_ptpn;
418 			ncec1 = ncec->ncec_next;
419 			if (ncec1 != NULL)
420 				ncec1->ncec_ptpn = ptpn;
421 			*ptpn = ncec1;
422 			ncec->ncec_ptpn = NULL;
423 			ncec->ncec_next = NULL;
424 			ncec->ncec_next = *free_nce_list;
425 			*free_nce_list = ncec;
426 		}
427 		mutex_exit(&ncec->ncec_lock);
428 	}
429 }
430 
431 /*
432  * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
433  *    will return this NCE. Also no new timeouts will
434  *    be started (See nce_restart_timer).
435  * 2. Cancel any currently running timeouts.
436  * 3. If there is an ndp walker, return. The walker will do the cleanup.
437  *    This ensures that walkers see a consistent list of NCEs while walking.
438  * 4. Otherwise remove the NCE from the list of NCEs
439  */
440 void
441 ncec_delete(ncec_t *ncec)
442 {
443 	ncec_t	**ptpn;
444 	ncec_t	*ncec1;
445 	int	ipversion = ncec->ncec_ipversion;
446 	ndp_g_t *ndp;
447 	ip_stack_t	*ipst = ncec->ncec_ipst;
448 
449 	if (ipversion == IPV4_VERSION)
450 		ndp = ipst->ips_ndp4;
451 	else
452 		ndp = ipst->ips_ndp6;
453 
454 	/* Serialize deletes */
455 	mutex_enter(&ncec->ncec_lock);
456 	if (NCE_ISCONDEMNED(ncec)) {
457 		/* Some other thread is doing the delete */
458 		mutex_exit(&ncec->ncec_lock);
459 		return;
460 	}
461 	/*
462 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
463 	 * refcnt has to be >= 2
464 	 */
465 	ASSERT(ncec->ncec_refcnt >= 2);
466 	ncec->ncec_flags |= NCE_F_CONDEMNED;
467 	mutex_exit(&ncec->ncec_lock);
468 
469 	/* Count how many condemned ires for kmem_cache callback */
470 	atomic_add_32(&ipst->ips_num_nce_condemned, 1);
471 	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
472 
473 	/* Complete any waiting callbacks */
474 	ncec_cb_dispatch(ncec);
475 
476 	/*
477 	 * Cancel any running timer. Timeout can't be restarted
478 	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
479 	 * Passing invalid timeout id is fine.
480 	 */
481 	if (ncec->ncec_timeout_id != 0) {
482 		(void) untimeout(ncec->ncec_timeout_id);
483 		ncec->ncec_timeout_id = 0;
484 	}
485 
486 	mutex_enter(&ndp->ndp_g_lock);
487 	if (ncec->ncec_ptpn == NULL) {
488 		/*
489 		 * The last ndp walker has already removed this ncec from
490 		 * the list after we marked the ncec CONDEMNED and before
491 		 * we grabbed the global lock.
492 		 */
493 		mutex_exit(&ndp->ndp_g_lock);
494 		return;
495 	}
496 	if (ndp->ndp_g_walker > 0) {
497 		/*
498 		 * Can't unlink. The walker will clean up
499 		 */
500 		ndp->ndp_g_walker_cleanup = B_TRUE;
501 		mutex_exit(&ndp->ndp_g_lock);
502 		return;
503 	}
504 
505 	/*
506 	 * Now remove the ncec from the list. nce_restart_timer won't restart
507 	 * the timer since it is marked CONDEMNED.
508 	 */
509 	ptpn = ncec->ncec_ptpn;
510 	ncec1 = ncec->ncec_next;
511 	if (ncec1 != NULL)
512 		ncec1->ncec_ptpn = ptpn;
513 	*ptpn = ncec1;
514 	ncec->ncec_ptpn = NULL;
515 	ncec->ncec_next = NULL;
516 	mutex_exit(&ndp->ndp_g_lock);
517 
518 	/* Removed from ncec_ptpn/ncec_next list */
519 	ncec_refrele_notr(ncec);
520 }
521 
522 void
523 ncec_inactive(ncec_t *ncec)
524 {
525 	mblk_t		**mpp;
526 	ill_t		*ill = ncec->ncec_ill;
527 	ip_stack_t	*ipst = ncec->ncec_ipst;
528 
529 	ASSERT(ncec->ncec_refcnt == 0);
530 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
531 
532 	/* Count how many condemned nces for kmem_cache callback */
533 	if (NCE_ISCONDEMNED(ncec))
534 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
535 
536 	/* Free all allocated messages */
537 	mpp = &ncec->ncec_qd_mp;
538 	while (*mpp != NULL) {
539 		mblk_t  *mp;
540 
541 		mp = *mpp;
542 		*mpp = mp->b_next;
543 
544 		inet_freemsg(mp);
545 	}
546 	/*
547 	 * must have been cleaned up in ncec_delete
548 	 */
549 	ASSERT(list_is_empty(&ncec->ncec_cb));
550 	list_destroy(&ncec->ncec_cb);
551 	/*
552 	 * free the ncec_lladdr if one was allocated in nce_add_common()
553 	 */
554 	if (ncec->ncec_lladdr_length > 0)
555 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
556 
557 #ifdef DEBUG
558 	ncec_trace_cleanup(ncec);
559 #endif
560 
561 	mutex_enter(&ill->ill_lock);
562 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
563 	    (char *), "ncec", (void *), ncec);
564 	ill->ill_ncec_cnt--;
565 	ncec->ncec_ill = NULL;
566 	/*
567 	 * If the number of ncec's associated with this ill have dropped
568 	 * to zero, check whether we need to restart any operation that
569 	 * is waiting for this to happen.
570 	 */
571 	if (ILL_DOWN_OK(ill)) {
572 		/* ipif_ill_refrele_tail drops the ill_lock */
573 		ipif_ill_refrele_tail(ill);
574 	} else {
575 		mutex_exit(&ill->ill_lock);
576 	}
577 
578 	mutex_destroy(&ncec->ncec_lock);
579 	kmem_cache_free(ncec_cache, ncec);
580 }
581 
582 /*
583  * ncec_walk routine.  Delete the ncec if it is associated with the ill
584  * that is going away.  Always called as a writer.
585  */
586 void
587 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
588 {
589 	if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
590 		ncec_delete(ncec);
591 	}
592 }
593 
594 /*
595  * Neighbor Cache cleanup logic for a list of ncec_t entries.
596  */
597 static void
598 nce_cleanup_list(ncec_t *ncec)
599 {
600 	ncec_t *ncec_next;
601 
602 	ASSERT(ncec != NULL);
603 	while (ncec != NULL) {
604 		ncec_next = ncec->ncec_next;
605 		ncec->ncec_next = NULL;
606 
607 		/*
608 		 * It is possible for the last ndp walker (this thread)
609 		 * to come here after ncec_delete has marked the ncec CONDEMNED
610 		 * and before it has removed the ncec from the fastpath list
611 		 * or called untimeout. So we need to do it here. It is safe
612 		 * for both ncec_delete and this thread to do it twice or
613 		 * even simultaneously since each of the threads has a
614 		 * reference on the ncec.
615 		 */
616 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
617 		/*
618 		 * Cancel any running timer. Timeout can't be restarted
619 		 * since CONDEMNED is set. The ncec_lock can't be
620 		 * held across untimeout though passing invalid timeout
621 		 * id is fine.
622 		 */
623 		if (ncec->ncec_timeout_id != 0) {
624 			(void) untimeout(ncec->ncec_timeout_id);
625 			ncec->ncec_timeout_id = 0;
626 		}
627 		/* Removed from ncec_ptpn/ncec_next list */
628 		ncec_refrele_notr(ncec);
629 		ncec = ncec_next;
630 	}
631 }
632 
633 /*
634  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
635  */
636 boolean_t
637 nce_restart_dad(ncec_t *ncec)
638 {
639 	boolean_t started;
640 	ill_t *ill, *hwaddr_ill;
641 
642 	if (ncec == NULL)
643 		return (B_FALSE);
644 	ill = ncec->ncec_ill;
645 	mutex_enter(&ncec->ncec_lock);
646 	if (ncec->ncec_state == ND_PROBE) {
647 		mutex_exit(&ncec->ncec_lock);
648 		started = B_TRUE;
649 	} else if (ncec->ncec_state == ND_REACHABLE) {
650 		ASSERT(ncec->ncec_lladdr != NULL);
651 		ncec->ncec_state = ND_PROBE;
652 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
653 		/*
654 		 * Slight cheat here: we don't use the initial probe delay
655 		 * for IPv4 in this obscure case.
656 		 */
657 		mutex_exit(&ncec->ncec_lock);
658 		if (IS_IPMP(ill)) {
659 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
660 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
661 		} else {
662 			hwaddr_ill = ill;
663 		}
664 		nce_dad(ncec, hwaddr_ill, B_TRUE);
665 		started = B_TRUE;
666 	} else {
667 		mutex_exit(&ncec->ncec_lock);
668 		started = B_FALSE;
669 	}
670 	return (started);
671 }
672 
673 /*
674  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
675  * If one is found, the refcnt on the ncec will be incremented.
676  */
677 ncec_t *
678 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
679 {
680 	ncec_t		*ncec;
681 	ip_stack_t	*ipst = ill->ill_ipst;
682 
683 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
684 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
685 
686 	/* Get head of v6 hash table */
687 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
688 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
689 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
690 	rw_exit(&ipst->ips_ill_g_lock);
691 	return (ncec);
692 }
693 /*
694  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
695  * If one is found, the refcnt on the ncec will be incremented.
696  */
697 ncec_t *
698 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
699 {
700 	ncec_t	*ncec = NULL;
701 	in6_addr_t addr6;
702 	ip_stack_t *ipst = ill->ill_ipst;
703 
704 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
705 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
706 
707 	/* Get head of v4 hash table */
708 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
709 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
710 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
711 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
712 	rw_exit(&ipst->ips_ill_g_lock);
713 	return (ncec);
714 }
715 
716 /*
717  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
718  * If an ncec is found, increment the hold count on that ncec.
719  * The caller passes in the start of the appropriate hash table, and must
720  * be holding the appropriate global lock (ndp_g_lock). In addition, since
721  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
722  * must be held as reader.
723  *
724  * This function always matches across the ipmp group.
725  */
726 ncec_t *
727 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
728 {
729 	ndp_g_t		*ndp;
730 	ip_stack_t	*ipst = ill->ill_ipst;
731 
732 	if (ill->ill_isv6)
733 		ndp = ipst->ips_ndp6;
734 	else
735 		ndp = ipst->ips_ndp4;
736 
737 	ASSERT(ill != NULL);
738 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
739 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
740 		return (NULL);
741 	for (; ncec != NULL; ncec = ncec->ncec_next) {
742 		if (ncec->ncec_ill == ill ||
743 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
744 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
745 				mutex_enter(&ncec->ncec_lock);
746 				if (!NCE_ISCONDEMNED(ncec)) {
747 					ncec_refhold_locked(ncec);
748 					mutex_exit(&ncec->ncec_lock);
749 					break;
750 				}
751 				mutex_exit(&ncec->ncec_lock);
752 			}
753 		}
754 	}
755 	return (ncec);
756 }
757 
758 /*
759  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
760  * entries for ill only, i.e., when ill is part of an ipmp group,
761  * nce_lookup_v4 will never try to match across the group.
762  */
763 nce_t *
764 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
765 {
766 	nce_t *nce;
767 	in6_addr_t addr6;
768 	ip_stack_t *ipst = ill->ill_ipst;
769 
770 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
771 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
772 	nce = nce_lookup_addr(ill, &addr6);
773 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
774 	return (nce);
775 }
776 
777 /*
778  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
779  * entries for ill only, i.e., when ill is part of an ipmp group,
780  * nce_lookup_v6 will never try to match across the group.
781  */
782 nce_t *
783 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
784 {
785 	nce_t *nce;
786 	ip_stack_t *ipst = ill->ill_ipst;
787 
788 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
789 	nce = nce_lookup_addr(ill, addr6);
790 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
791 	return (nce);
792 }
793 
794 static nce_t *
795 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
796 {
797 	nce_t *nce;
798 
799 	ASSERT(ill != NULL);
800 #ifdef DEBUG
801 	if (ill->ill_isv6)
802 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
803 	else
804 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
805 #endif
806 	mutex_enter(&ill->ill_lock);
807 	nce = nce_lookup(ill, addr);
808 	mutex_exit(&ill->ill_lock);
809 	return (nce);
810 }
811 
812 
813 /*
814  * Router turned to host.  We need to make sure that cached copies of the ncec
815  * are not used for forwarding packets if they were derived from the default
816  * route, and that the default route itself is removed, as  required by
817  * section 7.2.5 of RFC 2461.
818  *
819  * Note that the ncec itself probably has valid link-layer information for the
820  * nexthop, so that there is no reason to delete the ncec, as long as the
821  * ISROUTER flag is turned off.
822  */
823 static void
824 ncec_router_to_host(ncec_t *ncec)
825 {
826 	ire_t		*ire;
827 	ip_stack_t	*ipst = ncec->ncec_ipst;
828 
829 	mutex_enter(&ncec->ncec_lock);
830 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
831 	mutex_exit(&ncec->ncec_lock);
832 
833 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
834 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
835 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
836 	if (ire != NULL) {
837 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
838 		ire_delete(ire);
839 		ire_refrele(ire);
840 	}
841 }
842 
843 /*
844  * Process passed in parameters either from an incoming packet or via
845  * user ioctl.
846  */
847 void
848 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
849 {
850 	ill_t	*ill = ncec->ncec_ill;
851 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
852 	boolean_t ll_updated = B_FALSE;
853 	boolean_t ll_changed;
854 	nce_t	*nce;
855 
856 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
857 	/*
858 	 * No updates of link layer address or the neighbor state is
859 	 * allowed, when the cache is in NONUD state.  This still
860 	 * allows for responding to reachability solicitation.
861 	 */
862 	mutex_enter(&ncec->ncec_lock);
863 	if (ncec->ncec_state == ND_INCOMPLETE) {
864 		if (hw_addr == NULL) {
865 			mutex_exit(&ncec->ncec_lock);
866 			return;
867 		}
868 		nce_set_ll(ncec, hw_addr);
869 		/*
870 		 * Update ncec state and send the queued packets
871 		 * back to ip this time ire will be added.
872 		 */
873 		if (flag & ND_NA_FLAG_SOLICITED) {
874 			nce_update(ncec, ND_REACHABLE, NULL);
875 		} else {
876 			nce_update(ncec, ND_STALE, NULL);
877 		}
878 		mutex_exit(&ncec->ncec_lock);
879 		nce = nce_fastpath(ncec, B_TRUE, NULL);
880 		nce_resolv_ok(ncec);
881 		if (nce != NULL)
882 			nce_refrele(nce);
883 		return;
884 	}
885 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
886 	if (!is_adv) {
887 		/* If this is a SOLICITATION request only */
888 		if (ll_changed)
889 			nce_update(ncec, ND_STALE, hw_addr);
890 		mutex_exit(&ncec->ncec_lock);
891 		ncec_cb_dispatch(ncec);
892 		return;
893 	}
894 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
895 		/* If in any other state than REACHABLE, ignore */
896 		if (ncec->ncec_state == ND_REACHABLE) {
897 			nce_update(ncec, ND_STALE, NULL);
898 		}
899 		mutex_exit(&ncec->ncec_lock);
900 		ncec_cb_dispatch(ncec);
901 		return;
902 	} else {
903 		if (ll_changed) {
904 			nce_update(ncec, ND_UNCHANGED, hw_addr);
905 			ll_updated = B_TRUE;
906 		}
907 		if (flag & ND_NA_FLAG_SOLICITED) {
908 			nce_update(ncec, ND_REACHABLE, NULL);
909 		} else {
910 			if (ll_updated) {
911 				nce_update(ncec, ND_STALE, NULL);
912 			}
913 		}
914 		mutex_exit(&ncec->ncec_lock);
915 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
916 		    NCE_F_ISROUTER)) {
917 			ncec_router_to_host(ncec);
918 		} else {
919 			ncec_cb_dispatch(ncec);
920 		}
921 	}
922 }
923 
924 /*
925  * Pass arg1 to the pfi supplied, along with each ncec in existence.
926  * ncec_walk() places a REFHOLD on the ncec and drops the lock when
927  * walking the hash list.
928  */
929 void
930 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
931     boolean_t trace)
932 {
933 	ncec_t	*ncec;
934 	ncec_t	*ncec1;
935 	ncec_t	**ncep;
936 	ncec_t	*free_nce_list = NULL;
937 
938 	mutex_enter(&ndp->ndp_g_lock);
939 	/* Prevent ncec_delete from unlink and free of NCE */
940 	ndp->ndp_g_walker++;
941 	mutex_exit(&ndp->ndp_g_lock);
942 	for (ncep = ndp->nce_hash_tbl;
943 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
944 		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
945 			ncec1 = ncec->ncec_next;
946 			if (ill == NULL || ncec->ncec_ill == ill) {
947 				if (trace) {
948 					ncec_refhold(ncec);
949 					(*pfi)(ncec, arg1);
950 					ncec_refrele(ncec);
951 				} else {
952 					ncec_refhold_notr(ncec);
953 					(*pfi)(ncec, arg1);
954 					ncec_refrele_notr(ncec);
955 				}
956 			}
957 		}
958 	}
959 	mutex_enter(&ndp->ndp_g_lock);
960 	ndp->ndp_g_walker--;
961 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
962 		/* Time to delete condemned entries */
963 		for (ncep = ndp->nce_hash_tbl;
964 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
965 			ncec = *ncep;
966 			if (ncec != NULL) {
967 				nce_remove(ndp, ncec, &free_nce_list);
968 			}
969 		}
970 		ndp->ndp_g_walker_cleanup = B_FALSE;
971 	}
972 
973 	mutex_exit(&ndp->ndp_g_lock);
974 
975 	if (free_nce_list != NULL) {
976 		nce_cleanup_list(free_nce_list);
977 	}
978 }
979 
980 /*
981  * Walk everything.
982  * Note that ill can be NULL hence can't derive the ipst from it.
983  */
984 void
985 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
986 {
987 	ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
988 	ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
989 }
990 
991 /*
992  * For each interface an entry is added for the unspecified multicast group.
993  * Here that mapping is used to form the multicast cache entry for a particular
994  * multicast destination.
995  */
996 static int
997 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
998     uint16_t flags, nce_t **newnce)
999 {
1000 	uchar_t		*hw_addr;
1001 	int		err = 0;
1002 	ip_stack_t	*ipst = ill->ill_ipst;
1003 	nce_t		*nce;
1004 
1005 	ASSERT(ill != NULL);
1006 	ASSERT(ill->ill_isv6);
1007 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1008 
1009 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1010 	nce = nce_lookup_addr(ill, dst);
1011 	if (nce != NULL) {
1012 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1013 		goto done;
1014 	}
1015 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1016 		/*
1017 		 * For IRE_IF_RESOLVER a hardware mapping can be
1018 		 * generated.
1019 		 */
1020 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1021 		if (hw_addr == NULL) {
1022 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1023 			return (ENOMEM);
1024 		}
1025 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1026 	} else {
1027 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
1028 		hw_addr = NULL;
1029 	}
1030 	ASSERT((flags & NCE_F_MCAST) != 0);
1031 	ASSERT((flags & NCE_F_NONUD) != 0);
1032 	/* nce_state will be computed by nce_add_common() */
1033 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1034 	    ND_UNCHANGED, &nce);
1035 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1036 	if (err == 0)
1037 		err = nce_add_v6_postprocess(nce);
1038 	if (hw_addr != NULL)
1039 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1040 	if (err != 0) {
1041 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1042 		return (err);
1043 	}
1044 done:
1045 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1046 	if (newnce != NULL)
1047 		*newnce = nce;
1048 	else
1049 		nce_refrele(nce);
1050 	return (0);
1051 }
1052 
1053 /*
1054  * Return the link layer address, and any flags of a ncec.
1055  */
1056 int
1057 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1058 {
1059 	ncec_t		*ncec;
1060 	in6_addr_t	*addr;
1061 	sin6_t		*sin6;
1062 
1063 	ASSERT(ill != NULL && ill->ill_isv6);
1064 	sin6 = (sin6_t *)&lnr->lnr_addr;
1065 	addr =  &sin6->sin6_addr;
1066 
1067 	/*
1068 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1069 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1070 	 * addresses for the data addresses on an IPMP interface even though
1071 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1072 	 */
1073 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1074 	if (ncec == NULL)
1075 		return (ESRCH);
1076 	/* If no link layer address is available yet, return ESRCH */
1077 	if (!NCE_ISREACHABLE(ncec)) {
1078 		ncec_refrele(ncec);
1079 		return (ESRCH);
1080 	}
1081 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1082 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1083 	    lnr->lnr_hdw_len);
1084 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1085 		lnr->lnr_flags = NDF_ISROUTER_ON;
1086 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1087 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1088 	ncec_refrele(ncec);
1089 	return (0);
1090 }
1091 
1092 /*
1093  * Finish setting up the Enable/Disable multicast for the driver.
1094  */
1095 mblk_t *
1096 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1097     uint32_t hw_addr_offset, mblk_t *mp)
1098 {
1099 	uchar_t		*hw_addr;
1100 	ipaddr_t	v4group;
1101 	uchar_t		*addr;
1102 
1103 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1104 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1105 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1106 
1107 		ASSERT(CLASSD(v4group));
1108 		ASSERT(!(ill->ill_isv6));
1109 
1110 		addr = (uchar_t *)&v4group;
1111 	} else {
1112 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1113 		ASSERT(ill->ill_isv6);
1114 
1115 		addr = (uchar_t *)v6group;
1116 	}
1117 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1118 	if (hw_addr == NULL) {
1119 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1120 		freemsg(mp);
1121 		return (NULL);
1122 	}
1123 
1124 	ip_mcast_mapping(ill, addr, hw_addr);
1125 	return (mp);
1126 }
1127 
1128 void
1129 ip_ndp_resolve(ncec_t *ncec)
1130 {
1131 	in_addr_t	sender4 = INADDR_ANY;
1132 	in6_addr_t	sender6 = ipv6_all_zeros;
1133 	ill_t		*src_ill;
1134 	uint32_t	ms;
1135 
1136 	src_ill = nce_resolve_src(ncec, &sender6);
1137 	if (src_ill == NULL) {
1138 		/* Make sure we try again later */
1139 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1140 		nce_restart_timer(ncec, (clock_t)ms);
1141 		return;
1142 	}
1143 	if (ncec->ncec_ipversion == IPV4_VERSION)
1144 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1145 	mutex_enter(&ncec->ncec_lock);
1146 	if (ncec->ncec_ipversion == IPV6_VERSION)
1147 		ms = ndp_solicit(ncec, sender6, src_ill);
1148 	else
1149 		ms = arp_request(ncec, sender4, src_ill);
1150 	mutex_exit(&ncec->ncec_lock);
1151 	if (ms == 0) {
1152 		if (ncec->ncec_state != ND_REACHABLE) {
1153 			if (ncec->ncec_ipversion == IPV6_VERSION)
1154 				ndp_resolv_failed(ncec);
1155 			else
1156 				arp_resolv_failed(ncec);
1157 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1158 			nce_make_unreachable(ncec);
1159 			ncec_delete(ncec);
1160 		}
1161 	} else {
1162 		nce_restart_timer(ncec, (clock_t)ms);
1163 	}
1164 done:
1165 	ill_refrele(src_ill);
1166 }
1167 
1168 /*
1169  * Send an IPv6 neighbor solicitation.
1170  * Returns number of milliseconds after which we should either rexmit or abort.
1171  * Return of zero means we should abort.
1172  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1173  * The optional source address is used as a hint to ndp_solicit for
1174  * which source to use in the packet.
1175  *
1176  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1177  * the packet.
1178  */
1179 uint32_t
1180 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1181 {
1182 	in6_addr_t	dst;
1183 	boolean_t	dropped = B_FALSE;
1184 
1185 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1186 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1187 
1188 	if (ncec->ncec_rcnt == 0)
1189 		return (0);
1190 
1191 	dst = ncec->ncec_addr;
1192 	ncec->ncec_rcnt--;
1193 	mutex_exit(&ncec->ncec_lock);
1194 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1195 	    ill->ill_phys_addr_length, &src, &dst, 0);
1196 	mutex_enter(&ncec->ncec_lock);
1197 	if (dropped)
1198 		ncec->ncec_rcnt++;
1199 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1200 }
1201 
1202 /*
1203  * Attempt to recover an address on an interface that's been marked as a
1204  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1205  * no easy way to just probe the address and have the right thing happen if
1206  * it's no longer in use.  Instead, we just bring it up normally and allow the
1207  * regular interface start-up logic to probe for a remaining duplicate and take
1208  * us back down if necessary.
1209  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1210  * ip_ndp_excl.
1211  */
1212 /* ARGSUSED */
1213 void
1214 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1215 {
1216 	ill_t	*ill = rq->q_ptr;
1217 	ipif_t	*ipif;
1218 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1219 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1220 	boolean_t addr_equal;
1221 
1222 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1223 		/*
1224 		 * We do not support recovery of proxy ARP'd interfaces,
1225 		 * because the system lacks a complete proxy ARP mechanism.
1226 		 */
1227 		if (ill->ill_isv6) {
1228 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1229 			    addr6);
1230 		} else {
1231 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1232 		}
1233 
1234 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1235 			continue;
1236 
1237 		/*
1238 		 * If we have already recovered or if the interface is going
1239 		 * away, then ignore.
1240 		 */
1241 		mutex_enter(&ill->ill_lock);
1242 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1243 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1244 			mutex_exit(&ill->ill_lock);
1245 			continue;
1246 		}
1247 
1248 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1249 		ill->ill_ipif_dup_count--;
1250 		mutex_exit(&ill->ill_lock);
1251 		ipif->ipif_was_dup = B_TRUE;
1252 
1253 		if (ill->ill_isv6) {
1254 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1255 			(void) ipif_up_done_v6(ipif);
1256 		} else {
1257 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1258 			    EINPROGRESS);
1259 			(void) ipif_up_done(ipif);
1260 		}
1261 	}
1262 	freeb(mp);
1263 }
1264 
1265 /*
1266  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1267  * As long as someone else holds the address, the interface will stay down.
1268  * When that conflict goes away, the interface is brought back up.  This is
1269  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1270  * server will recover from a failure.
1271  *
1272  * For DHCP and temporary addresses, recovery is not done in the kernel.
1273  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1274  *
1275  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1276  */
1277 void
1278 ipif_dup_recovery(void *arg)
1279 {
1280 	ipif_t *ipif = arg;
1281 
1282 	ipif->ipif_recovery_id = 0;
1283 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1284 		return;
1285 
1286 	/*
1287 	 * No lock, because this is just an optimization.
1288 	 */
1289 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1290 		return;
1291 
1292 	/* If the link is down, we'll retry this later */
1293 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1294 		return;
1295 
1296 	ipif_do_recovery(ipif);
1297 }
1298 
1299 /*
1300  * Perform interface recovery by forcing the duplicate interfaces up and
1301  * allowing the system to determine which ones should stay up.
1302  *
1303  * Called both by recovery timer expiry and link-up notification.
1304  */
1305 void
1306 ipif_do_recovery(ipif_t *ipif)
1307 {
1308 	ill_t *ill = ipif->ipif_ill;
1309 	mblk_t *mp;
1310 	ip_stack_t *ipst = ill->ill_ipst;
1311 	size_t mp_size;
1312 
1313 	if (ipif->ipif_isv6)
1314 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1315 	else
1316 		mp_size = sizeof (ipif->ipif_lcl_addr);
1317 	mp = allocb(mp_size, BPRI_MED);
1318 	if (mp == NULL) {
1319 		mutex_enter(&ill->ill_lock);
1320 		if (ipst->ips_ip_dup_recovery > 0 &&
1321 		    ipif->ipif_recovery_id == 0 &&
1322 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1323 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1324 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1325 		}
1326 		mutex_exit(&ill->ill_lock);
1327 	} else {
1328 		/*
1329 		 * A recovery timer may still be running if we got here from
1330 		 * ill_restart_dad(); cancel that timer.
1331 		 */
1332 		if (ipif->ipif_recovery_id != 0)
1333 			(void) untimeout(ipif->ipif_recovery_id);
1334 		ipif->ipif_recovery_id = 0;
1335 
1336 		if (ipif->ipif_isv6) {
1337 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1338 			    sizeof (ipif->ipif_v6lcl_addr));
1339 		} else  {
1340 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1341 			    sizeof (ipif->ipif_lcl_addr));
1342 		}
1343 		ill_refhold(ill);
1344 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1345 		    B_FALSE);
1346 	}
1347 }
1348 
1349 /*
1350  * Find the MAC and IP addresses in an NA/NS message.
1351  */
1352 static void
1353 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1354     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1355 {
1356 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1357 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1358 	uchar_t *addr;
1359 	int alen;
1360 
1361 	/* icmp_inbound_v6 ensures this */
1362 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1363 
1364 	addr = ira->ira_l2src;
1365 	alen = ill->ill_phys_addr_length;
1366 	if (alen > 0) {
1367 		*haddr = addr;
1368 		*haddrlenp = alen;
1369 	} else {
1370 		*haddr = NULL;
1371 		*haddrlenp = 0;
1372 	}
1373 
1374 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1375 	*targp = ns->nd_ns_target;
1376 }
1377 
1378 /*
1379  * This is for exclusive changes due to NDP duplicate address detection
1380  * failure.
1381  */
1382 /* ARGSUSED */
1383 static void
1384 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1385 {
1386 	ill_t	*ill = rq->q_ptr;
1387 	ipif_t	*ipif;
1388 	uchar_t	*haddr;
1389 	uint_t	haddrlen;
1390 	ip_stack_t *ipst = ill->ill_ipst;
1391 	in6_addr_t targ;
1392 	ip_recv_attr_t iras;
1393 	mblk_t	*attrmp;
1394 
1395 	attrmp = mp;
1396 	mp = mp->b_cont;
1397 	attrmp->b_cont = NULL;
1398 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1399 		/* The ill or ip_stack_t disappeared on us */
1400 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1401 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1402 		freemsg(mp);
1403 		ira_cleanup(&iras, B_TRUE);
1404 		return;
1405 	}
1406 
1407 	ASSERT(ill == iras.ira_rill);
1408 
1409 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1410 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1411 		/*
1412 		 * Ignore conflicts generated by misbehaving switches that
1413 		 * just reflect our own messages back to us.  For IPMP, we may
1414 		 * see reflections across any ill in the illgrp.
1415 		 *
1416 		 * RFC2462 and revisions tried to detect both the case
1417 		 * when a statically configured IPv6 address is a duplicate,
1418 		 * and the case when the L2 address itself is a duplicate. The
1419 		 * later is important because, with stateles address autoconf,
1420 		 * if the L2 address is a duplicate, the resulting IPv6
1421 		 * address(es) would also be duplicates. We rely on DAD of the
1422 		 * IPv6 address itself to detect the latter case.
1423 		 */
1424 		/* For an under ill_grp can change under lock */
1425 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1426 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1427 		    IS_UNDER_IPMP(ill) &&
1428 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1429 		    haddrlen) != NULL) {
1430 			rw_exit(&ipst->ips_ill_g_lock);
1431 			goto ignore_conflict;
1432 		}
1433 		rw_exit(&ipst->ips_ill_g_lock);
1434 	}
1435 
1436 	/*
1437 	 * Look up the appropriate ipif.
1438 	 */
1439 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1440 	if (ipif == NULL)
1441 		goto ignore_conflict;
1442 
1443 	/* Reload the ill to match the ipif */
1444 	ill = ipif->ipif_ill;
1445 
1446 	/* If it's already duplicate or ineligible, then don't do anything. */
1447 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1448 		ipif_refrele(ipif);
1449 		goto ignore_conflict;
1450 	}
1451 
1452 	/*
1453 	 * If this is a failure during duplicate recovery, then don't
1454 	 * complain.  It may take a long time to recover.
1455 	 */
1456 	if (!ipif->ipif_was_dup) {
1457 		char ibuf[LIFNAMSIZ];
1458 		char hbuf[MAC_STR_LEN];
1459 		char sbuf[INET6_ADDRSTRLEN];
1460 
1461 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1462 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1463 		    " disabled", ibuf,
1464 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1465 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1466 	}
1467 	mutex_enter(&ill->ill_lock);
1468 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1469 	ipif->ipif_flags |= IPIF_DUPLICATE;
1470 	ill->ill_ipif_dup_count++;
1471 	mutex_exit(&ill->ill_lock);
1472 	(void) ipif_down(ipif, NULL, NULL);
1473 	(void) ipif_down_tail(ipif);
1474 	mutex_enter(&ill->ill_lock);
1475 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1476 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1477 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1478 	    ipst->ips_ip_dup_recovery > 0) {
1479 		ASSERT(ipif->ipif_recovery_id == 0);
1480 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1481 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1482 	}
1483 	mutex_exit(&ill->ill_lock);
1484 	ipif_refrele(ipif);
1485 
1486 ignore_conflict:
1487 	freemsg(mp);
1488 	ira_cleanup(&iras, B_TRUE);
1489 }
1490 
1491 /*
1492  * Handle failure by tearing down the ipifs with the specified address.  Note
1493  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1494  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1495  * we start a timer on the ipif.
1496  * Caller has to free mp;
1497  */
1498 static void
1499 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1500 {
1501 	const uchar_t	*haddr;
1502 	ill_t		*ill = ira->ira_rill;
1503 
1504 	/*
1505 	 * Ignore conflicts generated by misbehaving switches that just
1506 	 * reflect our own messages back to us.
1507 	 */
1508 
1509 	/* icmp_inbound_v6 ensures this */
1510 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1511 	haddr = ira->ira_l2src;
1512 	if (haddr != NULL &&
1513 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1514 		return;
1515 	}
1516 
1517 	if ((mp = copymsg(mp)) != NULL) {
1518 		mblk_t	*attrmp;
1519 
1520 		attrmp = ip_recv_attr_to_mblk(ira);
1521 		if (attrmp == NULL) {
1522 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1523 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1524 			freemsg(mp);
1525 		} else {
1526 			ASSERT(attrmp->b_cont == NULL);
1527 			attrmp->b_cont = mp;
1528 			mp = attrmp;
1529 			ill_refhold(ill);
1530 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1531 			    B_FALSE);
1532 		}
1533 	}
1534 }
1535 
1536 /*
1537  * Handle a discovered conflict: some other system is advertising that it owns
1538  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1539  * interface.
1540  *
1541  * Handles both IPv4 and IPv6
1542  */
1543 boolean_t
1544 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1545 {
1546 	ipif_t		*ipif;
1547 	clock_t		now;
1548 	uint_t		maxdefense;
1549 	uint_t		defs;
1550 	ill_t		*ill = ira->ira_ill;
1551 	ip_stack_t	*ipst = ill->ill_ipst;
1552 	uint32_t	elapsed;
1553 	boolean_t	isv6 = ill->ill_isv6;
1554 	ipaddr_t	ncec_addr;
1555 
1556 	if (isv6) {
1557 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1558 		    ipst);
1559 	} else {
1560 		if (arp_no_defense) {
1561 			/*
1562 			 * Yes, there is a conflict, but no, we do not
1563 			 * defend ourself.
1564 			 */
1565 			return (B_TRUE);
1566 		}
1567 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1568 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1569 		    ipst);
1570 	}
1571 	if (ipif == NULL)
1572 		return (B_FALSE);
1573 
1574 	/*
1575 	 * First, figure out if this address is disposable.
1576 	 */
1577 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1578 		maxdefense = ipst->ips_ip_max_temp_defend;
1579 	else
1580 		maxdefense = ipst->ips_ip_max_defend;
1581 
1582 	/*
1583 	 * Now figure out how many times we've defended ourselves.  Ignore
1584 	 * defenses that happened long in the past.
1585 	 */
1586 	now = ddi_get_lbolt();
1587 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1588 	mutex_enter(&ncec->ncec_lock);
1589 	if ((defs = ncec->ncec_defense_count) > 0 &&
1590 	    elapsed > ipst->ips_ip_defend_interval) {
1591 		/*
1592 		 * ip_defend_interval has elapsed.
1593 		 * reset the defense count.
1594 		 */
1595 		ncec->ncec_defense_count = defs = 0;
1596 	}
1597 	ncec->ncec_defense_count++;
1598 	ncec->ncec_last_time_defended = now;
1599 	mutex_exit(&ncec->ncec_lock);
1600 	ipif_refrele(ipif);
1601 
1602 	/*
1603 	 * If we've defended ourselves too many times already, then give up and
1604 	 * tear down the interface(s) using this address.
1605 	 * Otherwise, caller has to defend by sending out an announce.
1606 	 */
1607 	if (defs >= maxdefense) {
1608 		if (isv6)
1609 			ndp_failure(mp, ira);
1610 		else
1611 			arp_failure(mp, ira);
1612 	} else {
1613 		return (B_TRUE); /* caller must defend this address */
1614 	}
1615 	return (B_FALSE);
1616 }
1617 
1618 /*
1619  * Handle reception of Neighbor Solicitation messages.
1620  */
1621 static void
1622 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1623 {
1624 	ill_t		*ill = ira->ira_ill, *under_ill;
1625 	nd_neighbor_solicit_t *ns;
1626 	uint32_t	hlen = ill->ill_phys_addr_length;
1627 	uchar_t		*haddr = NULL;
1628 	icmp6_t		*icmp_nd;
1629 	ip6_t		*ip6h;
1630 	ncec_t		*our_ncec = NULL;
1631 	in6_addr_t	target;
1632 	in6_addr_t	src;
1633 	int		len;
1634 	int		flag = 0;
1635 	nd_opt_hdr_t	*opt = NULL;
1636 	boolean_t	bad_solicit = B_FALSE;
1637 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1638 	boolean_t	need_ill_refrele = B_FALSE;
1639 
1640 	ip6h = (ip6_t *)mp->b_rptr;
1641 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1642 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1643 	src = ip6h->ip6_src;
1644 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1645 	target = ns->nd_ns_target;
1646 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1647 		if (ip_debug > 2) {
1648 			/* ip1dbg */
1649 			pr_addr_dbg("ndp_input_solicit: Target is"
1650 			    " multicast! %s\n", AF_INET6, &target);
1651 		}
1652 		bad_solicit = B_TRUE;
1653 		goto done;
1654 	}
1655 	if (len > sizeof (nd_neighbor_solicit_t)) {
1656 		/* Options present */
1657 		opt = (nd_opt_hdr_t *)&ns[1];
1658 		len -= sizeof (nd_neighbor_solicit_t);
1659 		if (!ndp_verify_optlen(opt, len)) {
1660 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1661 			bad_solicit = B_TRUE;
1662 			goto done;
1663 		}
1664 	}
1665 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1666 		/* Check to see if this is a valid DAD solicitation */
1667 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1668 			if (ip_debug > 2) {
1669 				/* ip1dbg */
1670 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1671 				    "Destination is not solicited node "
1672 				    "multicast %s\n", AF_INET6,
1673 				    &ip6h->ip6_dst);
1674 			}
1675 			bad_solicit = B_TRUE;
1676 			goto done;
1677 		}
1678 	}
1679 
1680 	/*
1681 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1682 	 * received this packet if it's multicast) is not the ill tied to
1683 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1684 	 * to ensure we find the associated NCE.
1685 	 */
1686 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1687 	/*
1688 	 * If this is a valid Solicitation for an address we are publishing,
1689 	 * then a PUBLISH entry should exist in the cache
1690 	 */
1691 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1692 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1693 		    "ifname=%s ", ill->ill_name));
1694 		if (ip_debug > 2) {
1695 			/* ip1dbg */
1696 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1697 		}
1698 		if (our_ncec == NULL)
1699 			bad_solicit = B_TRUE;
1700 		goto done;
1701 	}
1702 
1703 	/* At this point we should have a verified NS per spec */
1704 	if (opt != NULL) {
1705 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1706 		if (opt != NULL) {
1707 			haddr = (uchar_t *)&opt[1];
1708 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1709 			    hlen == 0) {
1710 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1711 				bad_solicit = B_TRUE;
1712 				goto done;
1713 			}
1714 		}
1715 	}
1716 
1717 	/* If sending directly to peer, set the unicast flag */
1718 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1719 		flag |= NDP_UNICAST;
1720 
1721 	/*
1722 	 * Create/update the entry for the soliciting node on the ipmp_ill.
1723 	 * or respond to outstanding queries, don't if
1724 	 * the source is unspecified address.
1725 	 */
1726 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1727 		int	err;
1728 		nce_t	*nnce;
1729 
1730 		ASSERT(ill->ill_isv6);
1731 		/*
1732 		 * Regular solicitations *must* include the Source Link-Layer
1733 		 * Address option.  Ignore messages that do not.
1734 		 */
1735 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1736 			ip1dbg(("ndp_input_solicit: source link-layer address "
1737 			    "option missing with a specified source.\n"));
1738 			bad_solicit = B_TRUE;
1739 			goto done;
1740 		}
1741 
1742 		/*
1743 		 * This is a regular solicitation.  If we're still in the
1744 		 * process of verifying the address, then don't respond at all
1745 		 * and don't keep track of the sender.
1746 		 */
1747 		if (our_ncec->ncec_state == ND_PROBE)
1748 			goto done;
1749 
1750 		/*
1751 		 * If the solicitation doesn't have sender hardware address
1752 		 * (legal for unicast solicitation), then process without
1753 		 * installing the return NCE.  Either we already know it, or
1754 		 * we'll be forced to look it up when (and if) we reply to the
1755 		 * packet.
1756 		 */
1757 		if (haddr == NULL)
1758 			goto no_source;
1759 
1760 		under_ill = ill;
1761 		if (IS_UNDER_IPMP(under_ill)) {
1762 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
1763 			if (ill == NULL)
1764 				ill = under_ill;
1765 			else
1766 				need_ill_refrele = B_TRUE;
1767 		}
1768 		err = nce_lookup_then_add_v6(ill,
1769 		    haddr, hlen,
1770 		    &src,	/* Soliciting nodes address */
1771 		    0,
1772 		    ND_STALE,
1773 		    &nnce);
1774 
1775 		if (need_ill_refrele) {
1776 			ill_refrele(ill);
1777 			ill = under_ill;
1778 			need_ill_refrele =  B_FALSE;
1779 		}
1780 		switch (err) {
1781 		case 0:
1782 			/* done with this entry */
1783 			nce_refrele(nnce);
1784 			break;
1785 		case EEXIST:
1786 			/*
1787 			 * B_FALSE indicates this is not an an advertisement.
1788 			 */
1789 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1790 			nce_refrele(nnce);
1791 			break;
1792 		default:
1793 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1794 			    err));
1795 			goto done;
1796 		}
1797 no_source:
1798 		flag |= NDP_SOLICITED;
1799 	} else {
1800 		/*
1801 		 * No source link layer address option should be present in a
1802 		 * valid DAD request.
1803 		 */
1804 		if (haddr != NULL) {
1805 			ip1dbg(("ndp_input_solicit: source link-layer address "
1806 			    "option present with an unspecified source.\n"));
1807 			bad_solicit = B_TRUE;
1808 			goto done;
1809 		}
1810 		if (our_ncec->ncec_state == ND_PROBE) {
1811 			/*
1812 			 * Internally looped-back probes will have
1813 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1814 			 * transmissions.
1815 			 */
1816 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1817 				/*
1818 				 * If someone else is probing our address, then
1819 				 * we've crossed wires.  Declare failure.
1820 				 */
1821 				ndp_failure(mp, ira);
1822 			}
1823 			goto done;
1824 		}
1825 		/*
1826 		 * This is a DAD probe.  Multicast the advertisement to the
1827 		 * all-nodes address.
1828 		 */
1829 		src = ipv6_all_hosts_mcast;
1830 	}
1831 	flag |= nce_advert_flags(our_ncec);
1832 	(void) ndp_xmit(ill,
1833 	    ND_NEIGHBOR_ADVERT,
1834 	    our_ncec->ncec_lladdr,
1835 	    our_ncec->ncec_lladdr_length,
1836 	    &target,	/* Source and target of the advertisement pkt */
1837 	    &src,	/* IP Destination (source of original pkt) */
1838 	    flag);
1839 done:
1840 	if (bad_solicit)
1841 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1842 	if (our_ncec != NULL)
1843 		ncec_refrele(our_ncec);
1844 }
1845 
1846 /*
1847  * Handle reception of Neighbor Solicitation messages
1848  */
1849 void
1850 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1851 {
1852 	ill_t		*ill = ira->ira_ill;
1853 	nd_neighbor_advert_t *na;
1854 	uint32_t	hlen = ill->ill_phys_addr_length;
1855 	uchar_t		*haddr = NULL;
1856 	icmp6_t		*icmp_nd;
1857 	ip6_t		*ip6h;
1858 	ncec_t		*dst_ncec = NULL;
1859 	in6_addr_t	target;
1860 	nd_opt_hdr_t	*opt = NULL;
1861 	int		len;
1862 	ip_stack_t	*ipst = ill->ill_ipst;
1863 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1864 
1865 	ip6h = (ip6_t *)mp->b_rptr;
1866 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1867 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1868 	na = (nd_neighbor_advert_t *)icmp_nd;
1869 
1870 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1871 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1872 		ip1dbg(("ndp_input_advert: Target is multicast but the "
1873 		    "solicited flag is not zero\n"));
1874 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1875 		return;
1876 	}
1877 	target = na->nd_na_target;
1878 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1879 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
1880 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1881 		return;
1882 	}
1883 	if (len > sizeof (nd_neighbor_advert_t)) {
1884 		opt = (nd_opt_hdr_t *)&na[1];
1885 		if (!ndp_verify_optlen(opt,
1886 		    len - sizeof (nd_neighbor_advert_t))) {
1887 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1888 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1889 			return;
1890 		}
1891 		/* At this point we have a verified NA per spec */
1892 		len -= sizeof (nd_neighbor_advert_t);
1893 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1894 		if (opt != NULL) {
1895 			haddr = (uchar_t *)&opt[1];
1896 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1897 			    hlen == 0) {
1898 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1899 				BUMP_MIB(mib,
1900 				    ipv6IfIcmpInBadNeighborAdvertisements);
1901 				return;
1902 			}
1903 		}
1904 	}
1905 
1906 	/*
1907 	 * NOTE: we match across the illgrp since we need to do DAD for all of
1908 	 * our local addresses, and those are spread across all the active
1909 	 * ills in the group.
1910 	 */
1911 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1912 		return;
1913 
1914 	if (NCE_PUBLISH(dst_ncec)) {
1915 		/*
1916 		 * Someone just advertised an addresses that we publish. First,
1917 		 * check it it was us -- if so, we can safely ignore it.
1918 		 * We don't get the haddr from the ira_l2src because, in the
1919 		 * case that the packet originated from us, on an IPMP group,
1920 		 * the ira_l2src may would be the link-layer address of the
1921 		 * cast_ill used to send the packet, which may not be the same
1922 		 * as the dst_ncec->ncec_lladdr of the address.
1923 		 */
1924 		if (haddr != NULL) {
1925 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1926 				goto out;
1927 
1928 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1929 				goto out;   /* from us -- no conflict */
1930 
1931 			/*
1932 			 * If we're in an IPMP group, check if this is an echo
1933 			 * from another ill in the group.  Use the double-
1934 			 * checked locking pattern to avoid grabbing
1935 			 * ill_g_lock in the non-IPMP case.
1936 			 */
1937 			if (IS_UNDER_IPMP(ill)) {
1938 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1939 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1940 				    ill->ill_grp, haddr, hlen) != NULL) {
1941 					rw_exit(&ipst->ips_ill_g_lock);
1942 					goto out;
1943 				}
1944 				rw_exit(&ipst->ips_ill_g_lock);
1945 			}
1946 		}
1947 
1948 		/*
1949 		 * This appears to be a real conflict.  If we're trying to
1950 		 * configure this NCE (ND_PROBE), then shut it down.
1951 		 * Otherwise, handle the discovered conflict.
1952 		 */
1953 		if (dst_ncec->ncec_state == ND_PROBE) {
1954 			ndp_failure(mp, ira);
1955 		} else {
1956 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
1957 				char hbuf[MAC_STR_LEN];
1958 				char sbuf[INET6_ADDRSTRLEN];
1959 
1960 				cmn_err(CE_WARN,
1961 				    "node '%s' is using %s on %s",
1962 				    inet_ntop(AF_INET6, &target, sbuf,
1963 				    sizeof (sbuf)),
1964 				    haddr == NULL ? "<none>" :
1965 				    mac_colon_addr(haddr, hlen, hbuf,
1966 				    sizeof (hbuf)), ill->ill_name);
1967 				/*
1968 				 * RFC 4862, Section 5.4.4 does not mandate
1969 				 * any specific behavior when an NA matches
1970 				 * a non-tentative address assigned to the
1971 				 * receiver. We make the choice of defending
1972 				 * our address, based on the assumption that
1973 				 * the sender has not detected the Duplicate.
1974 				 *
1975 				 * ncec_last_time_defended has been adjusted
1976 				 * in ip_nce_conflict()
1977 				 */
1978 				(void) ndp_announce(dst_ncec);
1979 			}
1980 		}
1981 	} else {
1982 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
1983 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
1984 
1985 		/* B_TRUE indicates this an advertisement */
1986 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
1987 	}
1988 out:
1989 	ncec_refrele(dst_ncec);
1990 }
1991 
1992 /*
1993  * Process NDP neighbor solicitation/advertisement messages.
1994  * The checksum has already checked o.k before reaching here.
1995  * Information about the datalink header is contained in ira_l2src, but
1996  * that should be ignored for loopback packets.
1997  */
1998 void
1999 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2000 {
2001 	ill_t		*ill = ira->ira_rill;
2002 	icmp6_t		*icmp_nd;
2003 	ip6_t		*ip6h;
2004 	int		len;
2005 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2006 	ill_t		*orig_ill = NULL;
2007 
2008 	/*
2009 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2010 	 * and make it be the IPMP upper so avoid being confused by a packet
2011 	 * addressed to a unicast address on a different ill.
2012 	 */
2013 	if (IS_UNDER_IPMP(ill)) {
2014 		orig_ill = ill;
2015 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2016 		if (ill == NULL) {
2017 			ill = orig_ill;
2018 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2019 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2020 			    mp, ill);
2021 			freemsg(mp);
2022 			return;
2023 		}
2024 		ASSERT(ill != orig_ill);
2025 		orig_ill = ira->ira_ill;
2026 		ira->ira_ill = ill;
2027 		mib = ill->ill_icmp6_mib;
2028 	}
2029 	if (!pullupmsg(mp, -1)) {
2030 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2031 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2032 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2033 		goto done;
2034 	}
2035 	ip6h = (ip6_t *)mp->b_rptr;
2036 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2037 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2038 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2039 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2040 		goto done;
2041 	}
2042 	/*
2043 	 * NDP does not accept any extension headers between the
2044 	 * IP header and the ICMP header since e.g. a routing
2045 	 * header could be dangerous.
2046 	 * This assumes that any AH or ESP headers are removed
2047 	 * by ip prior to passing the packet to ndp_input.
2048 	 */
2049 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2050 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2051 		    ip6h->ip6_nxt));
2052 		ip_drop_input("Wrong next header", mp, ill);
2053 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2054 		goto done;
2055 	}
2056 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2057 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2058 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2059 	if (icmp_nd->icmp6_code != 0) {
2060 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2061 		ip_drop_input("code non-zero", mp, ill);
2062 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2063 		goto done;
2064 	}
2065 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2066 	/*
2067 	 * Make sure packet length is large enough for either
2068 	 * a NS or a NA icmp packet.
2069 	 */
2070 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2071 		ip1dbg(("ndp_input: packet too short\n"));
2072 		ip_drop_input("packet too short", mp, ill);
2073 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2074 		goto done;
2075 	}
2076 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2077 		ndp_input_solicit(mp, ira);
2078 	} else {
2079 		ndp_input_advert(mp, ira);
2080 	}
2081 done:
2082 	freemsg(mp);
2083 	if (orig_ill != NULL) {
2084 		ill_refrele(ill);
2085 		ira->ira_ill = orig_ill;
2086 	}
2087 }
2088 
2089 /*
2090  * ndp_xmit is called to form and transmit a ND solicitation or
2091  * advertisement ICMP packet.
2092  *
2093  * If the source address is unspecified and this isn't a probe (used for
2094  * duplicate address detection), an appropriate source address and link layer
2095  * address will be chosen here.  The link layer address option is included if
2096  * the source is specified (i.e., all non-probe packets), and omitted (per the
2097  * specification) otherwise.
2098  *
2099  * It returns B_FALSE only if it does a successful put() to the
2100  * corresponding ill's ill_wq otherwise returns B_TRUE.
2101  */
2102 static boolean_t
2103 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2104     const in6_addr_t *sender, const in6_addr_t *target, int flag)
2105 {
2106 	uint32_t	len;
2107 	icmp6_t 	*icmp6;
2108 	mblk_t		*mp;
2109 	ip6_t		*ip6h;
2110 	nd_opt_hdr_t	*opt;
2111 	uint_t		plen;
2112 	zoneid_t	zoneid = GLOBAL_ZONEID;
2113 	ill_t		*hwaddr_ill = ill;
2114 	ip_xmit_attr_t	ixas;
2115 	ip_stack_t	*ipst = ill->ill_ipst;
2116 	boolean_t	need_refrele = B_FALSE;
2117 	boolean_t	probe = B_FALSE;
2118 
2119 	if (IS_UNDER_IPMP(ill)) {
2120 		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2121 		/*
2122 		 * We send non-probe packets on the upper IPMP interface.
2123 		 * ip_output_simple() will use cast_ill for sending any
2124 		 * multicast packets. Note that we can't follow the same
2125 		 * logic for probe packets because all interfaces in the ipmp
2126 		 * group may have failed, so that we really want to only try
2127 		 * to send the ND packet on the ill corresponding to the src
2128 		 * address.
2129 		 */
2130 		if (!probe) {
2131 			ill = ipmp_ill_hold_ipmp_ill(ill);
2132 			if (ill != NULL)
2133 				need_refrele = B_TRUE;
2134 			else
2135 				ill = hwaddr_ill;
2136 		}
2137 	}
2138 
2139 	/*
2140 	 * If we have a unspecified source(sender) address, select a
2141 	 * proper source address for the solicitation here itself so
2142 	 * that we can initialize the h/w address correctly.
2143 	 *
2144 	 * If the sender is specified then we use this address in order
2145 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2146 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2147 	 * by IP (we cannot guarantee that the global zone has an interface
2148 	 * route to the destination).
2149 	 *
2150 	 * Note that the NA never comes here with the unspecified source
2151 	 * address.
2152 	 */
2153 
2154 	/*
2155 	 * Probes will have unspec src at this point.
2156 	 */
2157 	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2158 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2159 		/*
2160 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2161 		 * ALL_ZONES if it cannot find a matching ipif for the address
2162 		 * we are trying to use. In this case we err on the side of
2163 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2164 		 */
2165 		if (zoneid == ALL_ZONES)
2166 			zoneid = GLOBAL_ZONEID;
2167 	}
2168 
2169 	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2170 	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2171 	mp = allocb(len,  BPRI_LO);
2172 	if (mp == NULL) {
2173 		if (need_refrele)
2174 			ill_refrele(ill);
2175 		return (B_TRUE);
2176 	}
2177 
2178 	bzero((char *)mp->b_rptr, len);
2179 	mp->b_wptr = mp->b_rptr + len;
2180 
2181 	bzero(&ixas, sizeof (ixas));
2182 	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM;
2183 
2184 	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2185 	ixas.ixa_ipst = ipst;
2186 	ixas.ixa_cred = kcred;
2187 	ixas.ixa_cpid = NOPID;
2188 	ixas.ixa_tsl = NULL;
2189 	ixas.ixa_zoneid = zoneid;
2190 
2191 	ip6h = (ip6_t *)mp->b_rptr;
2192 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2193 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2194 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2195 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2196 	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2197 	ip6h->ip6_dst = *target;
2198 	icmp6 = (icmp6_t *)&ip6h[1];
2199 
2200 	if (hw_addr_len != 0) {
2201 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2202 		    sizeof (nd_neighbor_advert_t));
2203 	} else {
2204 		opt = NULL;
2205 	}
2206 	if (operation == ND_NEIGHBOR_SOLICIT) {
2207 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2208 
2209 		if (opt != NULL && !(flag & NDP_PROBE)) {
2210 			/*
2211 			 * Note that we don't send out SLLA for ND probes
2212 			 * per RFC 4862, even though we do send out the src
2213 			 * haddr for IPv4 DAD probes, even though both IPv4
2214 			 * and IPv6 go out with the unspecified/INADDR_ANY
2215 			 * src IP addr.
2216 			 */
2217 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2218 		}
2219 		ip6h->ip6_src = *sender;
2220 		ns->nd_ns_target = *target;
2221 		if (!(flag & NDP_UNICAST)) {
2222 			/* Form multicast address of the target */
2223 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2224 			ip6h->ip6_dst.s6_addr32[3] |=
2225 			    ns->nd_ns_target.s6_addr32[3];
2226 		}
2227 	} else {
2228 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2229 
2230 		ASSERT(!(flag & NDP_PROBE));
2231 		if (opt != NULL)
2232 			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2233 		ip6h->ip6_src = *sender;
2234 		na->nd_na_target = *sender;
2235 		if (flag & NDP_ISROUTER)
2236 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2237 		if (flag & NDP_SOLICITED)
2238 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2239 		if (flag & NDP_ORIDE)
2240 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2241 	}
2242 
2243 	if (!(flag & NDP_PROBE)) {
2244 		if (hw_addr != NULL && opt != NULL) {
2245 			/* Fill in link layer address and option len */
2246 			opt->nd_opt_len = (uint8_t)plen;
2247 			bcopy(hw_addr, &opt[1], hw_addr_len);
2248 		}
2249 	}
2250 	if (opt != NULL && opt->nd_opt_type == 0) {
2251 		/* If there's no link layer address option, then strip it. */
2252 		len -= plen * 8;
2253 		mp->b_wptr = mp->b_rptr + len;
2254 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2255 	}
2256 
2257 	icmp6->icmp6_type = (uint8_t)operation;
2258 	icmp6->icmp6_code = 0;
2259 	/*
2260 	 * Prepare for checksum by putting icmp length in the icmp
2261 	 * checksum field. The checksum is calculated in ip_output.c.
2262 	 */
2263 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2264 
2265 	(void) ip_output_simple(mp, &ixas);
2266 	ixa_cleanup(&ixas);
2267 	if (need_refrele)
2268 		ill_refrele(ill);
2269 	return (B_FALSE);
2270 }
2271 
2272 /*
2273  * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2274  * The datapath uses this as an indication that there
2275  * is a problem (as opposed to a NCE that was just
2276  * reclaimed due to lack of memory.
2277  * Note that static ARP entries never become unreachable.
2278  */
2279 void
2280 nce_make_unreachable(ncec_t *ncec)
2281 {
2282 	mutex_enter(&ncec->ncec_lock);
2283 	ncec->ncec_state = ND_UNREACHABLE;
2284 	mutex_exit(&ncec->ncec_lock);
2285 }
2286 
2287 /*
2288  * NCE retransmit timer. Common to IPv4 and IPv6.
2289  * This timer goes off when:
2290  * a. It is time to retransmit a resolution for resolver.
2291  * b. It is time to send reachability probes.
2292  */
2293 void
2294 nce_timer(void *arg)
2295 {
2296 	ncec_t		*ncec = arg;
2297 	ill_t		*ill = ncec->ncec_ill, *src_ill;
2298 	char		addrbuf[INET6_ADDRSTRLEN];
2299 	boolean_t	dropped = B_FALSE;
2300 	ip_stack_t	*ipst = ncec->ncec_ipst;
2301 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2302 	in_addr_t	sender4 = INADDR_ANY;
2303 	in6_addr_t	sender6 = ipv6_all_zeros;
2304 
2305 	/*
2306 	 * The timer has to be cancelled by ncec_delete before doing the final
2307 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2308 	 * until it clears the timeout_id. Before clearing the timeout_id
2309 	 * bump up the refcnt so that we can continue to use the ncec
2310 	 */
2311 	ASSERT(ncec != NULL);
2312 	mutex_enter(&ncec->ncec_lock);
2313 	ncec_refhold_locked(ncec);
2314 	ncec->ncec_timeout_id = 0;
2315 	mutex_exit(&ncec->ncec_lock);
2316 
2317 	src_ill = nce_resolve_src(ncec, &sender6);
2318 	/* if we could not find a sender address, return */
2319 	if (src_ill == NULL) {
2320 		if (!isv6) {
2321 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2322 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2323 			    &sender4, addrbuf, sizeof (addrbuf))));
2324 		} else {
2325 			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2326 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2327 		}
2328 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2329 		ncec_refrele(ncec);
2330 		return;
2331 	}
2332 	if (!isv6)
2333 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2334 
2335 	mutex_enter(&ncec->ncec_lock);
2336 	/*
2337 	 * Check the reachability state.
2338 	 */
2339 	switch (ncec->ncec_state) {
2340 	case ND_DELAY:
2341 		ASSERT(ncec->ncec_lladdr != NULL);
2342 		ncec->ncec_state = ND_PROBE;
2343 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2344 		if (isv6) {
2345 			mutex_exit(&ncec->ncec_lock);
2346 			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2347 			    src_ill->ill_phys_addr,
2348 			    src_ill->ill_phys_addr_length,
2349 			    &sender6, &ncec->ncec_addr,
2350 			    NDP_UNICAST);
2351 		} else {
2352 			dropped = arp_request(ncec, sender4, src_ill);
2353 			mutex_exit(&ncec->ncec_lock);
2354 		}
2355 		if (!dropped) {
2356 			mutex_enter(&ncec->ncec_lock);
2357 			ncec->ncec_pcnt--;
2358 			mutex_exit(&ncec->ncec_lock);
2359 		}
2360 		if (ip_debug > 3) {
2361 			/* ip2dbg */
2362 			pr_addr_dbg("nce_timer: state for %s changed "
2363 			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2364 		}
2365 		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2366 		break;
2367 	case ND_PROBE:
2368 		/* must be retransmit timer */
2369 		ASSERT(ncec->ncec_pcnt >= -1);
2370 		if (ncec->ncec_pcnt > 0) {
2371 			/*
2372 			 * As per RFC2461, the ncec gets deleted after
2373 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2374 			 * Note that the first unicast solicitation is sent
2375 			 * during the DELAY state.
2376 			 */
2377 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2378 			    ncec->ncec_pcnt,
2379 			    inet_ntop((isv6? AF_INET6 : AF_INET),
2380 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2381 			if (NCE_PUBLISH(ncec)) {
2382 				mutex_exit(&ncec->ncec_lock);
2383 				/*
2384 				 * send out a probe; note that src_ill
2385 				 * is ignored by nce_dad() for all
2386 				 * DAD message types other than IPv6
2387 				 * unicast probes
2388 				 */
2389 				nce_dad(ncec, src_ill, B_TRUE);
2390 			} else {
2391 				ASSERT(src_ill != NULL);
2392 				if (isv6) {
2393 					mutex_exit(&ncec->ncec_lock);
2394 					dropped = ndp_xmit(src_ill,
2395 					    ND_NEIGHBOR_SOLICIT,
2396 					    src_ill->ill_phys_addr,
2397 					    src_ill->ill_phys_addr_length,
2398 					    &sender6, &ncec->ncec_addr,
2399 					    NDP_UNICAST);
2400 				} else {
2401 					/*
2402 					 * since the nce is REACHABLE,
2403 					 * the ARP request will be sent out
2404 					 * as a link-layer unicast.
2405 					 */
2406 					dropped = arp_request(ncec, sender4,
2407 					    src_ill);
2408 					mutex_exit(&ncec->ncec_lock);
2409 				}
2410 				if (!dropped) {
2411 					mutex_enter(&ncec->ncec_lock);
2412 					ncec->ncec_pcnt--;
2413 					mutex_exit(&ncec->ncec_lock);
2414 				}
2415 				nce_restart_timer(ncec,
2416 				    ill->ill_reachable_retrans_time);
2417 			}
2418 		} else if (ncec->ncec_pcnt < 0) {
2419 			/* No hope, delete the ncec */
2420 			/* Tell datapath it went bad */
2421 			ncec->ncec_state = ND_UNREACHABLE;
2422 			mutex_exit(&ncec->ncec_lock);
2423 			if (ip_debug > 2) {
2424 				/* ip1dbg */
2425 				pr_addr_dbg("nce_timer: Delete NCE for"
2426 				    " dst %s\n", (isv6? AF_INET6: AF_INET),
2427 				    &ncec->ncec_addr);
2428 			}
2429 			/* if static ARP can't delete. */
2430 			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2431 				ncec_delete(ncec);
2432 
2433 		} else if (!NCE_PUBLISH(ncec)) {
2434 			/*
2435 			 * Probe count is 0 for a dynamic entry (one that we
2436 			 * ourselves are not publishing). We should never get
2437 			 * here if NONUD was requested, hence the ASSERT below.
2438 			 */
2439 			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2440 			ip2dbg(("nce_timer: pcount=%x dst %s\n",
2441 			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
2442 			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2443 			ncec->ncec_pcnt--;
2444 			mutex_exit(&ncec->ncec_lock);
2445 			/* Wait one interval before killing */
2446 			nce_restart_timer(ncec,
2447 			    ill->ill_reachable_retrans_time);
2448 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2449 			ipif_t *ipif;
2450 			ipaddr_t ncec_addr;
2451 
2452 			/*
2453 			 * We're done probing, and we can now declare this
2454 			 * address to be usable.  Let IP know that it's ok to
2455 			 * use.
2456 			 */
2457 			ncec->ncec_state = ND_REACHABLE;
2458 			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2459 			mutex_exit(&ncec->ncec_lock);
2460 			if (isv6) {
2461 				ipif = ipif_lookup_addr_exact_v6(
2462 				    &ncec->ncec_addr, ill, ipst);
2463 			} else {
2464 				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2465 				    ncec_addr);
2466 				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2467 				    ipst);
2468 			}
2469 			if (ipif != NULL) {
2470 				if (ipif->ipif_was_dup) {
2471 					char ibuf[LIFNAMSIZ];
2472 					char sbuf[INET6_ADDRSTRLEN];
2473 
2474 					ipif->ipif_was_dup = B_FALSE;
2475 					(void) inet_ntop(AF_INET6,
2476 					    &ipif->ipif_v6lcl_addr,
2477 					    sbuf, sizeof (sbuf));
2478 					ipif_get_name(ipif, ibuf,
2479 					    sizeof (ibuf));
2480 					cmn_err(CE_NOTE, "recovered address "
2481 					    "%s on %s", sbuf, ibuf);
2482 				}
2483 				if ((ipif->ipif_flags & IPIF_UP) &&
2484 				    !ipif->ipif_addr_ready)
2485 					ipif_up_notify(ipif);
2486 				ipif->ipif_addr_ready = 1;
2487 				ipif_refrele(ipif);
2488 			}
2489 			if (!isv6 && arp_no_defense)
2490 				break;
2491 			/* Begin defending our new address */
2492 			if (ncec->ncec_unsolicit_count > 0) {
2493 				ncec->ncec_unsolicit_count--;
2494 				if (isv6) {
2495 					dropped = ndp_announce(ncec);
2496 				} else {
2497 					dropped = arp_announce(ncec);
2498 				}
2499 
2500 				if (dropped)
2501 					ncec->ncec_unsolicit_count++;
2502 				else
2503 					ncec->ncec_last_time_defended =
2504 					    ddi_get_lbolt();
2505 			}
2506 			if (ncec->ncec_unsolicit_count > 0) {
2507 				nce_restart_timer(ncec,
2508 				    ANNOUNCE_INTERVAL(isv6));
2509 			} else if (DEFENSE_INTERVAL(isv6) != 0) {
2510 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2511 			}
2512 		} else {
2513 			/*
2514 			 * This is an address we're probing to be our own, but
2515 			 * the ill is down.  Wait until it comes back before
2516 			 * doing anything, but switch to reachable state so
2517 			 * that the restart will work.
2518 			 */
2519 			ncec->ncec_state = ND_REACHABLE;
2520 			mutex_exit(&ncec->ncec_lock);
2521 		}
2522 		break;
2523 	case ND_INCOMPLETE: {
2524 		mblk_t	*mp, *nextmp;
2525 		mblk_t	**prevmpp;
2526 
2527 		/*
2528 		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2529 		 * for any IPMP probe packets, and toss them.  IPMP probe
2530 		 * packets will always be at the head of ncec_qd_mp, so that
2531 		 * we can stop at the first queued ND packet that is
2532 		 * not a probe packet.
2533 		 */
2534 		prevmpp = &ncec->ncec_qd_mp;
2535 		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2536 			nextmp = mp->b_next;
2537 
2538 			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2539 				inet_freemsg(mp);
2540 				ncec->ncec_nprobes--;
2541 				*prevmpp = nextmp;
2542 			} else {
2543 				prevmpp = &mp->b_next;
2544 			}
2545 		}
2546 
2547 		/*
2548 		 * Must be resolver's retransmit timer.
2549 		 */
2550 		mutex_exit(&ncec->ncec_lock);
2551 		ip_ndp_resolve(ncec);
2552 		break;
2553 	}
2554 	case ND_REACHABLE:
2555 		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2556 		    ncec->ncec_unsolicit_count != 0) ||
2557 		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2558 			if (ncec->ncec_unsolicit_count > 0) {
2559 				ncec->ncec_unsolicit_count--;
2560 				mutex_exit(&ncec->ncec_lock);
2561 				/*
2562 				 * When we get to zero announcements left,
2563 				 * switch to address defense
2564 				 */
2565 			} else {
2566 				boolean_t rate_limit;
2567 
2568 				mutex_exit(&ncec->ncec_lock);
2569 				rate_limit = ill_defend_rate_limit(ill, ncec);
2570 				if (rate_limit) {
2571 					nce_restart_timer(ncec,
2572 					    DEFENSE_INTERVAL(isv6));
2573 					break;
2574 				}
2575 			}
2576 			if (isv6) {
2577 				dropped = ndp_announce(ncec);
2578 			} else {
2579 				dropped = arp_announce(ncec);
2580 			}
2581 			mutex_enter(&ncec->ncec_lock);
2582 			if (dropped) {
2583 				ncec->ncec_unsolicit_count++;
2584 			} else {
2585 				ncec->ncec_last_time_defended =
2586 				    ddi_get_lbolt();
2587 			}
2588 			mutex_exit(&ncec->ncec_lock);
2589 			if (ncec->ncec_unsolicit_count != 0) {
2590 				nce_restart_timer(ncec,
2591 				    ANNOUNCE_INTERVAL(isv6));
2592 			} else {
2593 				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2594 			}
2595 		} else {
2596 			mutex_exit(&ncec->ncec_lock);
2597 		}
2598 		break;
2599 	default:
2600 		mutex_exit(&ncec->ncec_lock);
2601 		break;
2602 	}
2603 done:
2604 	ncec_refrele(ncec);
2605 	ill_refrele(src_ill);
2606 }
2607 
2608 /*
2609  * Set a link layer address from the ll_addr passed in.
2610  * Copy SAP from ill.
2611  */
2612 static void
2613 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2614 {
2615 	ill_t	*ill = ncec->ncec_ill;
2616 
2617 	ASSERT(ll_addr != NULL);
2618 	if (ill->ill_phys_addr_length > 0) {
2619 		/*
2620 		 * The bcopy() below used to be called for the physical address
2621 		 * length rather than the link layer address length. For
2622 		 * ethernet and many other media, the phys_addr and lla are
2623 		 * identical.
2624 		 *
2625 		 * The phys_addr and lla may not be the same for devices that
2626 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2627 		 * no known instances of these.
2628 		 *
2629 		 * For PPP or other interfaces with a zero length
2630 		 * physical address, don't do anything here.
2631 		 * The bcopy() with a zero phys_addr length was previously
2632 		 * a no-op for interfaces with a zero-length physical address.
2633 		 * Using the lla for them would change the way they operate.
2634 		 * Doing nothing in such cases preserves expected behavior.
2635 		 */
2636 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2637 	}
2638 }
2639 
2640 boolean_t
2641 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2642     uint32_t ll_addr_len)
2643 {
2644 	ASSERT(ncec->ncec_lladdr != NULL);
2645 	if (ll_addr == NULL)
2646 		return (B_FALSE);
2647 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2648 		return (B_TRUE);
2649 	return (B_FALSE);
2650 }
2651 
2652 /*
2653  * Updates the link layer address or the reachability state of
2654  * a cache entry.  Reset probe counter if needed.
2655  */
2656 void
2657 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2658 {
2659 	ill_t	*ill = ncec->ncec_ill;
2660 	boolean_t need_stop_timer = B_FALSE;
2661 	boolean_t need_fastpath_update = B_FALSE;
2662 	nce_t	*nce = NULL;
2663 	timeout_id_t tid;
2664 
2665 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2666 	/*
2667 	 * If this interface does not do NUD, there is no point
2668 	 * in allowing an update to the cache entry.  Although
2669 	 * we will respond to NS.
2670 	 * The only time we accept an update for a resolver when
2671 	 * NUD is turned off is when it has just been created.
2672 	 * Non-Resolvers will always be created as REACHABLE.
2673 	 */
2674 	if (new_state != ND_UNCHANGED) {
2675 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2676 		    (ncec->ncec_state != ND_INCOMPLETE))
2677 			return;
2678 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2679 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2680 		need_stop_timer = B_TRUE;
2681 		if (new_state == ND_REACHABLE)
2682 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2683 		else {
2684 			/* We force NUD in this case */
2685 			ncec->ncec_last = 0;
2686 		}
2687 		ncec->ncec_state = new_state;
2688 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2689 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2690 		    new_state == ND_INCOMPLETE);
2691 	}
2692 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2693 		tid = ncec->ncec_timeout_id;
2694 		ncec->ncec_timeout_id = 0;
2695 	}
2696 	/*
2697 	 * Re-trigger fastpath probe and
2698 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2699 	 * whatever packets that happens to be transmitting at the time.
2700 	 */
2701 	if (new_ll_addr != NULL) {
2702 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2703 		    ill->ill_phys_addr_length);
2704 		need_fastpath_update = B_TRUE;
2705 	}
2706 	mutex_exit(&ncec->ncec_lock);
2707 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2708 		if (tid != 0)
2709 			(void) untimeout(tid);
2710 	}
2711 	if (need_fastpath_update) {
2712 		/*
2713 		 * Delete any existing existing dlur_mp and fp_mp information.
2714 		 * For IPMP interfaces, all underlying ill's must be checked
2715 		 * and purged.
2716 		 */
2717 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2718 		/*
2719 		 * add the new dlur_mp and fp_mp
2720 		 */
2721 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2722 		if (nce != NULL)
2723 			nce_refrele(nce);
2724 	}
2725 	mutex_enter(&ncec->ncec_lock);
2726 }
2727 
2728 static void
2729 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2730 {
2731 	uint_t	count = 0;
2732 	mblk_t  **mpp, *tmp;
2733 
2734 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2735 
2736 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2737 		if (++count > ncec->ncec_ill->ill_max_buf) {
2738 			tmp = ncec->ncec_qd_mp->b_next;
2739 			ncec->ncec_qd_mp->b_next = NULL;
2740 			/*
2741 			 * if we never create data addrs on the under_ill
2742 			 * does this matter?
2743 			 */
2744 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2745 			    ipIfStatsOutDiscards);
2746 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2747 			    ncec->ncec_ill);
2748 			freemsg(ncec->ncec_qd_mp);
2749 			ncec->ncec_qd_mp = tmp;
2750 		}
2751 	}
2752 
2753 	if (head_insert) {
2754 		ncec->ncec_nprobes++;
2755 		mp->b_next = ncec->ncec_qd_mp;
2756 		ncec->ncec_qd_mp = mp;
2757 	} else {
2758 		*mpp = mp;
2759 	}
2760 }
2761 
2762 /*
2763  * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2764  * queued at the head or tail of the queue based on the input argument
2765  * 'head_insert'. The caller should specify this argument as B_TRUE if this
2766  * packet is an IPMP probe packet, in which case the following happens:
2767  *
2768  *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
2769  *	(non-ipmp_probe) load-speading case where the source address of the ND
2770  *	packet is not tied to ncec_ill. If the ill bound to the source address
2771  *	cannot receive, the response to the ND packet will not be received.
2772  *	However, if ND packets for ncec_ill's probes are queued	behind that ND
2773  *	packet, those probes will also fail to be sent, and thus in.mpathd will
2774  *	 erroneously conclude that ncec_ill has also failed.
2775  *
2776  *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did	not succeed on
2777  *	the first attempt.  This ensures that ND problems do not manifest as
2778  *	probe RTT spikes.
2779  *
2780  * We achieve this by inserting ipmp_probe() packets at the head of the
2781  * nce_queue.
2782  *
2783  * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2784  * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2785  */
2786 void
2787 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2788 {
2789 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2790 	nce_queue_mp_common(ncec, mp, head_insert);
2791 }
2792 
2793 /*
2794  * Called when address resolution failed due to a timeout.
2795  * Send an ICMP unreachable in response to all queued packets.
2796  */
2797 void
2798 ndp_resolv_failed(ncec_t *ncec)
2799 {
2800 	mblk_t	*mp, *nxt_mp;
2801 	char	buf[INET6_ADDRSTRLEN];
2802 	ill_t *ill = ncec->ncec_ill;
2803 	ip_recv_attr_t	iras;
2804 
2805 	bzero(&iras, sizeof (iras));
2806 	iras.ira_flags = 0;
2807 	/*
2808 	 * we are setting the ira_rill to the ipmp_ill (instead of
2809 	 * the actual ill on which the packet was received), but this
2810 	 * is ok because we don't actually need the real ira_rill.
2811 	 * to send the icmp unreachable to the sender.
2812 	 */
2813 	iras.ira_ill = iras.ira_rill = ill;
2814 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2815 	iras.ira_rifindex = iras.ira_ruifindex;
2816 
2817 	ip1dbg(("ndp_resolv_failed: dst %s\n",
2818 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2819 	mutex_enter(&ncec->ncec_lock);
2820 	mp = ncec->ncec_qd_mp;
2821 	ncec->ncec_qd_mp = NULL;
2822 	ncec->ncec_nprobes = 0;
2823 	mutex_exit(&ncec->ncec_lock);
2824 	while (mp != NULL) {
2825 		nxt_mp = mp->b_next;
2826 		mp->b_next = NULL;
2827 
2828 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2829 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2830 		    mp, ill);
2831 		icmp_unreachable_v6(mp,
2832 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2833 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2834 		mp = nxt_mp;
2835 	}
2836 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2837 }
2838 
2839 /*
2840  * Handle the completion of NDP and ARP resolution.
2841  */
2842 void
2843 nce_resolv_ok(ncec_t *ncec)
2844 {
2845 	mblk_t *mp;
2846 	uint_t pkt_len;
2847 	iaflags_t ixaflags = IXAF_NO_TRACE;
2848 	nce_t *nce;
2849 	ill_t	*ill = ncec->ncec_ill;
2850 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2851 	ip_stack_t *ipst = ill->ill_ipst;
2852 
2853 	if (IS_IPMP(ncec->ncec_ill)) {
2854 		nce_resolv_ipmp_ok(ncec);
2855 		return;
2856 	}
2857 	/* non IPMP case */
2858 
2859 	mutex_enter(&ncec->ncec_lock);
2860 	ASSERT(ncec->ncec_nprobes == 0);
2861 	mp = ncec->ncec_qd_mp;
2862 	ncec->ncec_qd_mp = NULL;
2863 	mutex_exit(&ncec->ncec_lock);
2864 
2865 	while (mp != NULL) {
2866 		mblk_t *nxt_mp;
2867 
2868 		if (ill->ill_isv6) {
2869 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2870 
2871 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2872 		} else {
2873 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
2874 
2875 			ixaflags |= IXAF_IS_IPV4;
2876 			pkt_len = ntohs(ipha->ipha_length);
2877 		}
2878 		nxt_mp = mp->b_next;
2879 		mp->b_next = NULL;
2880 		/*
2881 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2882 		 * longer available, but it's ok to drop this flag because TCP
2883 		 * has its own flow-control in effect, so TCP packets
2884 		 * are not likely to get here when flow-control is in effect.
2885 		 */
2886 		mutex_enter(&ill->ill_lock);
2887 		nce = nce_lookup(ill, &ncec->ncec_addr);
2888 		mutex_exit(&ill->ill_lock);
2889 
2890 		if (nce == NULL) {
2891 			if (isv6) {
2892 				BUMP_MIB(&ipst->ips_ip6_mib,
2893 				    ipIfStatsOutDiscards);
2894 			} else {
2895 				BUMP_MIB(&ipst->ips_ip_mib,
2896 				    ipIfStatsOutDiscards);
2897 			}
2898 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2899 			    mp, NULL);
2900 			freemsg(mp);
2901 		} else {
2902 			/*
2903 			 * We don't know the zoneid, but
2904 			 * ip_xmit does not care since IXAF_NO_TRACE
2905 			 * is set. (We traced the packet the first
2906 			 * time through ip_xmit.)
2907 			 */
2908 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2909 			    ALL_ZONES, 0, NULL);
2910 			nce_refrele(nce);
2911 		}
2912 		mp = nxt_mp;
2913 	}
2914 
2915 	ncec_cb_dispatch(ncec); /* complete callbacks */
2916 }
2917 
2918 /*
2919  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2920  * and the corresponding attributes.
2921  * Disallow states other than ND_REACHABLE or ND_STALE.
2922  */
2923 int
2924 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2925 {
2926 	sin6_t		*sin6;
2927 	in6_addr_t	*addr;
2928 	ncec_t		*ncec;
2929 	nce_t		*nce;
2930 	int		err = 0;
2931 	uint16_t	new_flags = 0;
2932 	uint16_t	old_flags = 0;
2933 	int		inflags = lnr->lnr_flags;
2934 	ip_stack_t	*ipst = ill->ill_ipst;
2935 	boolean_t	do_postprocess = B_FALSE;
2936 
2937 	ASSERT(ill->ill_isv6);
2938 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2939 	    (lnr->lnr_state_create != ND_STALE))
2940 		return (EINVAL);
2941 
2942 	sin6 = (sin6_t *)&lnr->lnr_addr;
2943 	addr = &sin6->sin6_addr;
2944 
2945 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2946 	ASSERT(!IS_UNDER_IPMP(ill));
2947 	nce = nce_lookup_addr(ill, addr);
2948 	if (nce != NULL)
2949 		new_flags = nce->nce_common->ncec_flags;
2950 
2951 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2952 	case NDF_ISROUTER_ON:
2953 		new_flags |= NCE_F_ISROUTER;
2954 		break;
2955 	case NDF_ISROUTER_OFF:
2956 		new_flags &= ~NCE_F_ISROUTER;
2957 		break;
2958 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2959 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2960 		if (nce != NULL)
2961 			nce_refrele(nce);
2962 		return (EINVAL);
2963 	}
2964 
2965 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2966 	case NDF_ANYCAST_ON:
2967 		new_flags |= NCE_F_ANYCAST;
2968 		break;
2969 	case NDF_ANYCAST_OFF:
2970 		new_flags &= ~NCE_F_ANYCAST;
2971 		break;
2972 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2973 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2974 		if (nce != NULL)
2975 			nce_refrele(nce);
2976 		return (EINVAL);
2977 	}
2978 
2979 	if (nce == NULL) {
2980 		err = nce_add_v6(ill,
2981 		    (uchar_t *)lnr->lnr_hdw_addr,
2982 		    ill->ill_phys_addr_length,
2983 		    addr,
2984 		    new_flags,
2985 		    lnr->lnr_state_create,
2986 		    &nce);
2987 		if (err != 0) {
2988 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2989 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
2990 			return (err);
2991 		} else {
2992 			do_postprocess = B_TRUE;
2993 		}
2994 	}
2995 	ncec = nce->nce_common;
2996 	old_flags = ncec->ncec_flags;
2997 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
2998 		ncec_router_to_host(ncec);
2999 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3000 		if (do_postprocess)
3001 			err = nce_add_v6_postprocess(nce);
3002 		nce_refrele(nce);
3003 		return (0);
3004 	}
3005 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3006 
3007 	if (do_postprocess)
3008 		err = nce_add_v6_postprocess(nce);
3009 	/*
3010 	 * err cannot be anything other than 0 because we don't support
3011 	 * proxy arp of static addresses.
3012 	 */
3013 	ASSERT(err == 0);
3014 
3015 	mutex_enter(&ncec->ncec_lock);
3016 	ncec->ncec_flags = new_flags;
3017 	mutex_exit(&ncec->ncec_lock);
3018 	/*
3019 	 * Note that we ignore the state at this point, which
3020 	 * should be either STALE or REACHABLE.  Instead we let
3021 	 * the link layer address passed in to determine the state
3022 	 * much like incoming packets.
3023 	 */
3024 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3025 	nce_refrele(nce);
3026 	return (0);
3027 }
3028 
3029 /*
3030  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3031  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3032  * be held to ensure that they are in the same group.
3033  */
3034 static nce_t *
3035 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3036 {
3037 
3038 	nce_t *nce;
3039 
3040 	nce = nce_ill_lookup_then_add(ill, ncec);
3041 
3042 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3043 		return (nce);
3044 
3045 	/*
3046 	 * hold the ncec_lock to synchronize with nce_update() so that,
3047 	 * at the end of this function, the contents of nce_dlur_mp are
3048 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3049 	 * packet may have been sent out with a mangled address, which would
3050 	 * only be a transient condition.
3051 	 */
3052 	mutex_enter(&ncec->ncec_lock);
3053 	if (ncec->ncec_lladdr != NULL) {
3054 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3055 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3056 	} else {
3057 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3058 		    ill->ill_sap_length);
3059 	}
3060 	mutex_exit(&ncec->ncec_lock);
3061 	return (nce);
3062 }
3063 
3064 /*
3065  * we make nce_fp_mp to have an M_DATA prepend.
3066  * The caller ensures there is hold on ncec for this function.
3067  * Note that since ill_fastpath_probe() copies the mblk there is
3068  * no need to hold the nce or ncec beyond this function.
3069  *
3070  * If the caller has passed in a non-null ncec_nce to nce_faspath() that
3071  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3072  * and will be returned back by this function, so that no extra nce_refrele
3073  * is required for the caller. The calls from nce_add_common() use this
3074  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3075  * nce_refrele of the returned nce (when it is non-null).
3076  */
3077 nce_t *
3078 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3079 {
3080 	nce_t *nce;
3081 	ill_t *ill = ncec->ncec_ill;
3082 
3083 	ASSERT(ill != NULL);
3084 
3085 	if (IS_IPMP(ill) && trigger_fp_req) {
3086 		trigger_fp_req = B_FALSE;
3087 		ipmp_ncec_fastpath(ncec, ill);
3088 
3089 	}
3090 	/*
3091 	 * If the caller already has the nce corresponding to the ill, use
3092 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3093 	 * nce_add_common() fall in the former category, and have just done
3094 	 * the nce lookup/add that can be reused.
3095 	 */
3096 	if (ncec_nce == NULL)
3097 		nce = nce_fastpath_create(ill, ncec);
3098 	else
3099 		nce = ncec_nce;
3100 
3101 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3102 		return (nce);
3103 
3104 	if (trigger_fp_req)
3105 		nce_fastpath_trigger(nce);
3106 	return (nce);
3107 }
3108 
3109 /*
3110  * Trigger fastpath on nce. No locks may be held.
3111  */
3112 static void
3113 nce_fastpath_trigger(nce_t *nce)
3114 {
3115 	int res;
3116 	ill_t *ill = nce->nce_ill;
3117 	ncec_t *ncec = nce->nce_common;
3118 
3119 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3120 	/*
3121 	 * EAGAIN is an indication of a transient error
3122 	 * i.e. allocation failure etc. leave the ncec in the list it
3123 	 * will be updated when another probe happens for another ire
3124 	 * if not it will be taken out of the list when the ire is
3125 	 * deleted.
3126 	 */
3127 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3128 		nce_fastpath_list_delete(ill, ncec, NULL);
3129 }
3130 
3131 /*
3132  * Add ncec to the nce fastpath list on ill.
3133  */
3134 static nce_t *
3135 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3136 {
3137 	nce_t *nce = NULL;
3138 
3139 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3140 	/*
3141 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3142 	 * down, before adding the NCE.
3143 	 */
3144 	if (ill->ill_state_flags & ILL_CONDEMNED)
3145 		return (NULL);
3146 	mutex_enter(&ncec->ncec_lock);
3147 	/*
3148 	 * if ncec has not been deleted and
3149 	 * is not already in the list add it.
3150 	 */
3151 	if (!NCE_ISCONDEMNED(ncec)) {
3152 		nce = nce_lookup(ill, &ncec->ncec_addr);
3153 		if (nce != NULL)
3154 			goto done;
3155 		nce = nce_add(ill, ncec);
3156 	}
3157 done:
3158 	mutex_exit(&ncec->ncec_lock);
3159 	return (nce);
3160 }
3161 
3162 nce_t *
3163 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3164 {
3165 	nce_t *nce;
3166 
3167 	mutex_enter(&ill->ill_lock);
3168 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
3169 	mutex_exit(&ill->ill_lock);
3170 	return (nce);
3171 }
3172 
3173 
3174 /*
3175  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3176  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3177  * entry after all locks have been dropped.
3178  */
3179 void
3180 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3181 {
3182 	nce_t *nce;
3183 
3184 	ASSERT(ill != NULL);
3185 
3186 	/* first clean out any nce pointers in the under_ills */
3187 	if (IS_IPMP(ill))
3188 		ipmp_ncec_flush_nce(ncec);
3189 
3190 	/* now the ill itself */
3191 	mutex_enter(&ill->ill_lock);
3192 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3193 	    nce = list_next(&ill->ill_nce, nce)) {
3194 		if (nce->nce_common == ncec) {
3195 			nce_refhold(nce);
3196 			nce_delete(nce);
3197 			break;
3198 		}
3199 	}
3200 	mutex_exit(&ill->ill_lock);
3201 	if (nce != NULL) {
3202 		if (dead == NULL)
3203 			nce_refrele(nce);
3204 		else
3205 			list_insert_tail(dead, nce);
3206 	}
3207 }
3208 
3209 /*
3210  * when the fastpath response does not fit in the datab
3211  * associated with the existing nce_fp_mp, we delete and
3212  * add the nce to retrigger fastpath based on the information
3213  * in the ncec_t.
3214  */
3215 static nce_t *
3216 nce_delete_then_add(nce_t *nce)
3217 {
3218 	ill_t		*ill = nce->nce_ill;
3219 	nce_t		*newnce = NULL;
3220 
3221 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3222 	    (void *)nce, ill->ill_name));
3223 	mutex_enter(&ill->ill_lock);
3224 	mutex_enter(&nce->nce_common->ncec_lock);
3225 	nce_delete(nce);
3226 	/*
3227 	 * Make sure that ncec is not condemned before adding. We hold the
3228 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3229 	 * ipmp_ncec_flush_nce()
3230 	 */
3231 	if (!NCE_ISCONDEMNED(nce->nce_common))
3232 		newnce = nce_add(ill, nce->nce_common);
3233 	mutex_exit(&nce->nce_common->ncec_lock);
3234 	mutex_exit(&ill->ill_lock);
3235 	nce_refrele(nce);
3236 	return (newnce); /* could be null if nomem */
3237 }
3238 
3239 typedef struct nce_fp_match_s {
3240 	nce_t	*nce_fp_match_res;
3241 	mblk_t	*nce_fp_match_ack_mp;
3242 } nce_fp_match_t;
3243 
3244 /* ARGSUSED */
3245 static int
3246 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3247 {
3248 	nce_fp_match_t	*nce_fp_marg = arg;
3249 	ncec_t		*ncec = nce->nce_common;
3250 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3251 	uchar_t	*mp_rptr, *ud_mp_rptr;
3252 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3253 	ptrdiff_t	cmplen;
3254 
3255 	/*
3256 	 * mp is the mp associated with the fastpath ack.
3257 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3258 	 * under consideration. If the contents match, then the
3259 	 * fastpath ack is used to update the nce.
3260 	 */
3261 	if (ud_mp == NULL)
3262 		return (0);
3263 	mp_rptr = mp->b_rptr;
3264 	cmplen = mp->b_wptr - mp_rptr;
3265 	ASSERT(cmplen >= 0);
3266 
3267 	ud_mp_rptr = ud_mp->b_rptr;
3268 	/*
3269 	 * The ncec is locked here to prevent any other threads from accessing
3270 	 * and changing nce_dlur_mp when the address becomes resolved to an
3271 	 * lla while we're in the middle of looking at and comparing the
3272 	 * hardware address (lla). It is also locked to prevent multiple
3273 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3274 	 * time.
3275 	 */
3276 	mutex_enter(&ncec->ncec_lock);
3277 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3278 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3279 		nce_fp_marg->nce_fp_match_res = nce;
3280 		mutex_exit(&ncec->ncec_lock);
3281 		nce_refhold(nce);
3282 		return (1);
3283 	}
3284 	mutex_exit(&ncec->ncec_lock);
3285 	return (0);
3286 }
3287 
3288 /*
3289  * Update all NCE's that are not in fastpath mode and
3290  * have an nce_fp_mp that matches mp. mp->b_cont contains
3291  * the fastpath header.
3292  *
3293  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3294  */
3295 void
3296 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3297 {
3298 	nce_fp_match_t nce_fp_marg;
3299 	nce_t *nce;
3300 	mblk_t *nce_fp_mp, *fp_mp;
3301 
3302 	nce_fp_marg.nce_fp_match_res = NULL;
3303 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3304 
3305 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3306 
3307 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3308 		return;
3309 
3310 	mutex_enter(&nce->nce_lock);
3311 	nce_fp_mp = nce->nce_fp_mp;
3312 
3313 	if (nce_fp_mp != NULL) {
3314 		fp_mp = mp->b_cont;
3315 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3316 		    nce_fp_mp->b_datap->db_lim) {
3317 			mutex_exit(&nce->nce_lock);
3318 			nce = nce_delete_then_add(nce);
3319 			if (nce == NULL) {
3320 				return;
3321 			}
3322 			mutex_enter(&nce->nce_lock);
3323 			nce_fp_mp = nce->nce_fp_mp;
3324 		}
3325 	}
3326 
3327 	/* Matched - install mp as the fastpath mp */
3328 	if (nce_fp_mp == NULL) {
3329 		fp_mp = dupb(mp->b_cont);
3330 		nce->nce_fp_mp = fp_mp;
3331 	} else {
3332 		fp_mp = mp->b_cont;
3333 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3334 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3335 		    + MBLKL(fp_mp);
3336 	}
3337 	mutex_exit(&nce->nce_lock);
3338 	nce_refrele(nce);
3339 }
3340 
3341 /*
3342  * Return a pointer to a given option in the packet.
3343  * Assumes that option part of the packet have already been validated.
3344  */
3345 nd_opt_hdr_t *
3346 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3347 {
3348 	while (optlen > 0) {
3349 		if (opt->nd_opt_type == opt_type)
3350 			return (opt);
3351 		optlen -= 8 * opt->nd_opt_len;
3352 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3353 	}
3354 	return (NULL);
3355 }
3356 
3357 /*
3358  * Verify all option lengths present are > 0, also check to see
3359  * if the option lengths and packet length are consistent.
3360  */
3361 boolean_t
3362 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3363 {
3364 	ASSERT(opt != NULL);
3365 	while (optlen > 0) {
3366 		if (opt->nd_opt_len == 0)
3367 			return (B_FALSE);
3368 		optlen -= 8 * opt->nd_opt_len;
3369 		if (optlen < 0)
3370 			return (B_FALSE);
3371 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3372 	}
3373 	return (B_TRUE);
3374 }
3375 
3376 /*
3377  * ncec_walk function.
3378  * Free a fraction of the NCE cache entries.
3379  *
3380  * A possible optimization here would be to use ncec_last where possible, and
3381  * delete the least-frequently used entry, which would require more complex
3382  * computation as we walk through the ncec's (e.g., track ncec entries by
3383  * order of ncec_last and/or maintain state)
3384  */
3385 static void
3386 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3387 {
3388 	ip_stack_t	*ipst = ncec->ncec_ipst;
3389 	uint_t		fraction = *(uint_t *)arg;
3390 	uint_t		rand;
3391 
3392 	if ((ncec->ncec_flags &
3393 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3394 		return;
3395 	}
3396 
3397 	rand = (uint_t)ddi_get_lbolt() +
3398 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3399 	if ((rand/fraction)*fraction == rand) {
3400 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3401 		ncec_delete(ncec);
3402 	}
3403 }
3404 
3405 /*
3406  * kmem_cache callback to free up memory.
3407  *
3408  * For now we just delete a fixed fraction.
3409  */
3410 static void
3411 ip_nce_reclaim_stack(ip_stack_t *ipst)
3412 {
3413 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3414 
3415 	IP_STAT(ipst, ip_nce_reclaim_calls);
3416 
3417 	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3418 
3419 	/*
3420 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3421 	 * Get them to update any stale references to drop any refholds they
3422 	 * have.
3423 	 */
3424 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3425 }
3426 
3427 /*
3428  * Called by the memory allocator subsystem directly, when the system
3429  * is running low on memory.
3430  */
3431 /* ARGSUSED */
3432 void
3433 ip_nce_reclaim(void *args)
3434 {
3435 	netstack_handle_t nh;
3436 	netstack_t *ns;
3437 
3438 	netstack_next_init(&nh);
3439 	while ((ns = netstack_next(&nh)) != NULL) {
3440 		ip_nce_reclaim_stack(ns->netstack_ip);
3441 		netstack_rele(ns);
3442 	}
3443 	netstack_next_fini(&nh);
3444 }
3445 
3446 #ifdef DEBUG
3447 void
3448 ncec_trace_ref(ncec_t *ncec)
3449 {
3450 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3451 
3452 	if (ncec->ncec_trace_disable)
3453 		return;
3454 
3455 	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3456 		ncec->ncec_trace_disable = B_TRUE;
3457 		ncec_trace_cleanup(ncec);
3458 	}
3459 }
3460 
3461 void
3462 ncec_untrace_ref(ncec_t *ncec)
3463 {
3464 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3465 
3466 	if (!ncec->ncec_trace_disable)
3467 		th_trace_unref(ncec);
3468 }
3469 
3470 static void
3471 ncec_trace_cleanup(const ncec_t *ncec)
3472 {
3473 	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3474 }
3475 #endif
3476 
3477 /*
3478  * Called when address resolution fails due to a timeout.
3479  * Send an ICMP unreachable in response to all queued packets.
3480  */
3481 void
3482 arp_resolv_failed(ncec_t *ncec)
3483 {
3484 	mblk_t	*mp, *nxt_mp;
3485 	char	buf[INET6_ADDRSTRLEN];
3486 	struct in_addr ipv4addr;
3487 	ill_t *ill = ncec->ncec_ill;
3488 	ip_stack_t *ipst = ncec->ncec_ipst;
3489 	ip_recv_attr_t	iras;
3490 
3491 	bzero(&iras, sizeof (iras));
3492 	iras.ira_flags = IRAF_IS_IPV4;
3493 	/*
3494 	 * we are setting the ira_rill to the ipmp_ill (instead of
3495 	 * the actual ill on which the packet was received), but this
3496 	 * is ok because we don't actually need the real ira_rill.
3497 	 * to send the icmp unreachable to the sender.
3498 	 */
3499 	iras.ira_ill = iras.ira_rill = ill;
3500 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3501 	iras.ira_rifindex = iras.ira_ruifindex;
3502 
3503 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3504 	ip3dbg(("arp_resolv_failed: dst %s\n",
3505 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3506 	mutex_enter(&ncec->ncec_lock);
3507 	mp = ncec->ncec_qd_mp;
3508 	ncec->ncec_qd_mp = NULL;
3509 	ncec->ncec_nprobes = 0;
3510 	mutex_exit(&ncec->ncec_lock);
3511 	while (mp != NULL) {
3512 		nxt_mp = mp->b_next;
3513 		mp->b_next = NULL;
3514 
3515 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3516 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3517 		    mp, ill);
3518 		if (ipst->ips_ip_arp_icmp_error) {
3519 			ip3dbg(("arp_resolv_failed: "
3520 			    "Calling icmp_unreachable\n"));
3521 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3522 		} else {
3523 			freemsg(mp);
3524 		}
3525 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3526 		mp = nxt_mp;
3527 	}
3528 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3529 }
3530 
3531 /*
3532  * if ill is an under_ill, translate it to the ipmp_ill and add the
3533  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3534  * one on the underlying in_ill) will be created for the
3535  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3536  */
3537 int
3538 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3539     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3540 {
3541 	int	err;
3542 	in6_addr_t addr6;
3543 	ip_stack_t *ipst = ill->ill_ipst;
3544 	nce_t	*nce, *upper_nce = NULL;
3545 	ill_t	*in_ill = ill, *under = NULL;
3546 	boolean_t need_ill_refrele = B_FALSE;
3547 
3548 	if (flags & NCE_F_MCAST) {
3549 		/*
3550 		 * hw_addr will be figured out in nce_set_multicast_v4;
3551 		 * caller needs to pass in the cast_ill for ipmp
3552 		 */
3553 		ASSERT(hw_addr == NULL);
3554 		ASSERT(!IS_IPMP(ill));
3555 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3556 		return (err);
3557 	}
3558 
3559 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3560 		ill = ipmp_ill_hold_ipmp_ill(ill);
3561 		if (ill == NULL)
3562 			return (ENXIO);
3563 		need_ill_refrele = B_TRUE;
3564 	}
3565 	if ((flags & NCE_F_BCAST) != 0) {
3566 		/*
3567 		 * IPv4 broadcast ncec: compute the hwaddr.
3568 		 */
3569 		if (IS_IPMP(ill)) {
3570 			under = ipmp_ill_get_xmit_ill(ill, B_FALSE);
3571 			if (under == NULL)  {
3572 				if (need_ill_refrele)
3573 					ill_refrele(ill);
3574 				return (ENETDOWN);
3575 			}
3576 			hw_addr = under->ill_bcast_mp->b_rptr +
3577 			    NCE_LL_ADDR_OFFSET(under);
3578 			hw_addr_len = under->ill_phys_addr_length;
3579 		} else {
3580 			hw_addr = ill->ill_bcast_mp->b_rptr +
3581 			    NCE_LL_ADDR_OFFSET(ill),
3582 			    hw_addr_len = ill->ill_phys_addr_length;
3583 		}
3584 	}
3585 
3586 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3587 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3588 	nce = nce_lookup_addr(ill, &addr6);
3589 	if (nce == NULL) {
3590 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3591 		    state, &nce);
3592 	} else {
3593 		err = EEXIST;
3594 	}
3595 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3596 	if (err == 0)
3597 		err = nce_add_v4_postprocess(nce);
3598 
3599 	if (in_ill != ill && nce != NULL) {
3600 		nce_t *under_nce = NULL;
3601 
3602 		/*
3603 		 * in_ill was the under_ill. Try to create the under_nce.
3604 		 * Hold the ill_g_lock to prevent changes to group membership
3605 		 * until we are done.
3606 		 */
3607 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3608 		if (IS_IN_SAME_ILLGRP(in_ill, ill)) {
3609 			under_nce = nce_fastpath_create(in_ill,
3610 			    nce->nce_common);
3611 			upper_nce = nce;
3612 			if ((nce = under_nce) == NULL)
3613 				err = EINVAL;
3614 		}
3615 		rw_exit(&ipst->ips_ill_g_lock);
3616 		if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common))
3617 			nce_fastpath_trigger(under_nce);
3618 	}
3619 	if (nce != NULL) {
3620 		if (newnce != NULL)
3621 			*newnce = nce;
3622 		else
3623 			nce_refrele(nce);
3624 	}
3625 
3626 	if (under != NULL)
3627 		ill_refrele(under);
3628 
3629 	if (upper_nce != NULL)
3630 		nce_refrele(upper_nce);
3631 
3632 	if (need_ill_refrele)
3633 		ill_refrele(ill);
3634 
3635 	return (err);
3636 }
3637 
3638 /*
3639  * NDP Cache Entry creation routine for IPv4.
3640  * This routine must always be called with ndp4->ndp_g_lock held.
3641  * Prior to return, ncec_refcnt is incremented.
3642  *
3643  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3644  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3645  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3646  * entries will be created, both pointing at the same ncec_t. The nce_t
3647  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3648  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3649  * Local addresses are always created on the ill passed to nce_add_v4.
3650  */
3651 int
3652 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3653     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3654 {
3655 	int		err;
3656 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3657 	struct in6_addr	addr6;
3658 	nce_t		*nce;
3659 
3660 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3661 	ASSERT(!ill->ill_isv6);
3662 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3663 
3664 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3665 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3666 	    &nce);
3667 	ASSERT(newnce != NULL);
3668 	*newnce = nce;
3669 	return (err);
3670 }
3671 
3672 /*
3673  * Post-processing routine to be executed after nce_add_v4(). This function
3674  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3675  * and must be called without any locks held.
3676  *
3677  * Always returns 0, but we return an int to keep this symmetric with the
3678  * IPv6 counter-part.
3679  */
3680 int
3681 nce_add_v4_postprocess(nce_t *nce)
3682 {
3683 	ncec_t		*ncec = nce->nce_common;
3684 	uint16_t	flags = ncec->ncec_flags;
3685 	boolean_t	ndp_need_dad = B_FALSE;
3686 	boolean_t	dropped;
3687 	clock_t		delay;
3688 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3689 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3690 	boolean_t	trigger_fastpath = B_TRUE;
3691 
3692 	/*
3693 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3694 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3695 	 * We call nce_fastpath from nce_update if the link layer address of
3696 	 * the peer changes from nce_update
3697 	 */
3698 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3699 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3700 		trigger_fastpath = B_FALSE;
3701 
3702 	if (trigger_fastpath)
3703 		nce_fastpath_trigger(nce);
3704 
3705 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3706 		/*
3707 		 * Either the caller (by passing in ND_PROBE)
3708 		 * or nce_add_common() (by the internally computed state
3709 		 * based on ncec_addr and ill_net_type) has determined
3710 		 * that this unicast entry needs DAD. Trigger DAD.
3711 		 */
3712 		ndp_need_dad = B_TRUE;
3713 	} else if (flags & NCE_F_UNSOL_ADV) {
3714 		/*
3715 		 * We account for the transmit below by assigning one
3716 		 * less than the ndd variable. Subsequent decrements
3717 		 * are done in nce_timer.
3718 		 */
3719 		mutex_enter(&ncec->ncec_lock);
3720 		ncec->ncec_unsolicit_count =
3721 		    ipst->ips_ip_arp_publish_count - 1;
3722 		mutex_exit(&ncec->ncec_lock);
3723 		dropped = arp_announce(ncec);
3724 		mutex_enter(&ncec->ncec_lock);
3725 		if (dropped)
3726 			ncec->ncec_unsolicit_count++;
3727 		else
3728 			ncec->ncec_last_time_defended = ddi_get_lbolt();
3729 		if (ncec->ncec_unsolicit_count != 0) {
3730 			nce_start_timer(ncec,
3731 			    ipst->ips_ip_arp_publish_interval);
3732 		}
3733 		mutex_exit(&ncec->ncec_lock);
3734 	}
3735 
3736 	/*
3737 	 * If ncec_xmit_interval is 0, user has configured us to send the first
3738 	 * probe right away.  Do so, and set up for the subsequent probes.
3739 	 */
3740 	if (ndp_need_dad) {
3741 		mutex_enter(&ncec->ncec_lock);
3742 		if (ncec->ncec_pcnt == 0) {
3743 			/*
3744 			 * DAD probes and announce can be
3745 			 * administratively disabled by setting the
3746 			 * probe_count to zero. Restart the timer in
3747 			 * this case to mark the ipif as ready.
3748 			 */
3749 			ncec->ncec_unsolicit_count = 0;
3750 			mutex_exit(&ncec->ncec_lock);
3751 			nce_restart_timer(ncec, 0);
3752 		} else {
3753 			mutex_exit(&ncec->ncec_lock);
3754 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3755 			    ipst->ips_arp_probe_delay :
3756 			    ipst->ips_arp_fastprobe_delay);
3757 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3758 		}
3759 	}
3760 	return (0);
3761 }
3762 
3763 /*
3764  * ncec_walk routine to update all entries that have a given destination or
3765  * gateway address and cached link layer (MAC) address.  This is used when ARP
3766  * informs us that a network-to-link-layer mapping may have changed.
3767  */
3768 void
3769 nce_update_hw_changed(ncec_t *ncec, void *arg)
3770 {
3771 	nce_hw_map_t *hwm = arg;
3772 	ipaddr_t ncec_addr;
3773 
3774 	if (ncec->ncec_state != ND_REACHABLE)
3775 		return;
3776 
3777 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3778 	if (ncec_addr != hwm->hwm_addr)
3779 		return;
3780 
3781 	mutex_enter(&ncec->ncec_lock);
3782 	if (hwm->hwm_flags != 0)
3783 		ncec->ncec_flags = hwm->hwm_flags;
3784 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3785 	mutex_exit(&ncec->ncec_lock);
3786 }
3787 
3788 void
3789 ncec_refhold(ncec_t *ncec)
3790 {
3791 	mutex_enter(&(ncec)->ncec_lock);
3792 	(ncec)->ncec_refcnt++;
3793 	ASSERT((ncec)->ncec_refcnt != 0);
3794 #ifdef DEBUG
3795 	ncec_trace_ref(ncec);
3796 #endif
3797 	mutex_exit(&(ncec)->ncec_lock);
3798 }
3799 
3800 void
3801 ncec_refhold_notr(ncec_t *ncec)
3802 {
3803 	mutex_enter(&(ncec)->ncec_lock);
3804 	(ncec)->ncec_refcnt++;
3805 	ASSERT((ncec)->ncec_refcnt != 0);
3806 	mutex_exit(&(ncec)->ncec_lock);
3807 }
3808 
3809 static void
3810 ncec_refhold_locked(ncec_t *ncec)
3811 {
3812 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3813 	(ncec)->ncec_refcnt++;
3814 #ifdef DEBUG
3815 	ncec_trace_ref(ncec);
3816 #endif
3817 }
3818 
3819 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3820 void
3821 ncec_refrele(ncec_t *ncec)
3822 {
3823 	mutex_enter(&(ncec)->ncec_lock);
3824 #ifdef DEBUG
3825 	ncec_untrace_ref(ncec);
3826 #endif
3827 	ASSERT((ncec)->ncec_refcnt != 0);
3828 	if (--(ncec)->ncec_refcnt == 0) {
3829 		ncec_inactive(ncec);
3830 	} else {
3831 		mutex_exit(&(ncec)->ncec_lock);
3832 	}
3833 }
3834 
3835 void
3836 ncec_refrele_notr(ncec_t *ncec)
3837 {
3838 	mutex_enter(&(ncec)->ncec_lock);
3839 	ASSERT((ncec)->ncec_refcnt != 0);
3840 	if (--(ncec)->ncec_refcnt == 0) {
3841 		ncec_inactive(ncec);
3842 	} else {
3843 		mutex_exit(&(ncec)->ncec_lock);
3844 	}
3845 }
3846 
3847 /*
3848  * Common to IPv4 and IPv6.
3849  */
3850 void
3851 nce_restart_timer(ncec_t *ncec, uint_t ms)
3852 {
3853 	timeout_id_t tid;
3854 
3855 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3856 
3857 	/* First cancel any running timer */
3858 	mutex_enter(&ncec->ncec_lock);
3859 	tid = ncec->ncec_timeout_id;
3860 	ncec->ncec_timeout_id = 0;
3861 	if (tid != 0) {
3862 		mutex_exit(&ncec->ncec_lock);
3863 		(void) untimeout(tid);
3864 		mutex_enter(&ncec->ncec_lock);
3865 	}
3866 
3867 	/* Restart timer */
3868 	nce_start_timer(ncec, ms);
3869 	mutex_exit(&ncec->ncec_lock);
3870 }
3871 
3872 static void
3873 nce_start_timer(ncec_t *ncec, uint_t ms)
3874 {
3875 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3876 	/*
3877 	 * Don't start the timer if the ncec has been deleted, or if the timer
3878 	 * is already running
3879 	 */
3880 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3881 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3882 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3883 	}
3884 }
3885 
3886 int
3887 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3888     uint16_t flags, nce_t **newnce)
3889 {
3890 	uchar_t		*hw_addr;
3891 	int		err = 0;
3892 	ip_stack_t	*ipst = ill->ill_ipst;
3893 	in6_addr_t	dst6;
3894 	nce_t		*nce;
3895 
3896 	ASSERT(!ill->ill_isv6);
3897 
3898 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3899 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3900 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3901 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3902 		goto done;
3903 	}
3904 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
3905 		/*
3906 		 * For IRE_IF_RESOLVER a hardware mapping can be
3907 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
3908 		 * in the ill is copied in nce_add_v4().
3909 		 */
3910 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3911 		if (hw_addr == NULL) {
3912 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3913 			return (ENOMEM);
3914 		}
3915 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3916 	} else {
3917 		/*
3918 		 * IRE_IF_NORESOLVER type simply copies the resolution
3919 		 * cookie passed in.  So no hw_addr is needed.
3920 		 */
3921 		hw_addr = NULL;
3922 	}
3923 	ASSERT(flags & NCE_F_MCAST);
3924 	ASSERT(flags & NCE_F_NONUD);
3925 	/* nce_state will be computed by nce_add_common() */
3926 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3927 	    ND_UNCHANGED, &nce);
3928 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3929 	if (err == 0)
3930 		err = nce_add_v4_postprocess(nce);
3931 	if (hw_addr != NULL)
3932 		kmem_free(hw_addr, ill->ill_phys_addr_length);
3933 	if (err != 0) {
3934 		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3935 		return (err);
3936 	}
3937 done:
3938 	if (newnce != NULL)
3939 		*newnce = nce;
3940 	else
3941 		nce_refrele(nce);
3942 	return (0);
3943 }
3944 
3945 /*
3946  * This is used when scanning for "old" (least recently broadcast) NCEs.  We
3947  * don't want to have to walk the list for every single one, so we gather up
3948  * batches at a time.
3949  */
3950 #define	NCE_RESCHED_LIST_LEN	8
3951 
3952 typedef struct {
3953 	ill_t	*ncert_ill;
3954 	uint_t	ncert_num;
3955 	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
3956 } nce_resched_t;
3957 
3958 /*
3959  * Pick the longest waiting NCEs for defense.
3960  */
3961 /* ARGSUSED */
3962 static int
3963 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
3964 {
3965 	nce_resched_t *ncert = arg;
3966 	ncec_t **ncecs;
3967 	ncec_t **ncec_max;
3968 	ncec_t *ncec_temp;
3969 	ncec_t *ncec = nce->nce_common;
3970 
3971 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
3972 	/*
3973 	 * Only reachable entries that are ready for announcement are eligible.
3974 	 */
3975 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
3976 		return (0);
3977 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
3978 		ncec_refhold(ncec);
3979 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
3980 	} else {
3981 		ncecs = ncert->ncert_nces;
3982 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
3983 		ncec_refhold(ncec);
3984 		for (; ncecs < ncec_max; ncecs++) {
3985 			ASSERT(ncec != NULL);
3986 			if ((*ncecs)->ncec_last_time_defended >
3987 			    ncec->ncec_last_time_defended) {
3988 				ncec_temp = *ncecs;
3989 				*ncecs = ncec;
3990 				ncec = ncec_temp;
3991 			}
3992 		}
3993 		ncec_refrele(ncec);
3994 	}
3995 	return (0);
3996 }
3997 
3998 /*
3999  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4000  * doesn't happen very often (if at all), and thus it needn't be highly
4001  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4002  * outer loop is bounded by a constant rather than by the length of the list.)
4003  */
4004 static void
4005 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4006 {
4007 	ncec_t		*ncec;
4008 	ip_stack_t	*ipst = ill->ill_ipst;
4009 	uint_t		i, defend_rate;
4010 
4011 	i = ill->ill_defend_count;
4012 	ill->ill_defend_count = 0;
4013 	if (ill->ill_isv6)
4014 		defend_rate = ipst->ips_ndp_defend_rate;
4015 	else
4016 		defend_rate = ipst->ips_arp_defend_rate;
4017 	/* If none could be sitting around, then don't reschedule */
4018 	if (i < defend_rate) {
4019 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4020 		return;
4021 	}
4022 	ncert->ncert_ill = ill;
4023 	while (ill->ill_defend_count < defend_rate) {
4024 		nce_walk_common(ill, ncec_reschedule, ncert);
4025 		for (i = 0; i < ncert->ncert_num; i++) {
4026 
4027 			ncec = ncert->ncert_nces[i];
4028 			mutex_enter(&ncec->ncec_lock);
4029 			ncec->ncec_flags |= NCE_F_DELAYED;
4030 			mutex_exit(&ncec->ncec_lock);
4031 			/*
4032 			 * we plan to schedule this ncec, so incr the
4033 			 * defend_count in anticipation.
4034 			 */
4035 			if (++ill->ill_defend_count >= defend_rate)
4036 				break;
4037 		}
4038 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4039 			break;
4040 	}
4041 }
4042 
4043 /*
4044  * Check if the current rate-limiting parameters permit the sending
4045  * of another address defense announcement for both IPv4 and IPv6.
4046  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4047  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4048  * determines how many address defense announcements are permitted
4049  * in any `defense_perio' interval.
4050  */
4051 static boolean_t
4052 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4053 {
4054 	clock_t		now = ddi_get_lbolt();
4055 	ip_stack_t	*ipst = ill->ill_ipst;
4056 	clock_t		start = ill->ill_defend_start;
4057 	uint32_t	elapsed, defend_period, defend_rate;
4058 	nce_resched_t	ncert;
4059 	boolean_t	ret;
4060 	int		i;
4061 
4062 	if (ill->ill_isv6) {
4063 		defend_period = ipst->ips_ndp_defend_period;
4064 		defend_rate = ipst->ips_ndp_defend_rate;
4065 	} else {
4066 		defend_period = ipst->ips_arp_defend_period;
4067 		defend_rate = ipst->ips_arp_defend_rate;
4068 	}
4069 	if (defend_rate == 0)
4070 		return (B_TRUE);
4071 	bzero(&ncert, sizeof (ncert));
4072 	mutex_enter(&ill->ill_lock);
4073 	if (start > 0) {
4074 		elapsed = now - start;
4075 		if (elapsed > SEC_TO_TICK(defend_period)) {
4076 			ill->ill_defend_start = now;
4077 			/*
4078 			 * nce_ill_reschedule will attempt to
4079 			 * prevent starvation by reschduling the
4080 			 * oldest entries, which are marked with
4081 			 * the NCE_F_DELAYED flag.
4082 			 */
4083 			nce_ill_reschedule(ill, &ncert);
4084 		}
4085 	} else {
4086 		ill->ill_defend_start = now;
4087 	}
4088 	ASSERT(ill->ill_defend_count <= defend_rate);
4089 	mutex_enter(&ncec->ncec_lock);
4090 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4091 		/*
4092 		 * This ncec was rescheduled as one of the really old
4093 		 * entries needing on-going defense. The
4094 		 * ill_defend_count was already incremented in
4095 		 * nce_ill_reschedule. Go ahead and send the announce.
4096 		 */
4097 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4098 		mutex_exit(&ncec->ncec_lock);
4099 		ret = B_FALSE;
4100 		goto done;
4101 	}
4102 	mutex_exit(&ncec->ncec_lock);
4103 	if (ill->ill_defend_count < defend_rate)
4104 		ill->ill_defend_count++;
4105 	if (ill->ill_defend_count == defend_rate) {
4106 		/*
4107 		 * we are no longer allowed to send unbidden defense
4108 		 * messages. Wait for rescheduling.
4109 		 */
4110 		ret = B_TRUE;
4111 	} else {
4112 		ret = B_FALSE;
4113 	}
4114 done:
4115 	mutex_exit(&ill->ill_lock);
4116 	/*
4117 	 * After all the locks have been dropped we can restart nce timer,
4118 	 * and refrele the delayed ncecs
4119 	 */
4120 	for (i = 0; i < ncert.ncert_num; i++) {
4121 		clock_t	xmit_interval;
4122 		ncec_t	*tmp;
4123 
4124 		tmp = ncert.ncert_nces[i];
4125 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4126 		    B_FALSE);
4127 		nce_restart_timer(tmp, xmit_interval);
4128 		ncec_refrele(tmp);
4129 	}
4130 	return (ret);
4131 }
4132 
4133 boolean_t
4134 ndp_announce(ncec_t *ncec)
4135 {
4136 	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4137 	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4138 	    nce_advert_flags(ncec)));
4139 }
4140 
/*
 * Choose the source address, and the ill to send from, for a solicitation,
 * probe or announcement concerning `ncec'.
 *
 * `src' must be non-NULL and must point at the unspecified address on entry
 * (both are asserted).  The candidate source is ncec's own address when the
 * ncec is one of our addresses (NCE_MYADDR), otherwise the source address of
 * the first packet queued on ncec_qd_mp, if any.  If that candidate does not
 * match an ipif, a source is selected with ipif_select_source_v[4,6]().
 *
 * On success *src is set to the chosen address and the ill to use is
 * returned with a hold taken by ill_refhold() or ipmp_ipif_hold_bound_ill()
 * (the latter presumably returns a held ill or NULL -- confirm against its
 * definition).  NULL is returned, with *src untouched, when the matching
 * ipif has not finished DAD and we are not being called for DAD, or when no
 * source ipif could be found.  For a NCE_MYADDR ncec with no matching ipif,
 * *src is set to the unspecified address and NULL is returned.
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t		*mp;
	in6_addr_t	src6;
	ipaddr_t	src4;
	ill_t		*ill = ncec->ncec_ill;
	ill_t		*src_ill = NULL;
	ipif_t		*ipif = NULL;
	boolean_t	is_myaddr = NCE_MYADDR(ncec);
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t  *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		/* src4 is valid here: it was set on every path with a src6 */
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6]  below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface.  We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/*
		 * Nothing usable on the IPMP ill itself: retry the selection
		 * on the ill returned by ipmp_ill_get_xmit_ill().
		 */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/* Trade the ipif reference for a hold on the ill returned. */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}
4283 
4284 void
4285 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4286     uchar_t *hwaddr, int hwaddr_len, int flags)
4287 {
4288 	ill_t	*ill;
4289 	ncec_t	*ncec;
4290 	nce_t	*nce;
4291 	uint16_t new_state;
4292 
4293 	ill = (ipif ? ipif->ipif_ill : NULL);
4294 	if (ill != NULL) {
4295 		/*
4296 		 * only one ncec is possible
4297 		 */
4298 		nce = nce_lookup_v4(ill, addr);
4299 		if (nce != NULL) {
4300 			ncec = nce->nce_common;
4301 			mutex_enter(&ncec->ncec_lock);
4302 			if (NCE_ISREACHABLE(ncec))
4303 				new_state = ND_UNCHANGED;
4304 			else
4305 				new_state = ND_STALE;
4306 			ncec->ncec_flags = flags;
4307 			nce_update(ncec, new_state, hwaddr);
4308 			mutex_exit(&ncec->ncec_lock);
4309 			nce_refrele(nce);
4310 			return;
4311 		}
4312 	} else {
4313 		/*
4314 		 * ill is wildcard; clean up all ncec's and ire's
4315 		 * that match on addr.
4316 		 */
4317 		nce_hw_map_t hwm;
4318 
4319 		hwm.hwm_addr = *addr;
4320 		hwm.hwm_hwlen = hwaddr_len;
4321 		hwm.hwm_hwaddr = hwaddr;
4322 		hwm.hwm_flags = flags;
4323 
4324 		ncec_walk_common(ipst->ips_ndp4, NULL,
4325 		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4326 	}
4327 }
4328 
4329 /*
4330  * Common function to add ncec entries.
4331  * we always add the ncec with ncec_ill == ill, and always create
4332  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4333  * ncec is !reachable.
4334  *
4335  * When the caller passes in an nce_state of ND_UNCHANGED,
4336  * nce_add_common() will determine the state of the created nce based
4337  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4338  * be created with state set to the passed in nce_state.
4339  */
4340 static int
4341 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4342     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4343 {
4344 	static	ncec_t		nce_nil;
4345 	uchar_t			*template = NULL;
4346 	int			err;
4347 	ncec_t			*ncec;
4348 	ncec_t			**ncep;
4349 	ip_stack_t		*ipst = ill->ill_ipst;
4350 	uint16_t		state;
4351 	boolean_t		fastprobe = B_FALSE;
4352 	struct ndp_g_s		*ndp;
4353 	nce_t			*nce = NULL;
4354 	mblk_t			*dlur_mp = NULL;
4355 
4356 	if (ill->ill_isv6)
4357 		ndp = ill->ill_ipst->ips_ndp6;
4358 	else
4359 		ndp = ill->ill_ipst->ips_ndp4;
4360 
4361 	*retnce = NULL;
4362 
4363 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4364 
4365 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4366 		ip0dbg(("nce_add_common: no addr\n"));
4367 		return (EINVAL);
4368 	}
4369 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4370 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4371 		return (EINVAL);
4372 	}
4373 
4374 	if (ill->ill_isv6) {
4375 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4376 	} else {
4377 		ipaddr_t v4addr;
4378 
4379 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4380 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4381 	}
4382 
4383 	/*
4384 	 * The caller has ensured that there is no nce on ill, but there could
4385 	 * still be an nce_common_t for the address, so that we find exisiting
4386 	 * ncec_t strucutures first, and atomically add a new nce_t if
4387 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4388 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4389 	 * compare for matches across the illgrp because this function is
4390 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4391 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4392 	 * appropriate.
4393 	 */
4394 	ncec = *ncep;
4395 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4396 		if (ncec->ncec_ill == ill) {
4397 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4398 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4399 				if (*retnce != NULL)
4400 					break;
4401 			}
4402 		}
4403 	}
4404 	if (*retnce != NULL) {
4405 		/*
4406 		 * We should never find *retnce to be MYADDR, since the caller
4407 		 * may then incorrectly restart a DAD timer that's already
4408 		 * running.
4409 		 */
4410 		ASSERT(!NCE_MYADDR(ncec));
4411 		/* caller must trigger fastpath on nce */
4412 		return (0);
4413 	}
4414 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4415 	if (ncec == NULL)
4416 		return (ENOMEM);
4417 	*ncec = nce_nil;
4418 	ncec->ncec_ill = ill;
4419 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4420 	ncec->ncec_flags = flags;
4421 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4422 
4423 	if (!ill->ill_isv6) {
4424 		ipaddr_t addr4;
4425 
4426 		/*
4427 		 * DAD probe interval and probe count are set based on
4428 		 * fast/slow probe settings. If the underlying link doesn't
4429 		 * have reliably up/down notifications or if we're working
4430 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
4431 		 * don't use the fast timers.  Otherwise, use them.
4432 		 */
4433 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4434 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4435 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4))
4436 			fastprobe = B_TRUE;
4437 		if (fastprobe) {
4438 			ncec->ncec_xmit_interval =
4439 			    ipst->ips_arp_fastprobe_interval;
4440 			ncec->ncec_pcnt =
4441 			    ipst->ips_arp_fastprobe_count;
4442 			ncec->ncec_flags |= NCE_F_FAST;
4443 		} else {
4444 			ncec->ncec_xmit_interval =
4445 			    ipst->ips_arp_probe_interval;
4446 			ncec->ncec_pcnt =
4447 			    ipst->ips_arp_probe_count;
4448 		}
4449 		if (NCE_PUBLISH(ncec)) {
4450 			ncec->ncec_unsolicit_count =
4451 			    ipst->ips_ip_arp_publish_count;
4452 		}
4453 	} else {
4454 		/*
4455 		 * probe interval is constant: ILL_PROBE_INTERVAL
4456 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4457 		 */
4458 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4459 		if (NCE_PUBLISH(ncec)) {
4460 			ncec->ncec_unsolicit_count =
4461 			    ipst->ips_ip_ndp_unsolicit_count;
4462 		}
4463 	}
4464 	ncec->ncec_rcnt = ill->ill_xmit_count;
4465 	ncec->ncec_addr = *addr;
4466 	ncec->ncec_qd_mp = NULL;
4467 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4468 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4469 	ncec->ncec_trace_disable = B_FALSE;
4470 
4471 	/*
4472 	 * ncec_lladdr holds link layer address
4473 	 */
4474 	if (hw_addr_len > 0) {
4475 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4476 		if (template == NULL) {
4477 			err = ENOMEM;
4478 			goto err_ret;
4479 		}
4480 		ncec->ncec_lladdr = template;
4481 		ncec->ncec_lladdr_length = hw_addr_len;
4482 		bzero(ncec->ncec_lladdr, hw_addr_len);
4483 	}
4484 	if ((flags & NCE_F_BCAST) != 0) {
4485 		state = ND_REACHABLE;
4486 		ASSERT(hw_addr_len > 0);
4487 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4488 		state = ND_INITIAL;
4489 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4490 		/*
4491 		 * NORESOLVER entries are always created in the REACHABLE
4492 		 * state.
4493 		 */
4494 		state = ND_REACHABLE;
4495 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4496 		    ill->ill_mactype != DL_IPV4 &&
4497 		    ill->ill_mactype != DL_6TO4) {
4498 			/*
4499 			 * We create a nce_res_mp with the IP nexthop address
4500 			 * as the destination address if the physical length
4501 			 * is exactly 4 bytes for point-to-multipoint links
4502 			 * that do their own resolution from IP to link-layer
4503 			 * address (e.g. IP over X.25).
4504 			 */
4505 			bcopy((uchar_t *)addr,
4506 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4507 		}
4508 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4509 		    ill->ill_mactype != DL_IPV6) {
4510 			/*
4511 			 * We create a nce_res_mp with the IP nexthop address
4512 			 * as the destination address if the physical legnth
4513 			 * is exactly 16 bytes for point-to-multipoint links
4514 			 * that do their own resolution from IP to link-layer
4515 			 * address.
4516 			 */
4517 			bcopy((uchar_t *)addr,
4518 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4519 		}
4520 		/*
4521 		 * Since NUD is not part of the base IPv4 protocol definition,
4522 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4523 		 * age, and are marked NCE_F_NONUD.
4524 		 */
4525 		if (!ill->ill_isv6)
4526 			ncec->ncec_flags |= NCE_F_NONUD;
4527 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4528 		state = ND_REACHABLE;
4529 	}
4530 
4531 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4532 		/*
4533 		 * We are adding an ncec with a deterministic hw_addr,
4534 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4535 		 *
4536 		 * if we are adding a unicast ncec for the local address
4537 		 * it would be REACHABLE; we would be adding a ND_STALE entry
4538 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4539 		 * addresses are added in PROBE to trigger DAD.
4540 		 */
4541 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4542 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4543 			state = ND_REACHABLE;
4544 		else if (!NCE_PUBLISH(ncec))
4545 			state = ND_STALE;
4546 		else
4547 			state = ND_PROBE;
4548 		if (hw_addr != NULL)
4549 			nce_set_ll(ncec, hw_addr);
4550 	}
4551 	/* caller overrides internally computed state */
4552 	if (nce_state != ND_UNCHANGED)
4553 		state = nce_state;
4554 
4555 	if (state == ND_PROBE)
4556 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4557 
4558 	ncec->ncec_state = state;
4559 
4560 	if (state == ND_REACHABLE) {
4561 		ncec->ncec_last = ncec->ncec_init_time =
4562 		    TICK_TO_MSEC(ddi_get_lbolt64());
4563 	} else {
4564 		ncec->ncec_last = 0;
4565 		if (state == ND_INITIAL)
4566 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4567 	}
4568 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4569 	    offsetof(ncec_cb_t, ncec_cb_node));
4570 	/*
4571 	 * have all the memory allocations out of the way before taking locks
4572 	 * and adding the nce.
4573 	 */
4574 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4575 	if (nce == NULL) {
4576 		err = ENOMEM;
4577 		goto err_ret;
4578 	}
4579 	if (ncec->ncec_lladdr != NULL ||
4580 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4581 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4582 		    ill->ill_phys_addr_length, ill->ill_sap,
4583 		    ill->ill_sap_length);
4584 		if (dlur_mp == NULL) {
4585 			err = ENOMEM;
4586 			goto err_ret;
4587 		}
4588 	}
4589 
4590 	/*
4591 	 * Atomically ensure that the ill is not CONDEMNED, before
4592 	 * adding the NCE.
4593 	 */
4594 	mutex_enter(&ill->ill_lock);
4595 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4596 		mutex_exit(&ill->ill_lock);
4597 		err = EINVAL;
4598 		goto err_ret;
4599 	}
4600 	if (!NCE_MYADDR(ncec) &&
4601 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4602 		mutex_exit(&ill->ill_lock);
4603 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4604 		err = EINVAL;
4605 		goto err_ret;
4606 	}
4607 	/*
4608 	 * Acquire the ncec_lock even before adding the ncec to the list
4609 	 * so that it cannot get deleted after the ncec is added, but
4610 	 * before we add the nce.
4611 	 */
4612 	mutex_enter(&ncec->ncec_lock);
4613 	if ((ncec->ncec_next = *ncep) != NULL)
4614 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4615 	*ncep = ncec;
4616 	ncec->ncec_ptpn = ncep;
4617 
4618 	/* Bump up the number of ncec's referencing this ill */
4619 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4620 	    (char *), "ncec", (void *), ncec);
4621 	ill->ill_ncec_cnt++;
4622 	/*
4623 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4624 	 * condemned, and we can safely add the nce.
4625 	 */
4626 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4627 	mutex_exit(&ncec->ncec_lock);
4628 	mutex_exit(&ill->ill_lock);
4629 
4630 	/* caller must trigger fastpath on *retnce */
4631 	return (0);
4632 
4633 err_ret:
4634 	if (ncec != NULL)
4635 		kmem_cache_free(ncec_cache, ncec);
4636 	if (nce != NULL)
4637 		kmem_cache_free(nce_cache, nce);
4638 	freemsg(dlur_mp);
4639 	if (template != NULL)
4640 		kmem_free(template, ill->ill_phys_addr_length);
4641 	return (err);
4642 }
4643 
4644 /*
4645  * take a ref on the nce
4646  */
4647 void
4648 nce_refhold(nce_t *nce)
4649 {
4650 	mutex_enter(&nce->nce_lock);
4651 	nce->nce_refcnt++;
4652 	ASSERT((nce)->nce_refcnt != 0);
4653 	mutex_exit(&nce->nce_lock);
4654 }
4655 
4656 /*
4657  * release a ref on the nce; In general, this
4658  * cannot be called with locks held because nce_inactive
4659  * may result in nce_inactive which will take the ill_lock,
4660  * do ipif_ill_refrele_tail etc. Thus the one exception
4661  * where this can be called with locks held is when the caller
4662  * is certain that the nce_refcnt is sufficient to prevent
4663  * the invocation of nce_inactive.
4664  */
4665 void
4666 nce_refrele(nce_t *nce)
4667 {
4668 	ASSERT((nce)->nce_refcnt != 0);
4669 	mutex_enter(&nce->nce_lock);
4670 	if (--nce->nce_refcnt == 0)
4671 		nce_inactive(nce); /* destroys the mutex */
4672 	else
4673 		mutex_exit(&nce->nce_lock);
4674 }
4675 
/*
 * Free the nce after all refs have gone away.
 *
 * Drops the nce's reference on its ncec, frees the fastpath and DL_UNITDATA
 * request mblks, decrements the ill's nce count under ill_lock, and finally
 * destroys nce_lock and returns the nce to nce_cache.  Called from
 * nce_refrele() when the reference count reaches zero.
 */
static void
nce_inactive(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;

	ASSERT(nce->nce_refcnt == 0);

	/* release the hold on the ncec taken in nce_add_impl() */
	ncec_refrele_notr(nce->nce_common);
	nce->nce_common = NULL;
	freemsg(nce->nce_fp_mp);
	freemsg(nce->nce_dlur_mp);

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt--;
	nce->nce_ill = NULL;
	/*
	 * Now that this nce no longer counts against the ill, check with
	 * ILL_DOWN_OK() whether we need to restart any operation that
	 * is waiting for the ill's references to go away.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&nce->nce_lock);
	kmem_cache_free(nce_cache, nce);
}
4711 
4712 /*
4713  * Add an nce to the ill_nce list.
4714  */
4715 static nce_t *
4716 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4717 {
4718 	bzero(nce, sizeof (*nce));
4719 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4720 	nce->nce_common = ncec;
4721 	nce->nce_addr = ncec->ncec_addr;
4722 	nce->nce_ill = ill;
4723 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4724 	    (char *), "nce", (void *), nce);
4725 	ill->ill_nce_cnt++;
4726 
4727 	nce->nce_refcnt = 1; /* for the thread */
4728 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4729 	nce->nce_dlur_mp = dlur_mp;
4730 
4731 	/* add nce to the ill's fastpath list.  */
4732 	nce->nce_refcnt++; /* for the list */
4733 	list_insert_head(&ill->ill_nce, nce);
4734 	return (nce);
4735 }
4736 
4737 static nce_t *
4738 nce_add(ill_t *ill, ncec_t *ncec)
4739 {
4740 	nce_t	*nce;
4741 	mblk_t	*dlur_mp = NULL;
4742 
4743 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4744 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4745 
4746 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4747 	if (nce == NULL)
4748 		return (NULL);
4749 	if (ncec->ncec_lladdr != NULL ||
4750 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4751 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4752 		    ill->ill_phys_addr_length, ill->ill_sap,
4753 		    ill->ill_sap_length);
4754 		if (dlur_mp == NULL) {
4755 			kmem_cache_free(nce_cache, nce);
4756 			return (NULL);
4757 		}
4758 	}
4759 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
4760 }
4761 
4762 /*
4763  * remove the nce from the ill_faspath list
4764  */
4765 void
4766 nce_delete(nce_t *nce)
4767 {
4768 	ill_t	*ill = nce->nce_ill;
4769 
4770 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4771 
4772 	mutex_enter(&nce->nce_lock);
4773 	if (nce->nce_is_condemned) {
4774 		/*
4775 		 * some other thread has removed this nce from the ill_nce list
4776 		 */
4777 		mutex_exit(&nce->nce_lock);
4778 		return;
4779 	}
4780 	nce->nce_is_condemned = B_TRUE;
4781 	mutex_exit(&nce->nce_lock);
4782 
4783 	list_remove(&ill->ill_nce, nce);
4784 	/*
4785 	 * even though we are holding the ill_lock, it is ok to
4786 	 * call nce_refrele here because we know that we should have
4787 	 * at least 2 refs on the nce: one for the thread, and one
4788 	 * for the list. The refrele below will release the one for
4789 	 * the list.
4790 	 */
4791 	nce_refrele(nce);
4792 }
4793 
4794 nce_t *
4795 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4796 {
4797 	nce_t *nce = NULL;
4798 
4799 	ASSERT(ill != NULL);
4800 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4801 
4802 	for (nce = list_head(&ill->ill_nce); nce != NULL;
4803 	    nce = list_next(&ill->ill_nce, nce)) {
4804 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4805 			break;
4806 	}
4807 
4808 	/*
4809 	 * if we found the nce on the ill_nce list while holding
4810 	 * the ill_lock, then it cannot be condemned yet.
4811 	 */
4812 	if (nce != NULL) {
4813 		ASSERT(!nce->nce_is_condemned);
4814 		nce_refhold(nce);
4815 	}
4816 	return (nce);
4817 }
4818 
4819 /*
4820  * Walk the ill_nce list on ill. The callback function func() cannot perform
4821  * any destructive actions.
4822  */
4823 static void
4824 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4825 {
4826 	nce_t *nce = NULL, *nce_next;
4827 
4828 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4829 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4830 		nce_next = list_next(&ill->ill_nce, nce);
4831 		if (func(ill, nce, arg) != 0)
4832 			break;
4833 		nce = nce_next;
4834 	}
4835 }
4836 
/*
 * Walk the ill_nce list on ill with nce_walk_common(), taking and dropping
 * ill_lock around the walk.  func() is therefore called with ill_lock held.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}
4844 
4845 void
4846 nce_flush(ill_t *ill, boolean_t flushall)
4847 {
4848 	nce_t *nce, *nce_next;
4849 	list_t dead;
4850 
4851 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4852 	mutex_enter(&ill->ill_lock);
4853 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4854 		nce_next = list_next(&ill->ill_nce, nce);
4855 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4856 			nce = nce_next;
4857 			continue;
4858 		}
4859 		/*
4860 		 * nce_delete requires that the caller should either not
4861 		 * be holding locks, or should hold a ref to ensure that
4862 		 * we wont hit ncec_inactive. So take a ref and clean up
4863 		 * after the list is flushed.
4864 		 */
4865 		nce_refhold(nce);
4866 		nce_delete(nce);
4867 		list_insert_tail(&dead, nce);
4868 		nce = nce_next;
4869 	}
4870 	mutex_exit(&ill->ill_lock);
4871 	while ((nce = list_head(&dead)) != NULL) {
4872 		list_remove(&dead, nce);
4873 		nce_refrele(nce);
4874 	}
4875 	ASSERT(list_is_empty(&dead));
4876 	list_destroy(&dead);
4877 }
4878 
4879 /* Return an interval that is anywhere in the [1 .. intv] range */
4880 static clock_t
4881 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4882 {
4883 	clock_t rnd, frac;
4884 
4885 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4886 	/* Note that clock_t is signed; must chop off bits */
4887 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4888 	if (initial_time) {
4889 		if (intv <= 0)
4890 			intv = 1;
4891 		else
4892 			intv = (rnd % intv) + 1;
4893 	} else {
4894 		/* Compute 'frac' as 20% of the configured interval */
4895 		if ((frac = intv / 5) <= 1)
4896 			frac = 2;
4897 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
4898 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4899 		intv = 1;
4900 	}
4901 	return (intv);
4902 }
4903 
/*
 * Transmit the packets that were queued on an IPMP ncec (ncec_qd_mp).
 *
 * The queue and the probe count (ncec_nprobes) are detached from the ncec
 * under ncec_lock.  For each packet an ill to send on is chosen: the ill of
 * the ipif matching the packet's source address when that ipif is not on
 * the IPMP ill, otherwise the ill returned by ipmp_ill_get_xmit_ill().  An
 * nce is then created on that ill with nce_fastpath_create() (only if it is
 * in the same illgrp as the ncec's ill) and the packet is sent with
 * ip_xmit().  A packet for which no ill or nce can be obtained is dropped
 * and counted in ipIfStatsOutDiscards.  When all packets have been handled,
 * ncec_cb_dispatch() is called.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t *mp;
	uint_t pkt_len;
	iaflags_t ixaflags = IXAF_NO_TRACE;
	nce_t *under_nce;
	ill_t	*ill = ncec->ncec_ill;
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t *src_ipif = NULL;
	ip_stack_t *ipst = ill->ill_ipst;
	ill_t *send_ill;
	uint_t nprobes;

	ASSERT(IS_IPMP(ill));

	/* take over the whole queue so it can be processed without the lock */
	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		/* unlink this packet from the b_next chain */
		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We tell the two apart using ncec_nprobes: the
			 * first ncec_nprobes packets are probes, so a packet
			 * with no src_ipif while probes remain is dropped.
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		/* the ill and ipif holds are no longer needed past this point */
		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		/* no holds are outstanding on any path that reaches here */
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec); /* complete callbacks */
}
5020