xref: /titanic_50/usr/src/uts/common/inet/ip/ip_ndp.c (revision dae2dfb7b2bc52128c80617e060a0ba54ca46aba)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ipclassifier.h>
61 #include <inet/ip_if.h>
62 #include <inet/ip_ire.h>
63 #include <inet/ip_rts.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_ndp.h>
66 #include <inet/ipsec_impl.h>
67 #include <inet/ipsec_info.h>
68 #include <inet/sctp_ip.h>
69 
70 /*
71  * Function names with nce_ prefix are static while function
72  * names with ndp_ prefix are used by rest of the IP.
73  *
74  * Lock ordering:
75  *
76  *	ndp_g_lock -> ill_lock -> nce_lock
77  *
78  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
79  * nce_next.  Nce_lock protects the contents of the NCE (particularly
80  * nce_refcnt).
81  */
82 
/*
 * Prototypes for file-local (static) routines referenced before their
 * definitions.  Note that ndp_add_v4 is static even though it carries the
 * ndp_ prefix.
 */
static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
    uint32_t ll_addr_len);
static	void	nce_ire_delete(nce_t *nce);
static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
    uchar_t *addr);
static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
static	mblk_t	*nce_udreq_alloc(ill_t *ill);
static	void	nce_update(nce_t *nce, uint16_t new_state,
    uchar_t *new_ll_addr);
static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
    ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static int	ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
    nce_t **, nce_t *);

#ifdef DEBUG
/* Only declared on DEBUG builds; called from ndp_inactive under DEBUG. */
static void	nce_trace_cleanup(const nce_t *);
#endif
107 
/*
 * Yield the address of the hash-bucket slot for addr in the per-stack IPv4
 * NCE table (ips_ndp4); the bucket index comes from IRE_ADDR_HASH over
 * NCE_TABLE_SIZE.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Same as above for the per-stack IPv6 NCE table (ips_ndp6); the bucket
 * index comes from NCE_ADDR_HASH_V6 over NCE_TABLE_SIZE.
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))
114 
115 /*
116  * Compute default flags to use for an advertisement of this nce's address.
117  */
118 static int
119 nce_advert_flags(const nce_t *nce)
120 {
121 	int flag = 0;
122 
123 	if (nce->nce_flags & NCE_F_ISROUTER)
124 		flag |= NDP_ISROUTER;
125 	if (!(nce->nce_flags & NCE_F_ANYCAST))
126 		flag |= NDP_ORIDE;
127 
128 	return (flag);
129 }
130 
/*
 * Non-tunable probe interval, based on link capabilities: 150 when the ill
 * has ill_note_link set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER by the DAD probe code (presumably milliseconds --
 * confirm against NDP_RESTART_TIMER).
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
133 
134 /*
135  * NDP Cache Entry creation routine.
136  * Mapped entries will never do NUD .
137  * This routine must always be called with ndp6->ndp_g_lock held.
138  * Prior to return, nce_refcnt is incremented.
139  */
140 int
141 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
142     const in6_addr_t *mask, const in6_addr_t *extract_mask,
143     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
144     nce_t **newnce)
145 {
146 	static	nce_t		nce_nil;
147 	nce_t		*nce;
148 	mblk_t		*mp;
149 	mblk_t		*template;
150 	nce_t		**ncep;
151 	int		err;
152 	boolean_t	dropped = B_FALSE;
153 	ip_stack_t	*ipst = ill->ill_ipst;
154 
155 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
156 	ASSERT(ill != NULL && ill->ill_isv6);
157 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
158 		ip0dbg(("ndp_add_v6: no addr\n"));
159 		return (EINVAL);
160 	}
161 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
162 		ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
163 		return (EINVAL);
164 	}
165 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
166 	    (flags & NCE_F_MAPPING)) {
167 		ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
168 		return (EINVAL);
169 	}
170 	/*
171 	 * Allocate the mblk to hold the nce.
172 	 *
173 	 * XXX This can come out of a separate cache - nce_cache.
174 	 * We don't need the mp anymore as there are no more
175 	 * "qwriter"s
176 	 */
177 	mp = allocb(sizeof (nce_t), BPRI_MED);
178 	if (mp == NULL)
179 		return (ENOMEM);
180 
181 	nce = (nce_t *)mp->b_rptr;
182 	mp->b_wptr = (uchar_t *)&nce[1];
183 	*nce = nce_nil;
184 
185 	/*
186 	 * This one holds link layer address
187 	 */
188 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
189 		template = nce_udreq_alloc(ill);
190 	} else {
191 		if (ill->ill_resolver_mp == NULL) {
192 			freeb(mp);
193 			return (EINVAL);
194 		}
195 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
196 		template = copyb(ill->ill_resolver_mp);
197 	}
198 	if (template == NULL) {
199 		freeb(mp);
200 		return (ENOMEM);
201 	}
202 	nce->nce_ill = ill;
203 	nce->nce_ipversion = IPV6_VERSION;
204 	nce->nce_flags = flags;
205 	nce->nce_state = state;
206 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
207 	nce->nce_rcnt = ill->ill_xmit_count;
208 	nce->nce_addr = *addr;
209 	nce->nce_mask = *mask;
210 	nce->nce_extract_mask = *extract_mask;
211 	nce->nce_ll_extract_start = hw_extract_start;
212 	nce->nce_fp_mp = NULL;
213 	nce->nce_res_mp = template;
214 	if (state == ND_REACHABLE)
215 		nce->nce_last = TICK_TO_MSEC(lbolt64);
216 	else
217 		nce->nce_last = 0;
218 	nce->nce_qd_mp = NULL;
219 	nce->nce_mp = mp;
220 	if (hw_addr != NULL)
221 		nce_set_ll(nce, hw_addr);
222 	/* This one is for nce getting created */
223 	nce->nce_refcnt = 1;
224 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
225 	if (nce->nce_flags & NCE_F_MAPPING) {
226 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
227 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
228 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
229 		ncep = &ipst->ips_ndp6->nce_mask_entries;
230 	} else {
231 		ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
232 	}
233 
234 	nce->nce_trace_disable = B_FALSE;
235 
236 	/*
237 	 * Atomically ensure that the ill is not CONDEMNED, before
238 	 * adding the NCE.
239 	 */
240 	mutex_enter(&ill->ill_lock);
241 	if (ill->ill_state_flags & ILL_CONDEMNED) {
242 		mutex_exit(&ill->ill_lock);
243 		freeb(mp);
244 		freeb(template);
245 		return (EINVAL);
246 	}
247 	if ((nce->nce_next = *ncep) != NULL)
248 		nce->nce_next->nce_ptpn = &nce->nce_next;
249 	*ncep = nce;
250 	nce->nce_ptpn = ncep;
251 	*newnce = nce;
252 	/* This one is for nce being used by an active thread */
253 	NCE_REFHOLD(*newnce);
254 
255 	/* Bump up the number of nce's referencing this ill */
256 	ill->ill_nce_cnt++;
257 	mutex_exit(&ill->ill_lock);
258 
259 	err = 0;
260 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
261 		mutex_enter(&nce->nce_lock);
262 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
263 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
264 		mutex_exit(&nce->nce_lock);
265 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
266 		    &ipv6_all_zeros, addr, NDP_PROBE);
267 		if (dropped) {
268 			mutex_enter(&nce->nce_lock);
269 			nce->nce_pcnt++;
270 			mutex_exit(&nce->nce_lock);
271 		}
272 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
273 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
274 		err = EINPROGRESS;
275 	} else if (flags & NCE_F_UNSOL_ADV) {
276 		/*
277 		 * We account for the transmit below by assigning one
278 		 * less than the ndd variable. Subsequent decrements
279 		 * are done in ndp_timer.
280 		 */
281 		mutex_enter(&nce->nce_lock);
282 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
283 		nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
284 		mutex_exit(&nce->nce_lock);
285 		dropped = nce_xmit(ill,
286 		    ND_NEIGHBOR_ADVERT,
287 		    ill,	/* ill to be used for extracting ill_nd_lla */
288 		    B_TRUE,	/* use ill_nd_lla */
289 		    addr,	/* Source and target of the advertisement pkt */
290 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
291 		    nce_advert_flags(nce));
292 		mutex_enter(&nce->nce_lock);
293 		if (dropped)
294 			nce->nce_unsolicit_count++;
295 		if (nce->nce_unsolicit_count != 0) {
296 			nce->nce_timeout_id = timeout(ndp_timer, nce,
297 			    MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
298 		}
299 		mutex_exit(&nce->nce_lock);
300 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
301 	}
302 	/*
303 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
304 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
305 	 * We call nce_fastpath from nce_update if the link layer address of
306 	 * the peer changes from nce_update
307 	 */
308 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
309 		nce_fastpath(nce);
310 	return (err);
311 }
312 
313 int
314 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
315     const in6_addr_t *mask, const in6_addr_t *extract_mask,
316     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
317     nce_t **newnce)
318 {
319 	int	err = 0;
320 	nce_t	*nce;
321 	ip_stack_t	*ipst = ill->ill_ipst;
322 
323 	ASSERT(ill->ill_isv6);
324 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
325 
326 	/* Get head of v6 hash table */
327 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
328 	nce = nce_lookup_addr(ill, addr, nce);
329 	if (nce == NULL) {
330 		err = ndp_add_v6(ill,
331 		    hw_addr,
332 		    addr,
333 		    mask,
334 		    extract_mask,
335 		    hw_extract_start,
336 		    flags,
337 		    state,
338 		    newnce);
339 	} else {
340 		*newnce = nce;
341 		err = EEXIST;
342 	}
343 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
344 	return (err);
345 }
346 
347 /*
348  * Remove all the CONDEMNED nces from the appropriate hash table.
349  * We create a private list of NCEs, these may have ires pointing
350  * to them, so the list will be passed through to clean up dependent
351  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
352  */
353 static void
354 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
355 {
356 	nce_t *nce1;
357 	nce_t **ptpn;
358 
359 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
360 	ASSERT(ndp->ndp_g_walker == 0);
361 	for (; nce; nce = nce1) {
362 		nce1 = nce->nce_next;
363 		mutex_enter(&nce->nce_lock);
364 		if (nce->nce_flags & NCE_F_CONDEMNED) {
365 			ptpn = nce->nce_ptpn;
366 			nce1 = nce->nce_next;
367 			if (nce1 != NULL)
368 				nce1->nce_ptpn = ptpn;
369 			*ptpn = nce1;
370 			nce->nce_ptpn = NULL;
371 			nce->nce_next = NULL;
372 			nce->nce_next = *free_nce_list;
373 			*free_nce_list = nce;
374 		}
375 		mutex_exit(&nce->nce_lock);
376 	}
377 }
378 
379 /*
380  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
381  *    will return this NCE. Also no new IREs will be created that
382  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
383  *    be started (See NDP_RESTART_TIMER).
384  * 2. Cancel any currently running timeouts.
385  * 3. If there is an ndp walker, return. The walker will do the cleanup.
386  *    This ensures that walkers see a consistent list of NCEs while walking.
387  * 4. Otherwise remove the NCE from the list of NCEs
388  * 5. Delete all IREs pointing to this NCE.
389  */
390 void
391 ndp_delete(nce_t *nce)
392 {
393 	nce_t	**ptpn;
394 	nce_t	*nce1;
395 	int	ipversion = nce->nce_ipversion;
396 	ndp_g_t *ndp;
397 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
398 
399 	if (ipversion == IPV4_VERSION)
400 		ndp = ipst->ips_ndp4;
401 	else
402 		ndp = ipst->ips_ndp6;
403 
404 	/* Serialize deletes */
405 	mutex_enter(&nce->nce_lock);
406 	if (nce->nce_flags & NCE_F_CONDEMNED) {
407 		/* Some other thread is doing the delete */
408 		mutex_exit(&nce->nce_lock);
409 		return;
410 	}
411 	/*
412 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
413 	 * refcnt has to be >= 2
414 	 */
415 	ASSERT(nce->nce_refcnt >= 2);
416 	nce->nce_flags |= NCE_F_CONDEMNED;
417 	mutex_exit(&nce->nce_lock);
418 
419 	nce_fastpath_list_delete(nce);
420 
421 	/*
422 	 * Cancel any running timer. Timeout can't be restarted
423 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
424 	 * Passing invalid timeout id is fine.
425 	 */
426 	if (nce->nce_timeout_id != 0) {
427 		(void) untimeout(nce->nce_timeout_id);
428 		nce->nce_timeout_id = 0;
429 	}
430 
431 	mutex_enter(&ndp->ndp_g_lock);
432 	if (nce->nce_ptpn == NULL) {
433 		/*
434 		 * The last ndp walker has already removed this nce from
435 		 * the list after we marked the nce CONDEMNED and before
436 		 * we grabbed the global lock.
437 		 */
438 		mutex_exit(&ndp->ndp_g_lock);
439 		return;
440 	}
441 	if (ndp->ndp_g_walker > 0) {
442 		/*
443 		 * Can't unlink. The walker will clean up
444 		 */
445 		ndp->ndp_g_walker_cleanup = B_TRUE;
446 		mutex_exit(&ndp->ndp_g_lock);
447 		return;
448 	}
449 
450 	/*
451 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
452 	 * the timer since it is marked CONDEMNED.
453 	 */
454 	ptpn = nce->nce_ptpn;
455 	nce1 = nce->nce_next;
456 	if (nce1 != NULL)
457 		nce1->nce_ptpn = ptpn;
458 	*ptpn = nce1;
459 	nce->nce_ptpn = NULL;
460 	nce->nce_next = NULL;
461 	mutex_exit(&ndp->ndp_g_lock);
462 
463 	nce_ire_delete(nce);
464 }
465 
466 void
467 ndp_inactive(nce_t *nce)
468 {
469 	mblk_t		**mpp;
470 	ill_t		*ill;
471 
472 	ASSERT(nce->nce_refcnt == 0);
473 	ASSERT(MUTEX_HELD(&nce->nce_lock));
474 	ASSERT(nce->nce_fastpath == NULL);
475 
476 	/* Free all nce allocated messages */
477 	mpp = &nce->nce_first_mp_to_free;
478 	do {
479 		while (*mpp != NULL) {
480 			mblk_t  *mp;
481 
482 			mp = *mpp;
483 			*mpp = mp->b_next;
484 
485 			inet_freemsg(mp);
486 		}
487 	} while (mpp++ != &nce->nce_last_mp_to_free);
488 
489 #ifdef DEBUG
490 	nce_trace_cleanup(nce);
491 #endif
492 
493 	ill = nce->nce_ill;
494 	mutex_enter(&ill->ill_lock);
495 	ill->ill_nce_cnt--;
496 	/*
497 	 * If the number of nce's associated with this ill have dropped
498 	 * to zero, check whether we need to restart any operation that
499 	 * is waiting for this to happen.
500 	 */
501 	if (ill->ill_nce_cnt == 0) {
502 		/* ipif_ill_refrele_tail drops the ill_lock */
503 		ipif_ill_refrele_tail(ill);
504 	} else {
505 		mutex_exit(&ill->ill_lock);
506 	}
507 	mutex_destroy(&nce->nce_lock);
508 	if (nce->nce_mp != NULL)
509 		inet_freemsg(nce->nce_mp);
510 }
511 
512 /*
513  * ndp_walk routine.  Delete the nce if it is associated with the ill
514  * that is going away.  Always called as a writer.
515  */
516 void
517 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
518 {
519 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
520 		ndp_delete(nce);
521 	}
522 }
523 
524 /*
525  * Walk a list of to be inactive NCEs and blow away all the ires.
526  */
527 static void
528 nce_ire_delete_list(nce_t *nce)
529 {
530 	nce_t *nce_next;
531 
532 	ASSERT(nce != NULL);
533 	while (nce != NULL) {
534 		nce_next = nce->nce_next;
535 		nce->nce_next = NULL;
536 
537 		/*
538 		 * It is possible for the last ndp walker (this thread)
539 		 * to come here after ndp_delete has marked the nce CONDEMNED
540 		 * and before it has removed the nce from the fastpath list
541 		 * or called untimeout. So we need to do it here. It is safe
542 		 * for both ndp_delete and this thread to do it twice or
543 		 * even simultaneously since each of the threads has a
544 		 * reference on the nce.
545 		 */
546 		nce_fastpath_list_delete(nce);
547 		/*
548 		 * Cancel any running timer. Timeout can't be restarted
549 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
550 		 * Passing invalid timeout id is fine.
551 		 */
552 		if (nce->nce_timeout_id != 0) {
553 			(void) untimeout(nce->nce_timeout_id);
554 			nce->nce_timeout_id = 0;
555 		}
556 		/*
557 		 * We might hit this func thus in the v4 case:
558 		 * ipif_down->ipif_ndp_down->ndp_walk
559 		 */
560 
561 		if (nce->nce_ipversion == IPV4_VERSION) {
562 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
563 			    IRE_CACHE, nce_ire_delete1,
564 			    (char *)nce, nce->nce_ill);
565 		} else {
566 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
567 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
568 			    IRE_CACHE, nce_ire_delete1,
569 			    (char *)nce, nce->nce_ill);
570 		}
571 		NCE_REFRELE_NOTR(nce);
572 		nce = nce_next;
573 	}
574 }
575 
576 /*
577  * Delete an ire when the nce goes away.
578  */
579 /* ARGSUSED */
580 static void
581 nce_ire_delete(nce_t *nce)
582 {
583 	if (nce->nce_ipversion == IPV6_VERSION) {
584 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
585 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
586 		NCE_REFRELE_NOTR(nce);
587 	} else {
588 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
589 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
590 		NCE_REFRELE_NOTR(nce);
591 	}
592 }
593 
594 /*
595  * ire_walk routine used to delete every IRE that shares this nce
596  */
597 static void
598 nce_ire_delete1(ire_t *ire, char *nce_arg)
599 {
600 	nce_t	*nce = (nce_t *)nce_arg;
601 
602 	ASSERT(ire->ire_type == IRE_CACHE);
603 
604 	if (ire->ire_nce == nce) {
605 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
606 		ire_delete(ire);
607 	}
608 }
609 
610 /*
611  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
612  */
613 boolean_t
614 ndp_restart_dad(nce_t *nce)
615 {
616 	boolean_t started;
617 	boolean_t dropped;
618 
619 	if (nce == NULL)
620 		return (B_FALSE);
621 	mutex_enter(&nce->nce_lock);
622 	if (nce->nce_state == ND_PROBE) {
623 		mutex_exit(&nce->nce_lock);
624 		started = B_TRUE;
625 	} else if (nce->nce_state == ND_REACHABLE) {
626 		nce->nce_state = ND_PROBE;
627 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
628 		mutex_exit(&nce->nce_lock);
629 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
630 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
631 		if (dropped) {
632 			mutex_enter(&nce->nce_lock);
633 			nce->nce_pcnt++;
634 			mutex_exit(&nce->nce_lock);
635 		}
636 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
637 		started = B_TRUE;
638 	} else {
639 		mutex_exit(&nce->nce_lock);
640 		started = B_FALSE;
641 	}
642 	return (started);
643 }
644 
645 /*
646  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
647  * If one is found, the refcnt on the nce will be incremented.
648  */
649 nce_t *
650 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
651 {
652 	nce_t	*nce;
653 	ip_stack_t	*ipst;
654 
655 	ASSERT(ill != NULL);
656 	ipst = ill->ill_ipst;
657 
658 	ASSERT(ill != NULL && ill->ill_isv6);
659 	if (!caller_holds_lock) {
660 		mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
661 	}
662 
663 	/* Get head of v6 hash table */
664 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
665 	nce = nce_lookup_addr(ill, addr, nce);
666 	if (nce == NULL)
667 		nce = nce_lookup_mapping(ill, addr);
668 	if (!caller_holds_lock)
669 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
670 	return (nce);
671 }
672 /*
673  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
674  * If one is found, the refcnt on the nce will be incremented.
675  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
676  * so we skip the nce_lookup_mapping call.
677  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
678  */
679 nce_t *
680 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
681 {
682 	nce_t	*nce;
683 	in6_addr_t addr6;
684 	ip_stack_t *ipst = ill->ill_ipst;
685 
686 	if (!caller_holds_lock) {
687 		mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
688 	}
689 
690 	/* Get head of v4 hash table */
691 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
692 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
693 	nce = nce_lookup_addr(ill, &addr6, nce);
694 	if (!caller_holds_lock)
695 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
696 	return (nce);
697 }
698 
699 /*
700  * Cache entry lookup.  Try to find an nce matching the parameters passed.
701  * Look only for exact entries (no mappings).  If an nce is found, increment
702  * the hold count on that nce. The caller passes in the start of the
703  * appropriate hash table, and must be holding the appropriate global
704  * lock (ndp_g_lock).
705  */
706 static nce_t *
707 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
708 {
709 	ndp_g_t		*ndp;
710 	ip_stack_t	*ipst = ill->ill_ipst;
711 
712 	if (ill->ill_isv6)
713 		ndp = ipst->ips_ndp6;
714 	else
715 		ndp = ipst->ips_ndp4;
716 
717 	ASSERT(ill != NULL);
718 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
719 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
720 		return (NULL);
721 	for (; nce != NULL; nce = nce->nce_next) {
722 		if (nce->nce_ill == ill) {
723 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
724 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
725 			    &ipv6_all_ones)) {
726 				mutex_enter(&nce->nce_lock);
727 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
728 					NCE_REFHOLD_LOCKED(nce);
729 					mutex_exit(&nce->nce_lock);
730 					break;
731 				}
732 				mutex_exit(&nce->nce_lock);
733 			}
734 		}
735 	}
736 	return (nce);
737 }
738 
739 /*
740  * Cache entry lookup.  Try to find an nce matching the parameters passed.
741  * Look only for mappings.
742  */
743 static nce_t *
744 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
745 {
746 	nce_t	*nce;
747 	ip_stack_t	*ipst = ill->ill_ipst;
748 
749 	ASSERT(ill != NULL && ill->ill_isv6);
750 	ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
751 	if (!IN6_IS_ADDR_MULTICAST(addr))
752 		return (NULL);
753 	nce = ipst->ips_ndp6->nce_mask_entries;
754 	for (; nce != NULL; nce = nce->nce_next)
755 		if (nce->nce_ill == ill &&
756 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
757 			mutex_enter(&nce->nce_lock);
758 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
759 				NCE_REFHOLD_LOCKED(nce);
760 				mutex_exit(&nce->nce_lock);
761 				break;
762 			}
763 			mutex_exit(&nce->nce_lock);
764 		}
765 	return (nce);
766 }
767 
768 /*
769  * Process passed in parameters either from an incoming packet or via
770  * user ioctl.
771  */
772 void
773 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
774 {
775 	ill_t	*ill = nce->nce_ill;
776 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
777 	mblk_t	*mp;
778 	boolean_t ll_updated = B_FALSE;
779 	boolean_t ll_changed;
780 	ip_stack_t	*ipst = ill->ill_ipst;
781 
782 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
783 	/*
784 	 * No updates of link layer address or the neighbor state is
785 	 * allowed, when the cache is in NONUD state.  This still
786 	 * allows for responding to reachability solicitation.
787 	 */
788 	mutex_enter(&nce->nce_lock);
789 	if (nce->nce_state == ND_INCOMPLETE) {
790 		if (hw_addr == NULL) {
791 			mutex_exit(&nce->nce_lock);
792 			return;
793 		}
794 		nce_set_ll(nce, hw_addr);
795 		/*
796 		 * Update nce state and send the queued packets
797 		 * back to ip this time ire will be added.
798 		 */
799 		if (flag & ND_NA_FLAG_SOLICITED) {
800 			nce_update(nce, ND_REACHABLE, NULL);
801 		} else {
802 			nce_update(nce, ND_STALE, NULL);
803 		}
804 		mutex_exit(&nce->nce_lock);
805 		nce_fastpath(nce);
806 		mutex_enter(&nce->nce_lock);
807 		mp = nce->nce_qd_mp;
808 		nce->nce_qd_mp = NULL;
809 		mutex_exit(&nce->nce_lock);
810 		while (mp != NULL) {
811 			mblk_t *nxt_mp, *data_mp;
812 
813 			nxt_mp = mp->b_next;
814 			mp->b_next = NULL;
815 
816 			if (mp->b_datap->db_type == M_CTL)
817 				data_mp = mp->b_cont;
818 			else
819 				data_mp = mp;
820 			if (data_mp->b_prev != NULL) {
821 				ill_t   *inbound_ill;
822 				queue_t *fwdq = NULL;
823 				uint_t ifindex;
824 
825 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
826 				inbound_ill = ill_lookup_on_ifindex(ifindex,
827 				    B_TRUE, NULL, NULL, NULL, NULL, ipst);
828 				if (inbound_ill == NULL) {
829 					data_mp->b_prev = NULL;
830 					freemsg(mp);
831 					return;
832 				} else {
833 					fwdq = inbound_ill->ill_rq;
834 				}
835 				data_mp->b_prev = NULL;
836 				/*
837 				 * Send a forwarded packet back into ip_rput_v6
838 				 * just as in ire_send_v6().
839 				 * Extract the queue from b_prev (set in
840 				 * ip_rput_data_v6).
841 				 */
842 				if (fwdq != NULL) {
843 					/*
844 					 * Forwarded packets hop count will
845 					 * get decremented in ip_rput_data_v6
846 					 */
847 					if (data_mp != mp)
848 						freeb(mp);
849 					put(fwdq, data_mp);
850 				} else {
851 					/*
852 					 * Send locally originated packets back
853 					 * into * ip_wput_v6.
854 					 */
855 					put(ill->ill_wq, mp);
856 				}
857 				ill_refrele(inbound_ill);
858 			} else {
859 				put(ill->ill_wq, mp);
860 			}
861 			mp = nxt_mp;
862 		}
863 		return;
864 	}
865 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
866 	if (!is_adv) {
867 		/* If this is a SOLICITATION request only */
868 		if (ll_changed)
869 			nce_update(nce, ND_STALE, hw_addr);
870 		mutex_exit(&nce->nce_lock);
871 		return;
872 	}
873 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
874 		/* If in any other state than REACHABLE, ignore */
875 		if (nce->nce_state == ND_REACHABLE) {
876 			nce_update(nce, ND_STALE, NULL);
877 		}
878 		mutex_exit(&nce->nce_lock);
879 		return;
880 	} else {
881 		if (ll_changed) {
882 			nce_update(nce, ND_UNCHANGED, hw_addr);
883 			ll_updated = B_TRUE;
884 		}
885 		if (flag & ND_NA_FLAG_SOLICITED) {
886 			nce_update(nce, ND_REACHABLE, NULL);
887 		} else {
888 			if (ll_updated) {
889 				nce_update(nce, ND_STALE, NULL);
890 			}
891 		}
892 		mutex_exit(&nce->nce_lock);
893 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
894 		    NCE_F_ISROUTER)) {
895 			ire_t *ire;
896 
897 			/*
898 			 * Router turned to host.  We need to remove the
899 			 * entry as well as any default route that may be
900 			 * using this as a next hop.  This is required by
901 			 * section 7.2.5 of RFC 2461.
902 			 */
903 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
904 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
905 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
906 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
907 			    MATCH_IRE_DEFAULT, ipst);
908 			if (ire != NULL) {
909 				ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
910 				ire_delete(ire);
911 				ire_refrele(ire);
912 			}
913 			ndp_delete(nce);
914 		}
915 	}
916 }
917 
918 /*
919  * Pass arg1 to the pfi supplied, along with each nce in existence.
920  * ndp_walk() places a REFHOLD on the nce and drops the lock when
921  * walking the hash list.
922  */
923 void
924 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
925     boolean_t trace)
926 {
927 
928 	nce_t	*nce;
929 	nce_t	*nce1;
930 	nce_t	**ncep;
931 	nce_t	*free_nce_list = NULL;
932 
933 	mutex_enter(&ndp->ndp_g_lock);
934 	/* Prevent ndp_delete from unlink and free of NCE */
935 	ndp->ndp_g_walker++;
936 	mutex_exit(&ndp->ndp_g_lock);
937 	for (ncep = ndp->nce_hash_tbl;
938 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
939 		for (nce = *ncep; nce != NULL; nce = nce1) {
940 			nce1 = nce->nce_next;
941 			if (ill == NULL || nce->nce_ill == ill) {
942 				if (trace) {
943 					NCE_REFHOLD(nce);
944 					(*pfi)(nce, arg1);
945 					NCE_REFRELE(nce);
946 				} else {
947 					NCE_REFHOLD_NOTR(nce);
948 					(*pfi)(nce, arg1);
949 					NCE_REFRELE_NOTR(nce);
950 				}
951 			}
952 		}
953 	}
954 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
955 		nce1 = nce->nce_next;
956 		if (ill == NULL || nce->nce_ill == ill) {
957 			if (trace) {
958 				NCE_REFHOLD(nce);
959 				(*pfi)(nce, arg1);
960 				NCE_REFRELE(nce);
961 			} else {
962 				NCE_REFHOLD_NOTR(nce);
963 				(*pfi)(nce, arg1);
964 				NCE_REFRELE_NOTR(nce);
965 			}
966 		}
967 	}
968 	mutex_enter(&ndp->ndp_g_lock);
969 	ndp->ndp_g_walker--;
970 	/*
971 	 * While NCE's are removed from global list they are placed
972 	 * in a private list, to be passed to nce_ire_delete_list().
973 	 * The reason is, there may be ires pointing to this nce
974 	 * which needs to cleaned up.
975 	 */
976 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
977 		/* Time to delete condemned entries */
978 		for (ncep = ndp->nce_hash_tbl;
979 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
980 			nce = *ncep;
981 			if (nce != NULL) {
982 				nce_remove(ndp, nce, &free_nce_list);
983 			}
984 		}
985 		nce = ndp->nce_mask_entries;
986 		if (nce != NULL) {
987 			nce_remove(ndp, nce, &free_nce_list);
988 		}
989 		ndp->ndp_g_walker_cleanup = B_FALSE;
990 	}
991 
992 	mutex_exit(&ndp->ndp_g_lock);
993 
994 	if (free_nce_list != NULL) {
995 		nce_ire_delete_list(free_nce_list);
996 	}
997 }
998 
999 /*
1000  * Walk everything.
1001  * Note that ill can be NULL hence can't derive the ipst from it.
1002  */
1003 void
1004 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
1005 {
1006 	ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1007 	ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1008 }
1009 
/*
 * Process resolve requests.  Handles both mapped entries
 * as well as cases that need to be sent out on the wire.
 * Look up an NCE for a given IRE.  Regardless of whether one exists
 * or one is created, we defer making the ire point to the nce until the
 * ire is actually added, at which point the nce_refcnt on the nce is
 * incremented.  This is done primarily to have symmetry between ire_add()
 * and ire_delete(), which decrements the nce_refcnt when an ire is deleted.
 */
1019 int
1020 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1021 {
1022 	nce_t		*nce;
1023 	int		err = 0;
1024 	uint32_t	ms;
1025 	mblk_t		*mp_nce = NULL;
1026 	ip_stack_t	*ipst = ill->ill_ipst;
1027 
1028 	ASSERT(ill->ill_isv6);
1029 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1030 		err = nce_set_multicast(ill, dst);
1031 		return (err);
1032 	}
1033 	err = ndp_lookup_then_add_v6(ill,
1034 	    NULL,	/* No hardware address */
1035 	    dst,
1036 	    &ipv6_all_ones,
1037 	    &ipv6_all_zeros,
1038 	    0,
1039 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1040 	    ND_INCOMPLETE,
1041 	    &nce);
1042 
1043 	switch (err) {
1044 	case 0:
1045 		/*
1046 		 * New cache entry was created. Make sure that the state
1047 		 * is not ND_INCOMPLETE. It can be in some other state
1048 		 * even before we send out the solicitation as we could
1049 		 * get un-solicited advertisements.
1050 		 *
1051 		 * If this is an XRESOLV interface, simply return 0,
1052 		 * since we don't want to solicit just yet.
1053 		 */
1054 		if (ill->ill_flags & ILLF_XRESOLV) {
1055 			NCE_REFRELE(nce);
1056 			return (0);
1057 		}
1058 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1059 		mutex_enter(&nce->nce_lock);
1060 		if (nce->nce_state != ND_INCOMPLETE) {
1061 			mutex_exit(&nce->nce_lock);
1062 			rw_exit(&ipst->ips_ill_g_lock);
1063 			NCE_REFRELE(nce);
1064 			return (0);
1065 		}
1066 		mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1067 		if (mp_nce == NULL) {
1068 			/* The caller will free mp */
1069 			mutex_exit(&nce->nce_lock);
1070 			rw_exit(&ipst->ips_ill_g_lock);
1071 			ndp_delete(nce);
1072 			NCE_REFRELE(nce);
1073 			return (ENOMEM);
1074 		}
1075 		ms = nce_solicit(nce, mp_nce);
1076 		rw_exit(&ipst->ips_ill_g_lock);
1077 		if (ms == 0) {
1078 			/* The caller will free mp */
1079 			if (mp_nce != mp)
1080 				freeb(mp_nce);
1081 			mutex_exit(&nce->nce_lock);
1082 			ndp_delete(nce);
1083 			NCE_REFRELE(nce);
1084 			return (EBUSY);
1085 		}
1086 		mutex_exit(&nce->nce_lock);
1087 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1088 		NCE_REFRELE(nce);
1089 		return (EINPROGRESS);
1090 	case EEXIST:
1091 		/* Resolution in progress just queue the packet */
1092 		mutex_enter(&nce->nce_lock);
1093 		if (nce->nce_state == ND_INCOMPLETE) {
1094 			mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
1095 			if (mp_nce == NULL) {
1096 				err = ENOMEM;
1097 			} else {
1098 				nce_queue_mp(nce, mp_nce);
1099 				err = EINPROGRESS;
1100 			}
1101 		} else {
1102 			/*
1103 			 * Any other state implies we have
1104 			 * a nce but IRE needs to be added ...
1105 			 * ire_add_v6() will take care of the
1106 			 * the case when the nce becomes CONDEMNED
1107 			 * before the ire is added to the table.
1108 			 */
1109 			err = 0;
1110 		}
1111 		mutex_exit(&nce->nce_lock);
1112 		NCE_REFRELE(nce);
1113 		break;
1114 	default:
1115 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1116 		break;
1117 	}
1118 	return (err);
1119 }
1120 
1121 /*
1122  * When there is no resolver, the link layer template is passed in
1123  * the IRE.
1124  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1125  * or one is created, we defer making ire point to nce until the
1126  * ire is actually added at which point the nce_refcnt on the nce is
1127  * incremented.  This is done primarily to have symmetry between ire_add()
1128  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1129  */
1130 int
1131 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1132 {
1133 	nce_t		*nce;
1134 	int		err = 0;
1135 
1136 	ASSERT(ill != NULL);
1137 	ASSERT(ill->ill_isv6);
1138 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1139 		err = nce_set_multicast(ill, dst);
1140 		return (err);
1141 	}
1142 
1143 	err = ndp_lookup_then_add_v6(ill,
1144 	    NULL,	/* hardware address */
1145 	    dst,
1146 	    &ipv6_all_ones,
1147 	    &ipv6_all_zeros,
1148 	    0,
1149 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1150 	    ND_REACHABLE,
1151 	    &nce);
1152 
1153 	switch (err) {
1154 	case 0:
1155 		/*
1156 		 * Cache entry with a proper resolver cookie was
1157 		 * created.
1158 		 */
1159 		NCE_REFRELE(nce);
1160 		break;
1161 	case EEXIST:
1162 		err = 0;
1163 		NCE_REFRELE(nce);
1164 		break;
1165 	default:
1166 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1167 		break;
1168 	}
1169 	return (err);
1170 }
1171 
1172 /*
1173  * For each interface an entry is added for the unspecified multicast group.
1174  * Here that mapping is used to form the multicast cache entry for a particular
1175  * multicast destination.
1176  */
1177 static int
1178 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1179 {
1180 	nce_t		*mnce;	/* Multicast mapping entry */
1181 	nce_t		*nce;
1182 	uchar_t		*hw_addr = NULL;
1183 	int		err = 0;
1184 	ip_stack_t	*ipst = ill->ill_ipst;
1185 
1186 	ASSERT(ill != NULL);
1187 	ASSERT(ill->ill_isv6);
1188 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1189 
1190 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1191 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
1192 	nce = nce_lookup_addr(ill, dst, nce);
1193 	if (nce != NULL) {
1194 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1195 		NCE_REFRELE(nce);
1196 		return (0);
1197 	}
1198 	/* No entry, now lookup for a mapping this should never fail */
1199 	mnce = nce_lookup_mapping(ill, dst);
1200 	if (mnce == NULL) {
1201 		/* Something broken for the interface. */
1202 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1203 		return (ESRCH);
1204 	}
1205 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1206 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1207 		/*
1208 		 * For IRE_IF_RESOLVER a hardware mapping can be
1209 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1210 		 * in the ill is copied in ndp_add_v6().
1211 		 */
1212 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1213 		if (hw_addr == NULL) {
1214 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1215 			NCE_REFRELE(mnce);
1216 			return (ENOMEM);
1217 		}
1218 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1219 	}
1220 	NCE_REFRELE(mnce);
1221 	/*
1222 	 * IRE_IF_NORESOLVER type simply copies the resolution
1223 	 * cookie passed in.  So no hw_addr is needed.
1224 	 */
1225 	err = ndp_add_v6(ill,
1226 	    hw_addr,
1227 	    dst,
1228 	    &ipv6_all_ones,
1229 	    &ipv6_all_zeros,
1230 	    0,
1231 	    NCE_F_NONUD,
1232 	    ND_REACHABLE,
1233 	    &nce);
1234 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1235 	if (hw_addr != NULL)
1236 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1237 	if (err != 0) {
1238 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1239 		return (err);
1240 	}
1241 	NCE_REFRELE(nce);
1242 	return (0);
1243 }
1244 
1245 /*
1246  * Return the link layer address, and any flags of a nce.
1247  */
1248 int
1249 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1250 {
1251 	nce_t		*nce;
1252 	in6_addr_t	*addr;
1253 	sin6_t		*sin6;
1254 	dl_unitdata_req_t	*dl;
1255 
1256 	ASSERT(ill != NULL && ill->ill_isv6);
1257 	sin6 = (sin6_t *)&lnr->lnr_addr;
1258 	addr =  &sin6->sin6_addr;
1259 
1260 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1261 	if (nce == NULL)
1262 		return (ESRCH);
1263 	/* If in INCOMPLETE state, no link layer address is available yet */
1264 	if (nce->nce_state == ND_INCOMPLETE)
1265 		goto done;
1266 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1267 	if (ill->ill_flags & ILLF_XRESOLV)
1268 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1269 	else
1270 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1271 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1272 	    sizeof (lnr->lnr_hdw_addr));
1273 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1274 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1275 	if (nce->nce_flags & NCE_F_ISROUTER)
1276 		lnr->lnr_flags = NDF_ISROUTER_ON;
1277 	if (nce->nce_flags & NCE_F_ANYCAST)
1278 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1279 done:
1280 	NCE_REFRELE(nce);
1281 	return (0);
1282 }
1283 
1284 /*
1285  * Send Enable/Disable multicast reqs to driver.
1286  */
1287 int
1288 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1289     uint32_t hw_addr_offset, mblk_t *mp)
1290 {
1291 	nce_t		*nce;
1292 	uchar_t		*hw_addr;
1293 	ip_stack_t	*ipst = ill->ill_ipst;
1294 
1295 	ASSERT(ill != NULL && ill->ill_isv6);
1296 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1297 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1298 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1299 		freemsg(mp);
1300 		return (EINVAL);
1301 	}
1302 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1303 	nce = nce_lookup_mapping(ill, addr);
1304 	if (nce == NULL) {
1305 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1306 		freemsg(mp);
1307 		return (ESRCH);
1308 	}
1309 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1310 	/*
1311 	 * Update dl_addr_length and dl_addr_offset for primitives that
1312 	 * have physical addresses as opposed to full saps
1313 	 */
1314 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1315 	case DL_ENABMULTI_REQ:
1316 		/* Track the state if this is the first enabmulti */
1317 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1318 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1319 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1320 		break;
1321 	case DL_DISABMULTI_REQ:
1322 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1323 		break;
1324 	default:
1325 		NCE_REFRELE(nce);
1326 		ip1dbg(("ndp_mcastreq: default\n"));
1327 		return (EINVAL);
1328 	}
1329 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1330 	NCE_REFRELE(nce);
1331 	ill_dlpi_send(ill, mp);
1332 	return (0);
1333 }
1334 
/*
 * Send a neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt, and
 * holds ips_ill_g_lock as reader (both are asserted below).
 *
 * If mp is non-NULL it is queued on the nce and its source address is used
 * as a hint for the solicitation's source; if mp is NULL the message already
 * queued on the nce (nce_qd_mp) is consulted instead.
 *
 * NOTE: This routine drops nce_lock and ips_ill_g_lock (and later
 * reacquires them) when sending the packet.
 * NOTE: This routine does not consume mp.
 */
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
	ill_t		*ill;
	ill_t		*src_ill;
	ip6_t		*ip6h;
	in6_addr_t	src;
	in6_addr_t	dst;
	ipif_t		*ipif;
	ip6i_t		*ip6i;
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;

	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ill = nce->nce_ill;
	ASSERT(ill != NULL);

	/* No retransmissions left: tell the caller to abort. */
	if (nce->nce_rcnt == 0) {
		return (0);
	}

	if (mp == NULL) {
		ASSERT(nce->nce_qd_mp != NULL);
		mp = nce->nce_qd_mp;
	} else {
		nce_queue_mp(nce, mp);
	}

	/* Handle ip_newroute_v6 giving us IPSEC packets */
	if (mp->b_datap->db_type == M_CTL)
		mp = mp->b_cont;

	ip6h = (ip6_t *)mp->b_rptr;
	if (ip6h->ip6_nxt == IPPROTO_RAW) {
		/*
		 * This message should have been pulled up already in
		 * ip_wput_v6. We can't do pullups here because the message
		 * could be from the nce_qd_mp which could have b_next/b_prev
		 * non-NULL.
		 */
		ip6i = (ip6i_t *)ip6h;
		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
		    sizeof (ip6i_t) + IPV6_HDR_LEN);
		/* The real IPv6 header follows the ip6i_t. */
		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	}
	src = ip6h->ip6_src;
	/*
	 * If the src of outgoing packet is one of the assigned interface
	 * addresses use it, otherwise we will pick the source address below.
	 */
	src_ill = ill;
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* Search every ill of the group when the ill belongs to one. */
		if (ill->ill_group != NULL)
			src_ill = ill->ill_group->illgrp_ill;
		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
			for (ipif = src_ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				if (IN6_ARE_ADDR_EQUAL(&src,
				    &ipif->ipif_v6lcl_addr)) {
					break;
				}
			}
			if (ipif != NULL)
				break;
		}
		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and let nce_xmit pick the source.
		 * If an ipif can be found, but it's not yet done with DAD
		 * verification, then just postpone this transmission until
		 * later (nce_rcnt is left untouched in that case).
		 */
		if (src_ill == NULL)
			src = ipv6_all_zeros;
		else if (!ipif->ipif_addr_ready)
			return (ill->ill_reachable_retrans_time);
	}
	dst = nce->nce_addr;
	/*
	 * If source address is unspecified, nce_xmit will choose
	 * one for us and initialize the hardware address also
	 * appropriately.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&src))
		src_ill = NULL;
	/* Account for this attempt before the locks are dropped. */
	nce->nce_rcnt--;
	mutex_exit(&nce->nce_lock);
	rw_exit(&ipst->ips_ill_g_lock);
	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
	    &dst, 0);
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	/* A dropped transmission does not count against the retry budget. */
	if (dropped)
		nce->nce_rcnt++;
	return (ill->ill_reachable_retrans_time);
}
1441 
1442 /*
1443  * Attempt to recover an address on an interface that's been marked as a
1444  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1445  * no easy way to just probe the address and have the right thing happen if
1446  * it's no longer in use.  Instead, we just bring it up normally and allow the
1447  * regular interface start-up logic to probe for a remaining duplicate and take
1448  * us back down if necessary.
1449  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1450  * ip_ndp_excl.
1451  */
1452 /* ARGSUSED */
1453 static void
1454 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1455 {
1456 	ill_t	*ill = rq->q_ptr;
1457 	ipif_t	*ipif;
1458 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1459 
1460 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1461 		/*
1462 		 * We do not support recovery of proxy ARP'd interfaces,
1463 		 * because the system lacks a complete proxy ARP mechanism.
1464 		 */
1465 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1466 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1467 			continue;
1468 		}
1469 
1470 		/*
1471 		 * If we have already recovered or if the interface is going
1472 		 * away, then ignore.
1473 		 */
1474 		mutex_enter(&ill->ill_lock);
1475 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1476 		    (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) {
1477 			mutex_exit(&ill->ill_lock);
1478 			continue;
1479 		}
1480 
1481 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1482 		ill->ill_ipif_dup_count--;
1483 		mutex_exit(&ill->ill_lock);
1484 		ipif->ipif_was_dup = B_TRUE;
1485 
1486 		if (ipif_ndp_up(ipif) != EINPROGRESS)
1487 			(void) ipif_up_done_v6(ipif);
1488 	}
1489 	freeb(mp);
1490 }
1491 
1492 /*
1493  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1494  * As long as someone else holds the address, the interface will stay down.
1495  * When that conflict goes away, the interface is brought back up.  This is
1496  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1497  * server will recover from a failure.
1498  *
1499  * For DHCP and temporary addresses, recovery is not done in the kernel.
1500  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1501  *
1502  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1503  */
1504 static void
1505 ipif6_dup_recovery(void *arg)
1506 {
1507 	ipif_t *ipif = arg;
1508 
1509 	ipif->ipif_recovery_id = 0;
1510 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1511 		return;
1512 
1513 	/*
1514 	 * No lock, because this is just an optimization.
1515 	 */
1516 	if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))
1517 		return;
1518 
1519 	/* If the link is down, we'll retry this later */
1520 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1521 		return;
1522 
1523 	ndp_do_recovery(ipif);
1524 }
1525 
1526 /*
1527  * Perform interface recovery by forcing the duplicate interfaces up and
1528  * allowing the system to determine which ones should stay up.
1529  *
1530  * Called both by recovery timer expiry and link-up notification.
1531  */
1532 void
1533 ndp_do_recovery(ipif_t *ipif)
1534 {
1535 	ill_t *ill = ipif->ipif_ill;
1536 	mblk_t *mp;
1537 	ip_stack_t *ipst = ill->ill_ipst;
1538 
1539 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1540 	if (mp == NULL) {
1541 		mutex_enter(&ill->ill_lock);
1542 		if (ipif->ipif_recovery_id == 0 &&
1543 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1544 		    IPIF_CONDEMNED))) {
1545 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1546 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1547 		}
1548 		mutex_exit(&ill->ill_lock);
1549 	} else {
1550 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1551 		    sizeof (ipif->ipif_v6lcl_addr));
1552 		ill_refhold(ill);
1553 		qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
1554 		    B_FALSE);
1555 	}
1556 }
1557 
1558 /*
1559  * Find the solicitation in the given message, and extract printable details
1560  * (MAC and IP addresses) from it.
1561  */
1562 static nd_neighbor_solicit_t *
1563 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1564     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1565 {
1566 	nd_neighbor_solicit_t *ns;
1567 	ip6_t *ip6h;
1568 	uchar_t *addr;
1569 	int alen;
1570 
1571 	alen = 0;
1572 	ip6h = (ip6_t *)mp->b_rptr;
1573 	if (dl_mp == NULL) {
1574 		nd_opt_hdr_t *opt;
1575 		int nslen;
1576 
1577 		/*
1578 		 * If it's from the fast-path, then it can't be a probe
1579 		 * message, and thus must include the source linkaddr option.
1580 		 * Extract that here.
1581 		 */
1582 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1583 		nslen = mp->b_wptr - (uchar_t *)ns;
1584 		if ((nslen -= sizeof (*ns)) > 0) {
1585 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1586 			    ND_OPT_SOURCE_LINKADDR);
1587 			if (opt != NULL &&
1588 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1589 			    ill->ill_nd_lla_len) {
1590 				addr = (uchar_t *)(opt + 1);
1591 				alen = ill->ill_nd_lla_len;
1592 			}
1593 		}
1594 		/*
1595 		 * We cheat a bit here for the sake of printing usable log
1596 		 * messages in the rare case where the reply we got was unicast
1597 		 * without a source linkaddr option, and the interface is in
1598 		 * fastpath mode.  (Sigh.)
1599 		 */
1600 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1601 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1602 			struct ether_header *pether;
1603 
1604 			pether = (struct ether_header *)((char *)ip6h -
1605 			    sizeof (*pether));
1606 			addr = pether->ether_shost.ether_addr_octet;
1607 			alen = ETHERADDRL;
1608 		}
1609 	} else {
1610 		dl_unitdata_ind_t *dlu;
1611 
1612 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1613 		alen = dlu->dl_src_addr_length;
1614 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1615 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1616 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1617 			if (ill->ill_sap_length < 0) {
1618 				alen += ill->ill_sap_length;
1619 			} else {
1620 				addr += ill->ill_sap_length;
1621 				alen -= ill->ill_sap_length;
1622 			}
1623 		}
1624 	}
1625 	if (alen > 0) {
1626 		*haddr = addr;
1627 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1628 	} else {
1629 		*haddr = NULL;
1630 		(void) strcpy(hbuf, "?");
1631 	}
1632 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1633 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1634 	return (ns);
1635 }
1636 
1637 /*
1638  * This is for exclusive changes due to NDP duplicate address detection
1639  * failure.
1640  */
1641 /* ARGSUSED */
1642 static void
1643 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1644 {
1645 	ill_t	*ill = rq->q_ptr;
1646 	ipif_t	*ipif;
1647 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1648 	char hbuf[MAC_STR_LEN];
1649 	char sbuf[INET6_ADDRSTRLEN];
1650 	nd_neighbor_solicit_t *ns;
1651 	mblk_t *dl_mp = NULL;
1652 	uchar_t *haddr;
1653 	ip_stack_t *ipst = ill->ill_ipst;
1654 
1655 	if (DB_TYPE(mp) != M_DATA) {
1656 		dl_mp = mp;
1657 		mp = mp->b_cont;
1658 	}
1659 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1660 	    sizeof (sbuf), &haddr);
1661 	if (haddr != NULL &&
1662 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1663 		/*
1664 		 * Ignore conflicts generated by misbehaving switches that just
1665 		 * reflect our own messages back to us.
1666 		 */
1667 		goto ignore_conflict;
1668 	}
1669 
1670 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1671 
1672 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1673 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1674 		    &ns->nd_ns_target)) {
1675 			continue;
1676 		}
1677 
1678 		/* If it's already marked, then don't do anything. */
1679 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1680 			continue;
1681 
1682 		/*
1683 		 * If this is a failure during duplicate recovery, then don't
1684 		 * complain.  It may take a long time to recover.
1685 		 */
1686 		if (!ipif->ipif_was_dup) {
1687 			ipif_get_name(ipif, ibuf, sizeof (ibuf));
1688 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1689 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1690 		}
1691 		mutex_enter(&ill->ill_lock);
1692 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1693 		ipif->ipif_flags |= IPIF_DUPLICATE;
1694 		ill->ill_ipif_dup_count++;
1695 		mutex_exit(&ill->ill_lock);
1696 		(void) ipif_down(ipif, NULL, NULL);
1697 		ipif_down_tail(ipif);
1698 		mutex_enter(&ill->ill_lock);
1699 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1700 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1701 		    !(ipif->ipif_state_flags & (IPIF_MOVING |
1702 		    IPIF_CONDEMNED)) &&
1703 		    ipst->ips_ip_dup_recovery > 0) {
1704 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1705 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1706 		}
1707 		mutex_exit(&ill->ill_lock);
1708 	}
1709 ignore_conflict:
1710 	if (dl_mp != NULL)
1711 		freeb(dl_mp);
1712 	freemsg(mp);
1713 }
1714 
1715 /*
1716  * Handle failure by tearing down the ipifs with the specified address.  Note
1717  * that tearing down the ipif also means deleting the nce through ipif_down, so
1718  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1719  * we start a timer on the ipif.
1720  */
1721 static void
1722 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1723 {
1724 	if ((mp = copymsg(mp)) != NULL) {
1725 		if (dl_mp == NULL)
1726 			dl_mp = mp;
1727 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1728 			dl_mp->b_cont = mp;
1729 		if (dl_mp == NULL) {
1730 			freemsg(mp);
1731 		} else {
1732 			ill_refhold(ill);
1733 			qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
1734 			    B_FALSE);
1735 		}
1736 	}
1737 	ndp_delete(nce);
1738 }
1739 
1740 /*
1741  * Handle a discovered conflict: some other system is advertising that it owns
1742  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1743  * interface.
1744  */
1745 static void
1746 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1747 {
1748 	ipif_t *ipif;
1749 	uint32_t now;
1750 	uint_t maxdefense;
1751 	uint_t defs;
1752 	ip_stack_t *ipst = ill->ill_ipst;
1753 
1754 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1755 	    NULL, NULL, ipst);
1756 	if (ipif == NULL)
1757 		return;
1758 	/*
1759 	 * First, figure out if this address is disposable.
1760 	 */
1761 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1762 		maxdefense = ipst->ips_ip_max_temp_defend;
1763 	else
1764 		maxdefense = ipst->ips_ip_max_defend;
1765 
1766 	/*
1767 	 * Now figure out how many times we've defended ourselves.  Ignore
1768 	 * defenses that happened long in the past.
1769 	 */
1770 	now = gethrestime_sec();
1771 	mutex_enter(&nce->nce_lock);
1772 	if ((defs = nce->nce_defense_count) > 0 &&
1773 	    now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
1774 		nce->nce_defense_count = defs = 0;
1775 	}
1776 	nce->nce_defense_count++;
1777 	nce->nce_defense_time = now;
1778 	mutex_exit(&nce->nce_lock);
1779 	ipif_refrele(ipif);
1780 
1781 	/*
1782 	 * If we've defended ourselves too many times already, then give up and
1783 	 * tear down the interface(s) using this address.  Otherwise, defend by
1784 	 * sending out an unsolicited Neighbor Advertisement.
1785 	 */
1786 	if (defs >= maxdefense) {
1787 		ip_ndp_failure(ill, mp, dl_mp, nce);
1788 	} else {
1789 		char hbuf[MAC_STR_LEN];
1790 		char sbuf[INET6_ADDRSTRLEN];
1791 		uchar_t *haddr;
1792 
1793 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1794 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1795 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1796 		    hbuf, sbuf, ill->ill_name);
1797 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1798 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1799 		    nce_advert_flags(nce));
1800 	}
1801 }
1802 
1803 static void
1804 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1805 {
1806 	nd_neighbor_solicit_t *ns;
1807 	uint32_t	hlen = ill->ill_nd_lla_len;
1808 	uchar_t		*haddr = NULL;
1809 	icmp6_t		*icmp_nd;
1810 	ip6_t		*ip6h;
1811 	nce_t		*our_nce = NULL;
1812 	in6_addr_t	target;
1813 	in6_addr_t	src;
1814 	int		len;
1815 	int		flag = 0;
1816 	nd_opt_hdr_t	*opt = NULL;
1817 	boolean_t	bad_solicit = B_FALSE;
1818 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1819 
1820 	ip6h = (ip6_t *)mp->b_rptr;
1821 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1822 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1823 	src = ip6h->ip6_src;
1824 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1825 	target = ns->nd_ns_target;
1826 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1827 		if (ip_debug > 2) {
1828 			/* ip1dbg */
1829 			pr_addr_dbg("ndp_input_solicit: Target is"
1830 			    " multicast! %s\n", AF_INET6, &target);
1831 		}
1832 		bad_solicit = B_TRUE;
1833 		goto done;
1834 	}
1835 	if (len > sizeof (nd_neighbor_solicit_t)) {
1836 		/* Options present */
1837 		opt = (nd_opt_hdr_t *)&ns[1];
1838 		len -= sizeof (nd_neighbor_solicit_t);
1839 		if (!ndp_verify_optlen(opt, len)) {
1840 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1841 			bad_solicit = B_TRUE;
1842 			goto done;
1843 		}
1844 	}
1845 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1846 		/* Check to see if this is a valid DAD solicitation */
1847 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1848 			if (ip_debug > 2) {
1849 				/* ip1dbg */
1850 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1851 				    "Destination is not solicited node "
1852 				    "multicast %s\n", AF_INET6,
1853 				    &ip6h->ip6_dst);
1854 			}
1855 			bad_solicit = B_TRUE;
1856 			goto done;
1857 		}
1858 	}
1859 
1860 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1861 	/*
1862 	 * If this is a valid Solicitation, a permanent
1863 	 * entry should exist in the cache
1864 	 */
1865 	if (our_nce == NULL ||
1866 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1867 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1868 		    "ifname=%s ", ill->ill_name));
1869 		if (ip_debug > 2) {
1870 			/* ip1dbg */
1871 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1872 		}
1873 		bad_solicit = B_TRUE;
1874 		goto done;
1875 	}
1876 
1877 	/* At this point we should have a verified NS per spec */
1878 	if (opt != NULL) {
1879 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1880 		if (opt != NULL) {
1881 			haddr = (uchar_t *)&opt[1];
1882 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1883 			    hlen == 0) {
1884 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1885 				bad_solicit = B_TRUE;
1886 				goto done;
1887 			}
1888 		}
1889 	}
1890 
1891 	/* If sending directly to peer, set the unicast flag */
1892 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1893 		flag |= NDP_UNICAST;
1894 
1895 	/*
1896 	 * Create/update the entry for the soliciting node.
1897 	 * or respond to outstanding queries, don't if
1898 	 * the source is unspecified address.
1899 	 */
1900 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1901 		int	err;
1902 		nce_t	*nnce;
1903 
1904 		ASSERT(ill->ill_isv6);
1905 		/*
1906 		 * Regular solicitations *must* include the Source Link-Layer
1907 		 * Address option.  Ignore messages that do not.
1908 		 */
1909 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1910 			ip1dbg(("ndp_input_solicit: source link-layer address "
1911 			    "option missing with a specified source.\n"));
1912 			bad_solicit = B_TRUE;
1913 			goto done;
1914 		}
1915 
1916 		/*
1917 		 * This is a regular solicitation.  If we're still in the
1918 		 * process of verifying the address, then don't respond at all
1919 		 * and don't keep track of the sender.
1920 		 */
1921 		if (our_nce->nce_state == ND_PROBE)
1922 			goto done;
1923 
1924 		/*
1925 		 * If the solicitation doesn't have sender hardware address
1926 		 * (legal for unicast solicitation), then process without
1927 		 * installing the return NCE.  Either we already know it, or
1928 		 * we'll be forced to look it up when (and if) we reply to the
1929 		 * packet.
1930 		 */
1931 		if (haddr == NULL)
1932 			goto no_source;
1933 
1934 		err = ndp_lookup_then_add_v6(ill,
1935 		    haddr,
1936 		    &src,	/* Soliciting nodes address */
1937 		    &ipv6_all_ones,
1938 		    &ipv6_all_zeros,
1939 		    0,
1940 		    0,
1941 		    ND_STALE,
1942 		    &nnce);
1943 		switch (err) {
1944 		case 0:
1945 			/* done with this entry */
1946 			NCE_REFRELE(nnce);
1947 			break;
1948 		case EEXIST:
1949 			/*
1950 			 * B_FALSE indicates this is not an
1951 			 * an advertisement.
1952 			 */
1953 			ndp_process(nnce, haddr, 0, B_FALSE);
1954 			NCE_REFRELE(nnce);
1955 			break;
1956 		default:
1957 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1958 			    err));
1959 			goto done;
1960 		}
1961 no_source:
1962 		flag |= NDP_SOLICITED;
1963 	} else {
1964 		/*
1965 		 * No source link layer address option should be present in a
1966 		 * valid DAD request.
1967 		 */
1968 		if (haddr != NULL) {
1969 			ip1dbg(("ndp_input_solicit: source link-layer address "
1970 			    "option present with an unspecified source.\n"));
1971 			bad_solicit = B_TRUE;
1972 			goto done;
1973 		}
1974 		if (our_nce->nce_state == ND_PROBE) {
1975 			/*
1976 			 * Internally looped-back probes won't have DLPI
1977 			 * attached to them.  External ones (which are sent by
1978 			 * multicast) always will.  Just ignore our own
1979 			 * transmissions.
1980 			 */
1981 			if (dl_mp != NULL) {
1982 				/*
1983 				 * If someone else is probing our address, then
1984 				 * we've crossed wires.  Declare failure.
1985 				 */
1986 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
1987 			}
1988 			goto done;
1989 		}
1990 		/*
1991 		 * This is a DAD probe.  Multicast the advertisement to the
1992 		 * all-nodes address.
1993 		 */
1994 		src = ipv6_all_hosts_mcast;
1995 	}
1996 	flag |= nce_advert_flags(our_nce);
1997 	/* Response to a solicitation */
1998 	(void) nce_xmit(ill,
1999 	    ND_NEIGHBOR_ADVERT,
2000 	    ill,	/* ill to be used for extracting ill_nd_lla */
2001 	    B_TRUE,	/* use ill_nd_lla */
2002 	    &target,	/* Source and target of the advertisement pkt */
2003 	    &src,	/* IP Destination (source of original pkt) */
2004 	    flag);
2005 done:
2006 	if (bad_solicit)
2007 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2008 	if (our_nce != NULL)
2009 		NCE_REFRELE(our_nce);
2010 }
2011 
2012 void
2013 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2014 {
2015 	nd_neighbor_advert_t *na;
2016 	uint32_t	hlen = ill->ill_nd_lla_len;
2017 	uchar_t		*haddr = NULL;
2018 	icmp6_t		*icmp_nd;
2019 	ip6_t		*ip6h;
2020 	nce_t		*dst_nce = NULL;
2021 	in6_addr_t	target;
2022 	nd_opt_hdr_t	*opt = NULL;
2023 	int		len;
2024 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2025 	ip_stack_t	*ipst = ill->ill_ipst;
2026 
2027 	ip6h = (ip6_t *)mp->b_rptr;
2028 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2029 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2030 	na = (nd_neighbor_advert_t *)icmp_nd;
2031 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2032 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2033 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2034 		    "solicited flag is not zero\n"));
2035 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2036 		return;
2037 	}
2038 	target = na->nd_na_target;
2039 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2040 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2041 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2042 		return;
2043 	}
2044 	if (len > sizeof (nd_neighbor_advert_t)) {
2045 		opt = (nd_opt_hdr_t *)&na[1];
2046 		if (!ndp_verify_optlen(opt,
2047 		    len - sizeof (nd_neighbor_advert_t))) {
2048 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2049 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2050 			return;
2051 		}
2052 		/* At this point we have a verified NA per spec */
2053 		len -= sizeof (nd_neighbor_advert_t);
2054 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2055 		if (opt != NULL) {
2056 			haddr = (uchar_t *)&opt[1];
2057 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2058 			    hlen == 0) {
2059 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2060 				BUMP_MIB(mib,
2061 				    ipv6IfIcmpInBadNeighborAdvertisements);
2062 				return;
2063 			}
2064 		}
2065 	}
2066 
2067 	/*
2068 	 * If this interface is part of the group look at all the
2069 	 * ills in the group.
2070 	 */
2071 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2072 	if (ill->ill_group != NULL)
2073 		ill = ill->ill_group->illgrp_ill;
2074 
2075 	for (; ill != NULL; ill = ill->ill_group_next) {
2076 		mutex_enter(&ill->ill_lock);
2077 		if (!ILL_CAN_LOOKUP(ill)) {
2078 			mutex_exit(&ill->ill_lock);
2079 			continue;
2080 		}
2081 		ill_refhold_locked(ill);
2082 		mutex_exit(&ill->ill_lock);
2083 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2084 		/* We have to drop the lock since ndp_process calls put* */
2085 		rw_exit(&ipst->ips_ill_g_lock);
2086 		if (dst_nce != NULL) {
2087 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2088 			    dst_nce->nce_state == ND_PROBE) {
2089 				/*
2090 				 * Someone else sent an advertisement for an
2091 				 * address that we're trying to configure.
2092 				 * Tear it down.  Note that dl_mp might be NULL
2093 				 * if we're getting a unicast reply.  This
2094 				 * isn't typically done (multicast is the norm
2095 				 * in response to a probe), but ip_ndp_failure
2096 				 * will handle the dl_mp == NULL case as well.
2097 				 */
2098 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2099 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2100 				/*
2101 				 * Someone just announced one of our local
2102 				 * addresses.  If it wasn't us, then this is a
2103 				 * conflict.  Defend the address or shut it
2104 				 * down.
2105 				 */
2106 				if (dl_mp != NULL &&
2107 				    (haddr == NULL ||
2108 				    nce_cmp_ll_addr(dst_nce, haddr,
2109 				    ill->ill_nd_lla_len))) {
2110 					ip_ndp_conflict(ill, mp, dl_mp,
2111 					    dst_nce);
2112 				}
2113 			} else {
2114 				if (na->nd_na_flags_reserved &
2115 				    ND_NA_FLAG_ROUTER) {
2116 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2117 				}
2118 				/* B_TRUE indicates this an advertisement */
2119 				ndp_process(dst_nce, haddr,
2120 				    na->nd_na_flags_reserved, B_TRUE);
2121 			}
2122 			NCE_REFRELE(dst_nce);
2123 		}
2124 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2125 		ill_refrele(ill);
2126 	}
2127 	rw_exit(&ipst->ips_ill_g_lock);
2128 }
2129 
2130 /*
2131  * Process NDP neighbor solicitation/advertisement messages.
2132  * The checksum has already checked o.k before reaching here.
2133  */
2134 void
2135 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2136 {
2137 	icmp6_t		*icmp_nd;
2138 	ip6_t		*ip6h;
2139 	int		len;
2140 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2141 
2142 
2143 	if (!pullupmsg(mp, -1)) {
2144 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2145 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2146 		goto done;
2147 	}
2148 	ip6h = (ip6_t *)mp->b_rptr;
2149 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2150 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2151 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2152 		goto done;
2153 	}
2154 	/*
2155 	 * NDP does not accept any extension headers between the
2156 	 * IP header and the ICMP header since e.g. a routing
2157 	 * header could be dangerous.
2158 	 * This assumes that any AH or ESP headers are removed
2159 	 * by ip prior to passing the packet to ndp_input.
2160 	 */
2161 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2162 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2163 		    ip6h->ip6_nxt));
2164 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2165 		goto done;
2166 	}
2167 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2168 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2169 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2170 	if (icmp_nd->icmp6_code != 0) {
2171 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2172 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2173 		goto done;
2174 	}
2175 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2176 	/*
2177 	 * Make sure packet length is large enough for either
2178 	 * a NS or a NA icmp packet.
2179 	 */
2180 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2181 		ip1dbg(("ndp_input: packet too short\n"));
2182 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2183 		goto done;
2184 	}
2185 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2186 		ndp_input_solicit(ill, mp, dl_mp);
2187 	} else {
2188 		ndp_input_advert(ill, mp, dl_mp);
2189 	}
2190 done:
2191 	freemsg(mp);
2192 }
2193 
2194 /*
2195  * nce_xmit is called to form and transmit a ND solicitation or
2196  * advertisement ICMP packet.
2197  *
2198  * If the source address is unspecified and this isn't a probe (used for
2199  * duplicate address detection), an appropriate source address and link layer
2200  * address will be chosen here.  The link layer address option is included if
2201  * the source is specified (i.e., all non-probe packets), and omitted (per the
2202  * specification) otherwise.
2203  *
2204  * It returns B_FALSE only if it does a successful put() to the
2205  * corresponding ill's ill_wq otherwise returns B_TRUE.
2206  */
2207 static boolean_t
2208 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2209     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2210     int flag)
2211 {
2212 	uint32_t	len;
2213 	icmp6_t 	*icmp6;
2214 	mblk_t		*mp;
2215 	ip6_t		*ip6h;
2216 	nd_opt_hdr_t	*opt;
2217 	uint_t		plen;
2218 	ip6i_t		*ip6i;
2219 	ipif_t		*src_ipif = NULL;
2220 	uint8_t		*hw_addr;
2221 	zoneid_t	zoneid = GLOBAL_ZONEID;
2222 
2223 	/*
2224 	 * If we have a unspecified source(sender) address, select a
2225 	 * proper source address for the solicitation here itself so
2226 	 * that we can initialize the h/w address correctly. This is
2227 	 * needed for interface groups as source address can come from
2228 	 * the whole group and the h/w address initialized from ill will
2229 	 * be wrong if the source address comes from a different ill.
2230 	 *
2231 	 * If the sender is specified then we use this address in order
2232 	 * to lookup the zoneid before calling ip_output_v6(). This is to
2233 	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2234 	 * by IP (we cannot guarantee that the global zone has an interface
2235 	 * route to the destination).
2236 	 *
2237 	 * Note that the NA never comes here with the unspecified source
2238 	 * address. The following asserts that whenever the source
2239 	 * address is specified, the haddr also should be specified.
2240 	 */
2241 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2242 
2243 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2244 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2245 		/*
2246 		 * Pick a source address for this solicitation, but
2247 		 * restrict the selection to addresses assigned to the
2248 		 * output interface (or interface group).  We do this
2249 		 * because the destination will create a neighbor cache
2250 		 * entry for the source address of this packet, so the
2251 		 * source address had better be a valid neighbor.
2252 		 */
2253 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2254 		    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
2255 		if (src_ipif == NULL) {
2256 			char buf[INET6_ADDRSTRLEN];
2257 
2258 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2259 			    inet_ntop(AF_INET6, (char *)target, buf,
2260 			    sizeof (buf))));
2261 			return (B_TRUE);
2262 		}
2263 		sender = &src_ipif->ipif_v6src_addr;
2264 		hwaddr_ill = src_ipif->ipif_ill;
2265 	} else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2266 		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst);
2267 		/*
2268 		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2269 		 * ALL_ZONES if it cannot find a matching ipif for the address
2270 		 * we are trying to use. In this case we err on the side of
2271 		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2272 		 */
2273 		if (zoneid == ALL_ZONES)
2274 			zoneid = GLOBAL_ZONEID;
2275 	}
2276 
2277 	/*
2278 	 * Always make sure that the NS/NA packets don't get load
2279 	 * spread. This is needed so that the probe packets sent
2280 	 * by the in.mpathd daemon can really go out on the desired
2281 	 * interface. Probe packets are made to go out on a desired
2282 	 * interface by including a ip6i with ATTACH_IF flag. As these
2283 	 * packets indirectly end up sending/receiving NS/NA packets
2284 	 * (neighbor doing NUD), we have to make sure that NA
2285 	 * also go out on the same interface.
2286 	 */
2287 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2288 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2289 	    plen * 8;
2290 	mp = allocb(len,  BPRI_LO);
2291 	if (mp == NULL) {
2292 		if (src_ipif != NULL)
2293 			ipif_refrele(src_ipif);
2294 		return (B_TRUE);
2295 	}
2296 	bzero((char *)mp->b_rptr, len);
2297 	mp->b_wptr = mp->b_rptr + len;
2298 
2299 	ip6i = (ip6i_t *)mp->b_rptr;
2300 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2301 	ip6i->ip6i_nxt = IPPROTO_RAW;
2302 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2303 	if (flag & NDP_PROBE)
2304 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2305 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2306 
2307 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2308 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2309 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2310 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2311 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2312 	ip6h->ip6_dst = *target;
2313 	icmp6 = (icmp6_t *)&ip6h[1];
2314 
2315 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2316 	    sizeof (nd_neighbor_advert_t));
2317 
2318 	if (operation == ND_NEIGHBOR_SOLICIT) {
2319 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2320 
2321 		if (!(flag & NDP_PROBE))
2322 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2323 		ip6h->ip6_src = *sender;
2324 		ns->nd_ns_target = *target;
2325 		if (!(flag & NDP_UNICAST)) {
2326 			/* Form multicast address of the target */
2327 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2328 			ip6h->ip6_dst.s6_addr32[3] |=
2329 			    ns->nd_ns_target.s6_addr32[3];
2330 		}
2331 	} else {
2332 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2333 
2334 		ASSERT(!(flag & NDP_PROBE));
2335 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2336 		ip6h->ip6_src = *sender;
2337 		na->nd_na_target = *sender;
2338 		if (flag & NDP_ISROUTER)
2339 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2340 		if (flag & NDP_SOLICITED)
2341 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2342 		if (flag & NDP_ORIDE)
2343 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2344 	}
2345 
2346 	hw_addr = NULL;
2347 	if (!(flag & NDP_PROBE)) {
2348 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2349 		    hwaddr_ill->ill_phys_addr;
2350 		if (hw_addr != NULL) {
2351 			/* Fill in link layer address and option len */
2352 			opt->nd_opt_len = (uint8_t)plen;
2353 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2354 		}
2355 	}
2356 	if (hw_addr == NULL) {
2357 		/* If there's no link layer address option, then strip it. */
2358 		len -= plen * 8;
2359 		mp->b_wptr = mp->b_rptr + len;
2360 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2361 	}
2362 
2363 	icmp6->icmp6_type = (uint8_t)operation;
2364 	icmp6->icmp6_code = 0;
2365 	/*
2366 	 * Prepare for checksum by putting icmp length in the icmp
2367 	 * checksum field. The checksum is calculated in ip_wput_v6.
2368 	 */
2369 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2370 
2371 	if (src_ipif != NULL)
2372 		ipif_refrele(src_ipif);
2373 
2374 	ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
2375 	return (B_FALSE);
2376 }
2377 
2378 /*
2379  * Make a link layer address (does not include the SAP) from an nce.
2380  * To form the link layer address, use the last four bytes of ipv6
2381  * address passed in and the fixed offset stored in nce.
2382  */
2383 static void
2384 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2385 {
2386 	uchar_t *mask, *to;
2387 	ill_t	*ill = nce->nce_ill;
2388 	int 	len;
2389 
2390 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2391 		return;
2392 	ASSERT(nce->nce_res_mp != NULL);
2393 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2394 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2395 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2396 	ASSERT(addr != NULL);
2397 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2398 	    addrpos, ill->ill_nd_lla_len);
2399 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2400 	    IPV6_ADDR_LEN);
2401 	mask = (uchar_t *)&nce->nce_extract_mask;
2402 	mask += (IPV6_ADDR_LEN - len);
2403 	addr += (IPV6_ADDR_LEN - len);
2404 	to = addrpos + nce->nce_ll_extract_start;
2405 	while (len-- > 0)
2406 		*to++ |= *mask++ & *addr++;
2407 }
2408 
2409 mblk_t *
2410 nce_udreq_alloc(ill_t *ill)
2411 {
2412 	mblk_t	*template_mp = NULL;
2413 	dl_unitdata_req_t *dlur;
2414 	int	sap_length;
2415 
2416 	ASSERT(ill->ill_isv6);
2417 
2418 	sap_length = ill->ill_sap_length;
2419 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2420 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2421 	if (template_mp == NULL)
2422 		return (NULL);
2423 
2424 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2425 	dlur->dl_priority.dl_min = 0;
2426 	dlur->dl_priority.dl_max = 0;
2427 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2428 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2429 
2430 	/* Copy in the SAP value. */
2431 	NCE_LL_SAP_COPY(ill, template_mp);
2432 
2433 	return (template_mp);
2434 }
2435 
2436 /*
2437  * NDP retransmit timer.
2438  * This timer goes off when:
2439  * a. It is time to retransmit NS for resolver.
2440  * b. It is time to send reachability probes.
2441  */
2442 void
2443 ndp_timer(void *arg)
2444 {
2445 	nce_t		*nce = arg;
2446 	ill_t		*ill = nce->nce_ill;
2447 	uint32_t	ms;
2448 	char		addrbuf[INET6_ADDRSTRLEN];
2449 	mblk_t		*mp;
2450 	boolean_t	dropped = B_FALSE;
2451 	ip_stack_t	*ipst = ill->ill_ipst;
2452 
2453 	/*
2454 	 * The timer has to be cancelled by ndp_delete before doing the final
2455 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2456 	 * until it clears the timeout_id. Before clearing the timeout_id
2457 	 * bump up the refcnt so that we can continue to use the nce
2458 	 */
2459 	ASSERT(nce != NULL);
2460 
2461 	/*
2462 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2463 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2464 	 */
2465 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2466 	mutex_enter(&nce->nce_lock);
2467 	NCE_REFHOLD_LOCKED(nce);
2468 	nce->nce_timeout_id = 0;
2469 
2470 	/*
2471 	 * Check the reachability state first.
2472 	 */
2473 	switch (nce->nce_state) {
2474 	case ND_DELAY:
2475 		rw_exit(&ipst->ips_ill_g_lock);
2476 		nce->nce_state = ND_PROBE;
2477 		mutex_exit(&nce->nce_lock);
2478 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2479 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2480 		if (ip_debug > 3) {
2481 			/* ip2dbg */
2482 			pr_addr_dbg("ndp_timer: state for %s changed "
2483 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2484 		}
2485 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2486 		NCE_REFRELE(nce);
2487 		return;
2488 	case ND_PROBE:
2489 		/* must be retransmit timer */
2490 		rw_exit(&ipst->ips_ill_g_lock);
2491 		nce->nce_pcnt--;
2492 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2493 		    nce->nce_pcnt >= -1);
2494 		if (nce->nce_pcnt > 0) {
2495 			/*
2496 			 * As per RFC2461, the nce gets deleted after
2497 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2498 			 * Note that the first unicast solicitation is sent
2499 			 * during the DELAY state.
2500 			 */
2501 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2502 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2503 			    addrbuf, sizeof (addrbuf))));
2504 			mutex_exit(&nce->nce_lock);
2505 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2506 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2507 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2508 			    NDP_UNICAST);
2509 			if (dropped) {
2510 				mutex_enter(&nce->nce_lock);
2511 				nce->nce_pcnt++;
2512 				mutex_exit(&nce->nce_lock);
2513 			}
2514 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2515 		} else if (nce->nce_pcnt < 0) {
2516 			/* No hope, delete the nce */
2517 			nce->nce_state = ND_UNREACHABLE;
2518 			mutex_exit(&nce->nce_lock);
2519 			if (ip_debug > 2) {
2520 				/* ip1dbg */
2521 				pr_addr_dbg("ndp_timer: Delete IRE for"
2522 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2523 			}
2524 			ndp_delete(nce);
2525 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2526 			/* Wait RetransTimer, before deleting the entry */
2527 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2528 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2529 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2530 			mutex_exit(&nce->nce_lock);
2531 			/* Wait one interval before killing */
2532 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2533 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2534 			ipif_t *ipif;
2535 
2536 			/*
2537 			 * We're done probing, and we can now declare this
2538 			 * address to be usable.  Let IP know that it's ok to
2539 			 * use.
2540 			 */
2541 			nce->nce_state = ND_REACHABLE;
2542 			mutex_exit(&nce->nce_lock);
2543 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2544 			    ALL_ZONES, NULL, NULL, NULL, NULL, ipst);
2545 			if (ipif != NULL) {
2546 				if (ipif->ipif_was_dup) {
2547 					char ibuf[LIFNAMSIZ + 10];
2548 					char sbuf[INET6_ADDRSTRLEN];
2549 
2550 					ipif->ipif_was_dup = B_FALSE;
2551 					(void) inet_ntop(AF_INET6,
2552 					    &ipif->ipif_v6lcl_addr,
2553 					    sbuf, sizeof (sbuf));
2554 					ipif_get_name(ipif, ibuf,
2555 					    sizeof (ibuf));
2556 					cmn_err(CE_NOTE, "recovered address "
2557 					    "%s on %s", sbuf, ibuf);
2558 				}
2559 				if ((ipif->ipif_flags & IPIF_UP) &&
2560 				    !ipif->ipif_addr_ready) {
2561 					ip_rts_ifmsg(ipif);
2562 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2563 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2564 				}
2565 				ipif->ipif_addr_ready = 1;
2566 				ipif_refrele(ipif);
2567 			}
2568 			/* Begin defending our new address */
2569 			nce->nce_unsolicit_count = 0;
2570 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2571 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2572 			    nce_advert_flags(nce));
2573 			if (dropped) {
2574 				nce->nce_unsolicit_count = 1;
2575 				NDP_RESTART_TIMER(nce,
2576 				    ipst->ips_ip_ndp_unsolicit_interval);
2577 			} else if (ipst->ips_ip_ndp_defense_interval != 0) {
2578 				NDP_RESTART_TIMER(nce,
2579 				    ipst->ips_ip_ndp_defense_interval);
2580 			}
2581 		} else {
2582 			/*
2583 			 * This is an address we're probing to be our own, but
2584 			 * the ill is down.  Wait until it comes back before
2585 			 * doing anything, but switch to reachable state so
2586 			 * that the restart will work.
2587 			 */
2588 			nce->nce_state = ND_REACHABLE;
2589 			mutex_exit(&nce->nce_lock);
2590 		}
2591 		NCE_REFRELE(nce);
2592 		return;
2593 	case ND_INCOMPLETE:
2594 		/*
2595 		 * Must be resolvers retransmit timer.
2596 		 */
2597 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2598 			ip6i_t	*ip6i;
2599 			ip6_t	*ip6h;
2600 			mblk_t *data_mp;
2601 
2602 			/*
2603 			 * Walk the list of packets queued, and see if there
2604 			 * are any multipathing probe packets. Such packets
2605 			 * are always queued at the head. Since this is a
2606 			 * retransmit timer firing, mark such packets as
2607 			 * delayed in ND resolution. This info will be used
2608 			 * in ip_wput_v6(). Multipathing probe packets will
2609 			 * always have an ip6i_t. Once we hit a packet without
2610 			 * it, we can break out of this loop.
2611 			 */
2612 			if (mp->b_datap->db_type == M_CTL)
2613 				data_mp = mp->b_cont;
2614 			else
2615 				data_mp = mp;
2616 
2617 			ip6h = (ip6_t *)data_mp->b_rptr;
2618 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2619 				break;
2620 
2621 			/*
2622 			 * This message should have been pulled up already in
2623 			 * ip_wput_v6. We can't do pullups here because the
2624 			 * b_next/b_prev is non-NULL.
2625 			 */
2626 			ip6i = (ip6i_t *)ip6h;
2627 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2628 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2629 
2630 			/* Mark this packet as delayed due to ND resolution */
2631 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2632 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2633 		}
2634 		if (nce->nce_qd_mp != NULL) {
2635 			ms = nce_solicit(nce, NULL);
2636 			rw_exit(&ipst->ips_ill_g_lock);
2637 			if (ms == 0) {
2638 				if (nce->nce_state != ND_REACHABLE) {
2639 					mutex_exit(&nce->nce_lock);
2640 					nce_resolv_failed(nce);
2641 					ndp_delete(nce);
2642 				} else {
2643 					mutex_exit(&nce->nce_lock);
2644 				}
2645 			} else {
2646 				mutex_exit(&nce->nce_lock);
2647 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2648 			}
2649 			NCE_REFRELE(nce);
2650 			return;
2651 		}
2652 		mutex_exit(&nce->nce_lock);
2653 		rw_exit(&ipst->ips_ill_g_lock);
2654 		NCE_REFRELE(nce);
2655 		break;
2656 	case ND_REACHABLE :
2657 		rw_exit(&ipst->ips_ill_g_lock);
2658 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2659 		    nce->nce_unsolicit_count != 0) ||
2660 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2661 		    ipst->ips_ip_ndp_defense_interval != 0)) {
2662 			if (nce->nce_unsolicit_count > 0)
2663 				nce->nce_unsolicit_count--;
2664 			mutex_exit(&nce->nce_lock);
2665 			dropped = nce_xmit(ill,
2666 			    ND_NEIGHBOR_ADVERT,
2667 			    ill,	/* ill to be used for hw addr */
2668 			    B_FALSE,	/* use ill_phys_addr */
2669 			    &nce->nce_addr,
2670 			    &ipv6_all_hosts_mcast,
2671 			    nce_advert_flags(nce));
2672 			if (dropped) {
2673 				mutex_enter(&nce->nce_lock);
2674 				nce->nce_unsolicit_count++;
2675 				mutex_exit(&nce->nce_lock);
2676 			}
2677 			if (nce->nce_unsolicit_count != 0) {
2678 				NDP_RESTART_TIMER(nce,
2679 				    ipst->ips_ip_ndp_unsolicit_interval);
2680 			} else {
2681 				NDP_RESTART_TIMER(nce,
2682 				    ipst->ips_ip_ndp_defense_interval);
2683 			}
2684 		} else {
2685 			mutex_exit(&nce->nce_lock);
2686 		}
2687 		NCE_REFRELE(nce);
2688 		break;
2689 	default:
2690 		rw_exit(&ipst->ips_ill_g_lock);
2691 		mutex_exit(&nce->nce_lock);
2692 		NCE_REFRELE(nce);
2693 		break;
2694 	}
2695 }
2696 
2697 /*
2698  * Set a link layer address from the ll_addr passed in.
2699  * Copy SAP from ill.
2700  */
2701 static void
2702 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2703 {
2704 	ill_t	*ill = nce->nce_ill;
2705 	uchar_t	*woffset;
2706 
2707 	ASSERT(ll_addr != NULL);
2708 	/* Always called before fast_path_probe */
2709 	ASSERT(nce->nce_fp_mp == NULL);
2710 	if (ill->ill_sap_length != 0) {
2711 		/*
2712 		 * Copy the SAP type specified in the
2713 		 * request into the xmit template.
2714 		 */
2715 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2716 	}
2717 	if (ill->ill_phys_addr_length > 0) {
2718 		/*
2719 		 * The bcopy() below used to be called for the physical address
2720 		 * length rather than the link layer address length. For
2721 		 * ethernet and many other media, the phys_addr and lla are
2722 		 * identical.
2723 		 * However, with xresolv interfaces being introduced, the
2724 		 * phys_addr and lla are no longer the same, and the physical
2725 		 * address may not have any useful meaning, so we use the lla
2726 		 * for IPv6 address resolution and destination addressing.
2727 		 *
2728 		 * For PPP or other interfaces with a zero length
2729 		 * physical address, don't do anything here.
2730 		 * The bcopy() with a zero phys_addr length was previously
2731 		 * a no-op for interfaces with a zero-length physical address.
2732 		 * Using the lla for them would change the way they operate.
2733 		 * Doing nothing in such cases preserves expected behavior.
2734 		 */
2735 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2736 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2737 	}
2738 }
2739 
2740 static boolean_t
2741 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2742 {
2743 	ill_t	*ill = nce->nce_ill;
2744 	uchar_t	*ll_offset;
2745 
2746 	ASSERT(nce->nce_res_mp != NULL);
2747 	if (ll_addr == NULL)
2748 		return (B_FALSE);
2749 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2750 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2751 		return (B_TRUE);
2752 	return (B_FALSE);
2753 }
2754 
2755 /*
2756  * Updates the link layer address or the reachability state of
2757  * a cache entry.  Reset probe counter if needed.
2758  */
2759 static void
2760 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2761 {
2762 	ill_t	*ill = nce->nce_ill;
2763 	boolean_t need_stop_timer = B_FALSE;
2764 	boolean_t need_fastpath_update = B_FALSE;
2765 
2766 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2767 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2768 	/*
2769 	 * If this interface does not do NUD, there is no point
2770 	 * in allowing an update to the cache entry.  Although
2771 	 * we will respond to NS.
2772 	 * The only time we accept an update for a resolver when
2773 	 * NUD is turned off is when it has just been created.
2774 	 * Non-Resolvers will always be created as REACHABLE.
2775 	 */
2776 	if (new_state != ND_UNCHANGED) {
2777 		if ((nce->nce_flags & NCE_F_NONUD) &&
2778 		    (nce->nce_state != ND_INCOMPLETE))
2779 			return;
2780 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2781 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2782 		need_stop_timer = B_TRUE;
2783 		if (new_state == ND_REACHABLE)
2784 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2785 		else {
2786 			/* We force NUD in this case */
2787 			nce->nce_last = 0;
2788 		}
2789 		nce->nce_state = new_state;
2790 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2791 	}
2792 	/*
2793 	 * In case of fast path we need to free the the fastpath
2794 	 * M_DATA and do another probe.  Otherwise we can just
2795 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2796 	 * whatever packets that happens to be transmitting at the time.
2797 	 */
2798 	if (new_ll_addr != NULL) {
2799 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2800 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2801 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2802 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2803 		if (nce->nce_fp_mp != NULL) {
2804 			freemsg(nce->nce_fp_mp);
2805 			nce->nce_fp_mp = NULL;
2806 		}
2807 		need_fastpath_update = B_TRUE;
2808 	}
2809 	mutex_exit(&nce->nce_lock);
2810 	if (need_stop_timer) {
2811 		(void) untimeout(nce->nce_timeout_id);
2812 		nce->nce_timeout_id = 0;
2813 	}
2814 	if (need_fastpath_update)
2815 		nce_fastpath(nce);
2816 	mutex_enter(&nce->nce_lock);
2817 }
2818 
2819 void
2820 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2821 {
2822 	uint_t	count = 0;
2823 	mblk_t  **mpp;
2824 
2825 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2826 
2827 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2828 	    mpp = &(*mpp)->b_next) {
2829 		if (++count >
2830 		    nce->nce_ill->ill_max_buf) {
2831 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2832 
2833 			nce->nce_qd_mp->b_next = NULL;
2834 			nce->nce_qd_mp->b_prev = NULL;
2835 			freemsg(nce->nce_qd_mp);
2836 			nce->nce_qd_mp = tmp;
2837 		}
2838 	}
2839 	/* put this on the list */
2840 	if (head_insert) {
2841 		mp->b_next = nce->nce_qd_mp;
2842 		nce->nce_qd_mp = mp;
2843 	} else {
2844 		*mpp = mp;
2845 	}
2846 }
2847 
2848 static void
2849 nce_queue_mp(nce_t *nce, mblk_t *mp)
2850 {
2851 	boolean_t head_insert = B_FALSE;
2852 	ip6_t	*ip6h;
2853 	ip6i_t	*ip6i;
2854 	mblk_t *data_mp;
2855 
2856 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2857 
2858 	if (mp->b_datap->db_type == M_CTL)
2859 		data_mp = mp->b_cont;
2860 	else
2861 		data_mp = mp;
2862 	ip6h = (ip6_t *)data_mp->b_rptr;
2863 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2864 		/*
2865 		 * This message should have been pulled up already in
2866 		 * ip_wput_v6. We can't do pullups here because the message
2867 		 * could be from the nce_qd_mp which could have b_next/b_prev
2868 		 * non-NULL.
2869 		 */
2870 		ip6i = (ip6i_t *)ip6h;
2871 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2872 		    sizeof (ip6i_t) + IPV6_HDR_LEN);
2873 		/*
2874 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2875 		 * This has 2 aspects mentioned below.
2876 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2877 		 * This ensures that next retransmit of ND solicitation
2878 		 * will use the interface specified by the probe packet,
2879 		 * for both NS and NA. This corresponds to the src address
2880 		 * in the IPv6 packet. If we insert at tail, we will be
2881 		 * depending on the packet at the head for successful
2882 		 * ND resolution. This is not reliable, because the interface
2883 		 * on which the NA arrives could be different from the interface
2884 		 * on which the NS was sent, and if the receiving interface is
2885 		 * failed, it will appear that the sending interface is also
2886 		 * failed, causing in.mpathd to misdiagnose this as link
2887 		 * failure.
2888 		 * 2. Drop the original packet, if the ND resolution did not
2889 		 * succeed in the first attempt. However we will create the
2890 		 * nce and the ire, as soon as the ND resolution succeeds.
2891 		 * We don't gain anything by queueing multiple probe packets
2892 		 * and sending them back-to-back once resolution succeeds.
2893 		 * It is sufficient to send just 1 packet after ND resolution
2894 		 * succeeds. Since mpathd is sending down probe packets at a
2895 		 * constant rate, we don't need to send the queued packet. We
2896 		 * need to queue it only for NDP resolution. The benefit of
2897 		 * dropping the probe packets that were delayed in ND
2898 		 * resolution, is that in.mpathd will not see inflated
2899 		 * RTT. If the ND resolution does not succeed within
2900 		 * in.mpathd's failure detection time, mpathd may detect
2901 		 * a failure, and it does not matter whether the packet
2902 		 * was queued or dropped.
2903 		 */
2904 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2905 			head_insert = B_TRUE;
2906 	}
2907 
2908 	nce_queue_mp_common(nce, mp, head_insert);
2909 }
2910 
2911 /*
2912  * Called when address resolution failed due to a timeout.
2913  * Send an ICMP unreachable in response to all queued packets.
2914  */
2915 void
2916 nce_resolv_failed(nce_t *nce)
2917 {
2918 	mblk_t	*mp, *nxt_mp, *first_mp;
2919 	char	buf[INET6_ADDRSTRLEN];
2920 	ip6_t *ip6h;
2921 	zoneid_t zoneid = GLOBAL_ZONEID;
2922 	ip_stack_t	*ipst = nce->nce_ill->ill_ipst;
2923 
2924 	ip1dbg(("nce_resolv_failed: dst %s\n",
2925 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
2926 	mutex_enter(&nce->nce_lock);
2927 	mp = nce->nce_qd_mp;
2928 	nce->nce_qd_mp = NULL;
2929 	mutex_exit(&nce->nce_lock);
2930 	while (mp != NULL) {
2931 		nxt_mp = mp->b_next;
2932 		mp->b_next = NULL;
2933 		mp->b_prev = NULL;
2934 
2935 		first_mp = mp;
2936 		if (mp->b_datap->db_type == M_CTL) {
2937 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
2938 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
2939 			zoneid = io->ipsec_out_zoneid;
2940 			ASSERT(zoneid != ALL_ZONES);
2941 			mp = mp->b_cont;
2942 		}
2943 
2944 		ip6h = (ip6_t *)mp->b_rptr;
2945 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
2946 			ip6i_t *ip6i;
2947 			/*
2948 			 * This message should have been pulled up already
2949 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
2950 			 * the header is pulled up.
2951 			 */
2952 			ip6i = (ip6i_t *)ip6h;
2953 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
2954 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2955 			mp->b_rptr += sizeof (ip6i_t);
2956 		}
2957 		/*
2958 		 * Ignore failure since icmp_unreachable_v6 will silently
2959 		 * drop packets with an unspecified source address.
2960 		 */
2961 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
2962 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
2963 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
2964 		mp = nxt_mp;
2965 	}
2966 }
2967 
2968 /*
2969  * Called by SIOCSNDP* ioctl to add/change an nce entry
2970  * and the corresponding attributes.
2971  * Disallow states other than ND_REACHABLE or ND_STALE.
2972  */
2973 int
2974 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2975 {
2976 	sin6_t		*sin6;
2977 	in6_addr_t	*addr;
2978 	nce_t		*nce;
2979 	int		err;
2980 	uint16_t	new_flags = 0;
2981 	uint16_t	old_flags = 0;
2982 	int		inflags = lnr->lnr_flags;
2983 	ip_stack_t	*ipst = ill->ill_ipst;
2984 
2985 	ASSERT(ill->ill_isv6);
2986 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2987 	    (lnr->lnr_state_create != ND_STALE))
2988 		return (EINVAL);
2989 
2990 	sin6 = (sin6_t *)&lnr->lnr_addr;
2991 	addr = &sin6->sin6_addr;
2992 
2993 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2994 	/* We know it can not be mapping so just look in the hash table */
2995 	nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
2996 	nce = nce_lookup_addr(ill, addr, nce);
2997 	if (nce != NULL)
2998 		new_flags = nce->nce_flags;
2999 
3000 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3001 	case NDF_ISROUTER_ON:
3002 		new_flags |= NCE_F_ISROUTER;
3003 		break;
3004 	case NDF_ISROUTER_OFF:
3005 		new_flags &= ~NCE_F_ISROUTER;
3006 		break;
3007 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3008 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3009 		if (nce != NULL)
3010 			NCE_REFRELE(nce);
3011 		return (EINVAL);
3012 	}
3013 
3014 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3015 	case NDF_ANYCAST_ON:
3016 		new_flags |= NCE_F_ANYCAST;
3017 		break;
3018 	case NDF_ANYCAST_OFF:
3019 		new_flags &= ~NCE_F_ANYCAST;
3020 		break;
3021 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3022 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3023 		if (nce != NULL)
3024 			NCE_REFRELE(nce);
3025 		return (EINVAL);
3026 	}
3027 
3028 	if (nce == NULL) {
3029 		err = ndp_add_v6(ill,
3030 		    (uchar_t *)lnr->lnr_hdw_addr,
3031 		    addr,
3032 		    &ipv6_all_ones,
3033 		    &ipv6_all_zeros,
3034 		    0,
3035 		    new_flags,
3036 		    lnr->lnr_state_create,
3037 		    &nce);
3038 		if (err != 0) {
3039 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3040 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3041 			return (err);
3042 		}
3043 	}
3044 	old_flags = nce->nce_flags;
3045 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3046 		/*
3047 		 * Router turned to host, delete all ires.
3048 		 * XXX Just delete the entry, but we need to add too.
3049 		 */
3050 		nce->nce_flags &= ~NCE_F_ISROUTER;
3051 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3052 		ndp_delete(nce);
3053 		NCE_REFRELE(nce);
3054 		return (0);
3055 	}
3056 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3057 
3058 	mutex_enter(&nce->nce_lock);
3059 	nce->nce_flags = new_flags;
3060 	mutex_exit(&nce->nce_lock);
3061 	/*
3062 	 * Note that we ignore the state at this point, which
3063 	 * should be either STALE or REACHABLE.  Instead we let
3064 	 * the link layer address passed in to determine the state
3065 	 * much like incoming packets.
3066 	 */
3067 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3068 	NCE_REFRELE(nce);
3069 	return (0);
3070 }
3071 
3072 /*
3073  * If the device driver supports it, we make nce_fp_mp to have
3074  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3075  * The caller ensures there is hold on nce for this function.
3076  * Note that since ill_fastpath_probe() copies the mblk there is
3077  * no need for the hold beyond this function.
3078  */
3079 void
3080 nce_fastpath(nce_t *nce)
3081 {
3082 	ill_t	*ill = nce->nce_ill;
3083 	int res;
3084 
3085 	ASSERT(ill != NULL);
3086 	ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
3087 
3088 	if (nce->nce_fp_mp != NULL) {
3089 		/* Already contains fastpath info */
3090 		return;
3091 	}
3092 	if (nce->nce_res_mp != NULL) {
3093 		nce_fastpath_list_add(nce);
3094 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3095 		/*
3096 		 * EAGAIN is an indication of a transient error
3097 		 * i.e. allocation failure etc. leave the nce in the list it
3098 		 * will be updated when another probe happens for another ire
3099 		 * if not it will be taken out of the list when the ire is
3100 		 * deleted.
3101 		 */
3102 
3103 		if (res != 0 && res != EAGAIN)
3104 			nce_fastpath_list_delete(nce);
3105 	}
3106 }
3107 
3108 /*
3109  * Drain the list of nce's waiting for fastpath response.
3110  */
3111 void
3112 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3113     void *arg)
3114 {
3115 
3116 	nce_t *next_nce;
3117 	nce_t *current_nce;
3118 	nce_t *first_nce;
3119 	nce_t *prev_nce = NULL;
3120 
3121 	mutex_enter(&ill->ill_lock);
3122 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3123 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3124 		next_nce = current_nce->nce_fastpath;
3125 		/*
3126 		 * Take it off the list if we're flushing, or if the callback
3127 		 * routine tells us to do so.  Otherwise, leave the nce in the
3128 		 * fastpath list to handle any pending response from the lower
3129 		 * layer.  We can't drain the list when the callback routine
3130 		 * comparison failed, because the response is asynchronous in
3131 		 * nature, and may not arrive in the same order as the list
3132 		 * insertion.
3133 		 */
3134 		if (func == NULL || func(current_nce, arg)) {
3135 			current_nce->nce_fastpath = NULL;
3136 			if (current_nce == first_nce)
3137 				ill->ill_fastpath_list = first_nce = next_nce;
3138 			else
3139 				prev_nce->nce_fastpath = next_nce;
3140 		} else {
3141 			/* previous element that is still in the list */
3142 			prev_nce = current_nce;
3143 		}
3144 		current_nce = next_nce;
3145 	}
3146 	mutex_exit(&ill->ill_lock);
3147 }
3148 
3149 /*
3150  * Add nce to the nce fastpath list.
3151  */
3152 void
3153 nce_fastpath_list_add(nce_t *nce)
3154 {
3155 	ill_t *ill;
3156 
3157 	ill = nce->nce_ill;
3158 
3159 	mutex_enter(&ill->ill_lock);
3160 	mutex_enter(&nce->nce_lock);
3161 
3162 	/*
3163 	 * if nce has not been deleted and
3164 	 * is not already in the list add it.
3165 	 */
3166 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3167 	    (nce->nce_fastpath == NULL)) {
3168 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3169 		ill->ill_fastpath_list = nce;
3170 	}
3171 
3172 	mutex_exit(&nce->nce_lock);
3173 	mutex_exit(&ill->ill_lock);
3174 }
3175 
3176 /*
3177  * remove nce from the nce fastpath list.
3178  */
3179 void
3180 nce_fastpath_list_delete(nce_t *nce)
3181 {
3182 	nce_t *nce_ptr;
3183 
3184 	ill_t *ill;
3185 
3186 	ill = nce->nce_ill;
3187 	ASSERT(ill != NULL);
3188 
3189 	mutex_enter(&ill->ill_lock);
3190 	if (nce->nce_fastpath == NULL)
3191 		goto done;
3192 
3193 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3194 
3195 	if (ill->ill_fastpath_list == nce) {
3196 		ill->ill_fastpath_list = nce->nce_fastpath;
3197 	} else {
3198 		nce_ptr = ill->ill_fastpath_list;
3199 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3200 			if (nce_ptr->nce_fastpath == nce) {
3201 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3202 				break;
3203 			}
3204 			nce_ptr = nce_ptr->nce_fastpath;
3205 		}
3206 	}
3207 
3208 	nce->nce_fastpath = NULL;
3209 done:
3210 	mutex_exit(&ill->ill_lock);
3211 }
3212 
3213 /*
3214  * Update all NCE's that are not in fastpath mode and
3215  * have an nce_fp_mp that matches mp. mp->b_cont contains
3216  * the fastpath header.
3217  *
3218  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3219  */
3220 boolean_t
3221 ndp_fastpath_update(nce_t *nce, void *arg)
3222 {
3223 	mblk_t 	*mp, *fp_mp;
3224 	uchar_t	*mp_rptr, *ud_mp_rptr;
3225 	mblk_t	*ud_mp = nce->nce_res_mp;
3226 	ptrdiff_t	cmplen;
3227 
3228 	if (nce->nce_flags & NCE_F_MAPPING)
3229 		return (B_TRUE);
3230 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3231 		return (B_TRUE);
3232 
3233 	ip2dbg(("ndp_fastpath_update: trying\n"));
3234 	mp = (mblk_t *)arg;
3235 	mp_rptr = mp->b_rptr;
3236 	cmplen = mp->b_wptr - mp_rptr;
3237 	ASSERT(cmplen >= 0);
3238 	ud_mp_rptr = ud_mp->b_rptr;
3239 	/*
3240 	 * The nce is locked here to prevent any other threads
3241 	 * from accessing and changing nce_res_mp when the IPv6 address
3242 	 * becomes resolved to an lla while we're in the middle
3243 	 * of looking at and comparing the hardware address (lla).
3244 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3245 	 * from examining nce_res_mp atthe same time.
3246 	 */
3247 	mutex_enter(&nce->nce_lock);
3248 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3249 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3250 		mutex_exit(&nce->nce_lock);
3251 		/*
3252 		 * Don't take the ire off the fastpath list yet,
3253 		 * since the response may come later.
3254 		 */
3255 		return (B_FALSE);
3256 	}
3257 	/* Matched - install mp as the fastpath mp */
3258 	ip1dbg(("ndp_fastpath_update: match\n"));
3259 	fp_mp = dupb(mp->b_cont);
3260 	if (fp_mp != NULL) {
3261 		nce->nce_fp_mp = fp_mp;
3262 	}
3263 	mutex_exit(&nce->nce_lock);
3264 	return (B_TRUE);
3265 }
3266 
3267 /*
3268  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3269  * driver.  Note that it assumes IP is exclusive...
3270  */
3271 /* ARGSUSED */
3272 void
3273 ndp_fastpath_flush(nce_t *nce, char *arg)
3274 {
3275 	if (nce->nce_flags & NCE_F_MAPPING)
3276 		return;
3277 	/* No fastpath info? */
3278 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3279 		return;
3280 
3281 	if (nce->nce_ipversion == IPV4_VERSION &&
3282 	    nce->nce_flags & NCE_F_BCAST) {
3283 		/*
3284 		 * IPv4 BROADCAST entries:
3285 		 * We can't delete the nce since it is difficult to
3286 		 * recreate these without going through the
3287 		 * ipif down/up dance.
3288 		 *
3289 		 * All access to nce->nce_fp_mp in the case of these
3290 		 * is protected by nce_lock.
3291 		 */
3292 		mutex_enter(&nce->nce_lock);
3293 		if (nce->nce_fp_mp != NULL) {
3294 			freeb(nce->nce_fp_mp);
3295 			nce->nce_fp_mp = NULL;
3296 			mutex_exit(&nce->nce_lock);
3297 			nce_fastpath(nce);
3298 		} else {
3299 			mutex_exit(&nce->nce_lock);
3300 		}
3301 	} else {
3302 		/* Just delete the NCE... */
3303 		ndp_delete(nce);
3304 	}
3305 }
3306 
3307 /*
3308  * Return a pointer to a given option in the packet.
3309  * Assumes that option part of the packet have already been validated.
3310  */
3311 nd_opt_hdr_t *
3312 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3313 {
3314 	while (optlen > 0) {
3315 		if (opt->nd_opt_type == opt_type)
3316 			return (opt);
3317 		optlen -= 8 * opt->nd_opt_len;
3318 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3319 	}
3320 	return (NULL);
3321 }
3322 
3323 /*
3324  * Verify all option lengths present are > 0, also check to see
3325  * if the option lengths and packet length are consistent.
3326  */
3327 boolean_t
3328 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3329 {
3330 	ASSERT(opt != NULL);
3331 	while (optlen > 0) {
3332 		if (opt->nd_opt_len == 0)
3333 			return (B_FALSE);
3334 		optlen -= 8 * opt->nd_opt_len;
3335 		if (optlen < 0)
3336 			return (B_FALSE);
3337 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3338 	}
3339 	return (B_TRUE);
3340 }
3341 
3342 /*
3343  * ndp_walk function.
3344  * Free a fraction of the NCE cache entries.
3345  * A fraction of zero means to not free any in that category.
3346  */
3347 void
3348 ndp_cache_reclaim(nce_t *nce, char *arg)
3349 {
3350 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3351 	uint_t	rand;
3352 
3353 	if (nce->nce_flags & NCE_F_PERMANENT)
3354 		return;
3355 
3356 	rand = (uint_t)lbolt +
3357 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3358 	if (ncr->ncr_host != 0 &&
3359 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3360 		ndp_delete(nce);
3361 		return;
3362 	}
3363 }
3364 
3365 /*
3366  * ndp_walk function.
3367  * Count the number of NCEs that can be deleted.
3368  * These would be hosts but not routers.
3369  */
3370 void
3371 ndp_cache_count(nce_t *nce, char *arg)
3372 {
3373 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3374 
3375 	if (nce->nce_flags & NCE_F_PERMANENT)
3376 		return;
3377 
3378 	ncc->ncc_total++;
3379 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3380 		ncc->ncc_host++;
3381 }
3382 
3383 #ifdef DEBUG
3384 void
3385 nce_trace_ref(nce_t *nce)
3386 {
3387 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3388 
3389 	if (nce->nce_trace_disable)
3390 		return;
3391 
3392 	if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
3393 		nce->nce_trace_disable = B_TRUE;
3394 		nce_trace_cleanup(nce);
3395 	}
3396 }
3397 
3398 void
3399 nce_untrace_ref(nce_t *nce)
3400 {
3401 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3402 
3403 	if (!nce->nce_trace_disable)
3404 		th_trace_unref(nce);
3405 }
3406 
3407 static void
3408 nce_trace_cleanup(const nce_t *nce)
3409 {
3410 	th_trace_cleanup(nce, nce->nce_trace_disable);
3411 }
3412 #endif
3413 
3414 /*
3415  * Called when address resolution fails due to a timeout.
3416  * Send an ICMP unreachable in response to all queued packets.
3417  */
3418 void
3419 arp_resolv_failed(nce_t *nce)
3420 {
3421 	mblk_t	*mp, *nxt_mp, *first_mp;
3422 	char	buf[INET6_ADDRSTRLEN];
3423 	zoneid_t zoneid = GLOBAL_ZONEID;
3424 	struct in_addr ipv4addr;
3425 	ip_stack_t *ipst = nce->nce_ill->ill_ipst;
3426 
3427 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3428 	ip3dbg(("arp_resolv_failed: dst %s\n",
3429 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3430 	mutex_enter(&nce->nce_lock);
3431 	mp = nce->nce_qd_mp;
3432 	nce->nce_qd_mp = NULL;
3433 	mutex_exit(&nce->nce_lock);
3434 
3435 	while (mp != NULL) {
3436 		nxt_mp = mp->b_next;
3437 		mp->b_next = NULL;
3438 		mp->b_prev = NULL;
3439 
3440 		first_mp = mp;
3441 		/*
3442 		 * Send icmp unreachable messages
3443 		 * to the hosts.
3444 		 */
3445 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
3446 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3447 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3448 		    ICMP_HOST_UNREACHABLE, zoneid, ipst);
3449 		mp = nxt_mp;
3450 	}
3451 }
3452 
3453 int
3454 ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3455     nce_t **newnce, nce_t *src_nce)
3456 {
3457 	int	err;
3458 	nce_t	*nce;
3459 	in6_addr_t addr6;
3460 	ip_stack_t *ipst = ill->ill_ipst;
3461 
3462 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3463 	nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3464 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3465 	nce = nce_lookup_addr(ill, &addr6, nce);
3466 	if (nce == NULL) {
3467 		err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
3468 	} else {
3469 		*newnce = nce;
3470 		err = EEXIST;
3471 	}
3472 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3473 	return (err);
3474 }
3475 
3476 /*
3477  * NDP Cache Entry creation routine for IPv4.
3478  * Mapped entries are handled in arp.
3479  * This routine must always be called with ndp4->ndp_g_lock held.
3480  * Prior to return, nce_refcnt is incremented.
3481  */
3482 static int
3483 ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
3484     nce_t **newnce, nce_t *src_nce)
3485 {
3486 	static	nce_t		nce_nil;
3487 	nce_t		*nce;
3488 	mblk_t		*mp;
3489 	mblk_t		*template = NULL;
3490 	nce_t		**ncep;
3491 	ip_stack_t	*ipst = ill->ill_ipst;
3492 	uint16_t	state = ND_INITIAL;
3493 	int		err;
3494 
3495 	ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
3496 	ASSERT(!ill->ill_isv6);
3497 	ASSERT((flags & NCE_F_MAPPING) == 0);
3498 
3499 	if (ill->ill_resolver_mp == NULL)
3500 		return (EINVAL);
3501 	/*
3502 	 * Allocate the mblk to hold the nce.
3503 	 */
3504 	mp = allocb(sizeof (nce_t), BPRI_MED);
3505 	if (mp == NULL)
3506 		return (ENOMEM);
3507 
3508 	nce = (nce_t *)mp->b_rptr;
3509 	mp->b_wptr = (uchar_t *)&nce[1];
3510 	*nce = nce_nil;
3511 	nce->nce_ill = ill;
3512 	nce->nce_ipversion = IPV4_VERSION;
3513 	nce->nce_flags = flags;
3514 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3515 	nce->nce_rcnt = ill->ill_xmit_count;
3516 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3517 	nce->nce_mask = ipv6_all_ones;
3518 	nce->nce_extract_mask = ipv6_all_zeros;
3519 	nce->nce_ll_extract_start = 0;
3520 	nce->nce_qd_mp = NULL;
3521 	nce->nce_mp = mp;
3522 	/* This one is for nce getting created */
3523 	nce->nce_refcnt = 1;
3524 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3525 	ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
3526 
3527 	nce->nce_trace_disable = B_FALSE;
3528 
3529 	if (src_nce != NULL) {
3530 		/*
3531 		 * src_nce has been provided by the caller. The only
3532 		 * caller who provides a non-null, non-broadcast
3533 		 * src_nce is from ip_newroute() which must pass in
3534 		 * a ND_REACHABLE src_nce (this condition is verified
3535 		 * via an ASSERT for the save_ire->ire_nce in ip_newroute())
3536 		 */
3537 		mutex_enter(&src_nce->nce_lock);
3538 		state = src_nce->nce_state;
3539 		if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
3540 		    (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
3541 			/*
3542 			 * src_nce has been deleted, or
3543 			 * ip_arp_news is in the middle of
3544 			 * flushing entries in the the nce.
3545 			 * Fail the add, since we don't know
3546 			 * if it is safe to copy the contents of
3547 			 * src_nce
3548 			 */
3549 			DTRACE_PROBE2(nce__bad__src__nce,
3550 			    nce_t *, src_nce, ill_t *, ill);
3551 			mutex_exit(&src_nce->nce_lock);
3552 			err = EINVAL;
3553 			goto err_ret;
3554 		}
3555 		template = copyb(src_nce->nce_res_mp);
3556 		mutex_exit(&src_nce->nce_lock);
3557 		if (template == NULL) {
3558 			err = ENOMEM;
3559 			goto err_ret;
3560 		}
3561 	} else if (flags & NCE_F_BCAST) {
3562 		/*
3563 		 * broadcast nce.
3564 		 */
3565 		template = copyb(ill->ill_bcast_mp);
3566 		if (template == NULL) {
3567 			err = ENOMEM;
3568 			goto err_ret;
3569 		}
3570 		state = ND_REACHABLE;
3571 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
3572 		/*
3573 		 * NORESOLVER entries are always created in the REACHABLE
3574 		 * state. We create a nce_res_mp with the IP nexthop address
3575 		 * in the destination address in the DLPI hdr if the
3576 		 * physical length is exactly 4 bytes.
3577 		 *
3578 		 * XXX not clear which drivers set ill_phys_addr_length to
3579 		 * IP_ADDR_LEN.
3580 		 */
3581 		if (ill->ill_phys_addr_length == IP_ADDR_LEN) {
3582 			template = ill_dlur_gen((uchar_t *)addr,
3583 			    ill->ill_phys_addr_length,
3584 			    ill->ill_sap, ill->ill_sap_length);
3585 		} else {
3586 			template = copyb(ill->ill_resolver_mp);
3587 		}
3588 		if (template == NULL) {
3589 			err = ENOMEM;
3590 			goto err_ret;
3591 		}
3592 		state = ND_REACHABLE;
3593 	}
3594 	nce->nce_fp_mp = NULL;
3595 	nce->nce_res_mp = template;
3596 	nce->nce_state = state;
3597 	if (state == ND_REACHABLE) {
3598 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3599 		nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3600 	} else {
3601 		nce->nce_last = 0;
3602 		if (state == ND_INITIAL)
3603 			nce->nce_init_time = TICK_TO_MSEC(lbolt64);
3604 	}
3605 
3606 	ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
3607 	    (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
3608 	/*
3609 	 * Atomically ensure that the ill is not CONDEMNED, before
3610 	 * adding the NCE.
3611 	 */
3612 	mutex_enter(&ill->ill_lock);
3613 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3614 		mutex_exit(&ill->ill_lock);
3615 		err = EINVAL;
3616 		goto err_ret;
3617 	}
3618 	if ((nce->nce_next = *ncep) != NULL)
3619 		nce->nce_next->nce_ptpn = &nce->nce_next;
3620 	*ncep = nce;
3621 	nce->nce_ptpn = ncep;
3622 	*newnce = nce;
3623 	/* This one is for nce being used by an active thread */
3624 	NCE_REFHOLD(*newnce);
3625 
3626 	/* Bump up the number of nce's referencing this ill */
3627 	ill->ill_nce_cnt++;
3628 	mutex_exit(&ill->ill_lock);
3629 	DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
3630 	return (0);
3631 err_ret:
3632 	freeb(mp);
3633 	freemsg(template);
3634 	return (err);
3635 }
3636 
3637 void
3638 ndp_flush_qd_mp(nce_t *nce)
3639 {
3640 	mblk_t *qd_mp, *qd_next;
3641 
3642 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3643 	qd_mp = nce->nce_qd_mp;
3644 	nce->nce_qd_mp = NULL;
3645 	while (qd_mp != NULL) {
3646 		qd_next = qd_mp->b_next;
3647 		qd_mp->b_next = NULL;
3648 		qd_mp->b_prev = NULL;
3649 		freemsg(qd_mp);
3650 		qd_mp = qd_next;
3651 	}
3652 }
3653 
3654 
3655 /*
3656  * ndp_walk routine to delete all entries that have a given destination or
3657  * gateway address and cached link layer (MAC) address.  This is used when ARP
3658  * informs us that a network-to-link-layer mapping may have changed.
3659  */
3660 void
3661 nce_delete_hw_changed(nce_t *nce, void *arg)
3662 {
3663 	nce_hw_map_t *hwm = arg;
3664 	mblk_t *mp;
3665 	dl_unitdata_req_t *dlu;
3666 	uchar_t *macaddr;
3667 	ill_t *ill;
3668 	int saplen;
3669 	ipaddr_t nce_addr;
3670 
3671 	if (nce->nce_state != ND_REACHABLE)
3672 		return;
3673 
3674 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3675 	if (nce_addr != hwm->hwm_addr)
3676 		return;
3677 
3678 	mutex_enter(&nce->nce_lock);
3679 	if ((mp = nce->nce_res_mp) == NULL) {
3680 		mutex_exit(&nce->nce_lock);
3681 		return;
3682 	}
3683 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3684 	macaddr = (uchar_t *)(dlu + 1);
3685 	ill = nce->nce_ill;
3686 	if ((saplen = ill->ill_sap_length) > 0)
3687 		macaddr += saplen;
3688 	else
3689 		saplen = -saplen;
3690 
3691 	/*
3692 	 * If the hardware address is unchanged, then leave this one alone.
3693 	 * Note that saplen == abs(saplen) now.
3694 	 */
3695 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3696 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3697 		mutex_exit(&nce->nce_lock);
3698 		return;
3699 	}
3700 	mutex_exit(&nce->nce_lock);
3701 
3702 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3703 	ndp_delete(nce);
3704 }
3705 
3706 /*
3707  * This function verifies whether a given IPv4 address is potentially known to
3708  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3709  * so that it can continue to look for hardware changes on that address.
3710  */
3711 boolean_t
3712 ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
3713 {
3714 	nce_t		*nce;
3715 	struct in_addr	nceaddr;
3716 	ip_stack_t	*ipst = ns->netstack_ip;
3717 
3718 	if (addr == INADDR_ANY)
3719 		return (B_FALSE);
3720 
3721 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3722 	nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
3723 	for (; nce != NULL; nce = nce->nce_next) {
3724 		/* Note that only v4 mapped entries are in the table. */
3725 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3726 		if (addr == nceaddr.s_addr &&
3727 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3728 			/* Single flag check; no lock needed */
3729 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3730 				break;
3731 		}
3732 	}
3733 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3734 	return (nce != NULL);
3735 }
3736