xref: /titanic_52/usr/src/uts/common/inet/ip/ip_ndp.c (revision bb25c06cca41ca78e5fb87fbb8e81d55beb18c95)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_impl.h>
60 #include <inet/ip_if.h>
61 #include <inet/ip_ire.h>
62 #include <inet/ip_rts.h>
63 #include <inet/ip6.h>
64 #include <inet/ip_ndp.h>
65 #include <inet/ipsec_impl.h>
66 #include <inet/ipsec_info.h>
67 #include <inet/sctp_ip.h>
68 
69 /*
70  * Function names with nce_ prefix are static while function
71  * names with ndp_ prefix are used by rest of the IP.
72  *
73  * Lock ordering:
74  *
75  *	ndp_g_lock -> ill_lock -> nce_lock
76  *
77  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
78  * nce_next.  Nce_lock protects the contents of the NCE (particularly
79  * nce_refcnt).
80  */
81 
82 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
83     uint32_t ll_addr_len);
84 static	void	nce_fastpath(nce_t *nce);
85 static	void	nce_ire_delete(nce_t *nce);
86 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
87 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
88 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
89 static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
90 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
91     uchar_t *addr);
92 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
93 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
94 static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
95 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
96 static	void	nce_update(nce_t *nce, uint16_t new_state,
97     uchar_t *new_ll_addr);
98 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
99 static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
100     ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
101     const in6_addr_t *target, int flag);
102 extern void	th_trace_rrecord(th_trace_t *);
103 static	int	ndp_lookup_then_add_v6(ill_t *, uchar_t *,
104     const in6_addr_t *, const in6_addr_t *, const in6_addr_t *,
105     uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
106 static	int	ndp_lookup_then_add_v4(ill_t *, uchar_t *,
107     const in_addr_t *, const in_addr_t *, const in_addr_t *,
108     uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
109 static	int	ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
110     const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
111     nce_t **);
112 static	int	ndp_add_v4(ill_t *, uchar_t *, const in_addr_t *,
113     const in_addr_t *, const in_addr_t *, uint32_t, uint16_t, uint16_t,
114     nce_t **, mblk_t *, mblk_t *);
115 
116 
117 #ifdef NCE_DEBUG
118 void	nce_trace_inactive(nce_t *);
119 #endif
120 
/*
 * Global neighbor cache state, one instance per IP version: ndp4 is used
 * for IPv4 entries and ndp6 for IPv6 entries.  Each instance carries its
 * own ndp_g_lock, hash table (nce_hash_tbl) and walker bookkeeping.
 */
ndp_g_t ndp4, ndp6;

/*
 * Yields the address of the ndp4 hash bucket head that `addr' hashes to.
 */
#define	NCE_HASH_PTR_V4(addr) \
	(&(ndp4.nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Yields the address of the ndp6 hash bucket head that `addr' hashes to.
 */
#define	NCE_HASH_PTR_V6(addr) \
	(&(ndp6.nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
128 
129 /*
130  * Compute default flags to use for an advertisement of this nce's address.
131  */
132 static int
133 nce_advert_flags(const nce_t *nce)
134 {
135 	int flag = 0;
136 
137 	if (nce->nce_flags & NCE_F_ISROUTER)
138 		flag |= NDP_ISROUTER;
139 	if (!(nce->nce_flags & NCE_F_PROXY))
140 		flag |= NDP_ORIDE;
141 	return (flag);
142 }
143 
144 int
145 ndp_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
146     const void *mask, const void *extract_mask,
147     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
148     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
149 {
150 	int status;
151 
152 	if (ill->ill_isv6)
153 		status = ndp_add_v6(ill, hw_addr, (in6_addr_t *)addr,
154 		    (in6_addr_t *)mask, (in6_addr_t *)extract_mask,
155 		    hw_extract_start, flags, state, newnce);
156 	else
157 		status = ndp_add_v4(ill, hw_addr, (in_addr_t *)addr,
158 		    (in_addr_t *)mask, (in_addr_t *)extract_mask,
159 		    hw_extract_start, flags, state, newnce, fp_mp, res_mp);
160 	return (status);
161 }
162 
/*
 * Non-tunable probe interval, based on link capabilities: 150 when the
 * ill has ill_note_link set, 1500 otherwise.  The value is handed to
 * NDP_RESTART_TIMER (presumably in milliseconds -- confirm against the
 * macro's definition).
 */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
165 
166 /*
167  * NDP Cache Entry creation routine.
168  * Mapped entries will never do NUD .
169  * This routine must always be called with ndp6.ndp_g_lock held.
170  * Prior to return, nce_refcnt is incremented.
171  */
172 static int
173 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
174     const in6_addr_t *mask, const in6_addr_t *extract_mask,
175     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
176     nce_t **newnce)
177 {
178 	static	nce_t		nce_nil;
179 	nce_t		*nce;
180 	mblk_t		*mp;
181 	mblk_t		*template;
182 	nce_t		**ncep;
183 	int		err;
184 	boolean_t	dropped = B_FALSE;
185 
186 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
187 	ASSERT(ill != NULL && ill->ill_isv6);
188 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
189 		ip0dbg(("ndp_add: no addr\n"));
190 		return (EINVAL);
191 	}
192 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
193 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
194 		return (EINVAL);
195 	}
196 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
197 	    (flags & NCE_F_MAPPING)) {
198 		ip0dbg(("ndp_add: extract mask zero for mapping"));
199 		return (EINVAL);
200 	}
201 	/*
202 	 * Allocate the mblk to hold the nce.
203 	 *
204 	 * XXX This can come out of a separate cache - nce_cache.
205 	 * We don't need the mp anymore as there are no more
206 	 * "qwriter"s
207 	 */
208 	mp = allocb(sizeof (nce_t), BPRI_MED);
209 	if (mp == NULL)
210 		return (ENOMEM);
211 
212 	nce = (nce_t *)mp->b_rptr;
213 	mp->b_wptr = (uchar_t *)&nce[1];
214 	*nce = nce_nil;
215 
216 	/*
217 	 * This one holds link layer address
218 	 */
219 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
220 		template = nce_udreq_alloc(ill);
221 	} else {
222 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
223 		ASSERT((ill->ill_resolver_mp != NULL));
224 		template = copyb(ill->ill_resolver_mp);
225 	}
226 	if (template == NULL) {
227 		freeb(mp);
228 		return (ENOMEM);
229 	}
230 	nce->nce_ill = ill;
231 	nce->nce_ipversion = IPV6_VERSION;
232 	nce->nce_flags = flags;
233 	nce->nce_state = state;
234 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
235 	nce->nce_rcnt = ill->ill_xmit_count;
236 	nce->nce_addr = *addr;
237 	nce->nce_mask = *mask;
238 	nce->nce_extract_mask = *extract_mask;
239 	nce->nce_ll_extract_start = hw_extract_start;
240 	nce->nce_fp_mp = NULL;
241 	nce->nce_res_mp = template;
242 	if (state == ND_REACHABLE)
243 		nce->nce_last = TICK_TO_MSEC(lbolt64);
244 	else
245 		nce->nce_last = 0;
246 	nce->nce_qd_mp = NULL;
247 	nce->nce_mp = mp;
248 	if (hw_addr != NULL)
249 		nce_set_ll(nce, hw_addr);
250 	/* This one is for nce getting created */
251 	nce->nce_refcnt = 1;
252 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
253 	if (nce->nce_flags & NCE_F_MAPPING) {
254 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
255 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
256 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
257 		ncep = &ndp6.nce_mask_entries;
258 	} else {
259 		ncep = ((nce_t **)NCE_HASH_PTR_V6(*addr));
260 	}
261 
262 #ifdef NCE_DEBUG
263 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
264 #endif
265 	/*
266 	 * Atomically ensure that the ill is not CONDEMNED, before
267 	 * adding the NCE.
268 	 */
269 	mutex_enter(&ill->ill_lock);
270 	if (ill->ill_state_flags & ILL_CONDEMNED) {
271 		mutex_exit(&ill->ill_lock);
272 		freeb(mp);
273 		freeb(template);
274 		return (EINVAL);
275 	}
276 	if ((nce->nce_next = *ncep) != NULL)
277 		nce->nce_next->nce_ptpn = &nce->nce_next;
278 	*ncep = nce;
279 	nce->nce_ptpn = ncep;
280 	*newnce = nce;
281 	/* This one is for nce being used by an active thread */
282 	NCE_REFHOLD(*newnce);
283 
284 	/* Bump up the number of nce's referencing this ill */
285 	ill->ill_nce_cnt++;
286 	mutex_exit(&ill->ill_lock);
287 
288 	err = 0;
289 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
290 		mutex_enter(&nce->nce_lock);
291 		mutex_exit(&ndp6.ndp_g_lock);
292 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
293 		mutex_exit(&nce->nce_lock);
294 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
295 		    &ipv6_all_zeros, addr, NDP_PROBE);
296 		if (dropped) {
297 			mutex_enter(&nce->nce_lock);
298 			nce->nce_pcnt++;
299 			mutex_exit(&nce->nce_lock);
300 		}
301 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
302 		mutex_enter(&ndp6.ndp_g_lock);
303 		err = EINPROGRESS;
304 	} else if (flags & NCE_F_UNSOL_ADV) {
305 		/*
306 		 * We account for the transmit below by assigning one
307 		 * less than the ndd variable. Subsequent decrements
308 		 * are done in ndp_timer.
309 		 */
310 		mutex_enter(&nce->nce_lock);
311 		mutex_exit(&ndp6.ndp_g_lock);
312 		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
313 		mutex_exit(&nce->nce_lock);
314 		dropped = nce_xmit(ill,
315 		    ND_NEIGHBOR_ADVERT,
316 		    ill,	/* ill to be used for extracting ill_nd_lla */
317 		    B_TRUE,	/* use ill_nd_lla */
318 		    addr,	/* Source and target of the advertisement pkt */
319 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
320 		    nce_advert_flags(nce));
321 		mutex_enter(&nce->nce_lock);
322 		if (dropped)
323 			nce->nce_unsolicit_count++;
324 		if (nce->nce_unsolicit_count != 0) {
325 			nce->nce_timeout_id = timeout(ndp_timer, nce,
326 			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
327 		}
328 		mutex_exit(&nce->nce_lock);
329 		mutex_enter(&ndp6.ndp_g_lock);
330 	}
331 	/*
332 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
333 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
334 	 * We call nce_fastpath from nce_update if the link layer address of
335 	 * the peer changes from nce_update
336 	 */
337 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
338 		nce_fastpath(nce);
339 	return (err);
340 }
341 
342 int
343 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
344     const void *mask, const void *extract_mask,
345     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
346     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
347 {
348 	int status;
349 
350 	if (ill->ill_isv6) {
351 		status = ndp_lookup_then_add_v6(ill, hw_addr,
352 		    (in6_addr_t *)addr, (in6_addr_t *)mask,
353 		    (in6_addr_t *)extract_mask, hw_extract_start, flags,
354 		    state, newnce, fp_mp, res_mp);
355 	} else  {
356 		status = ndp_lookup_then_add_v4(ill, hw_addr,
357 		    (in_addr_t *)addr, (in_addr_t *)mask,
358 		    (in_addr_t *)extract_mask, hw_extract_start, flags,
359 		    state, newnce, fp_mp, res_mp);
360 	}
361 
362 	return (status);
363 }
364 
365 static int
366 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
367     const in6_addr_t *mask, const in6_addr_t *extract_mask,
368     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
369     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
370 {
371 	int	err = 0;
372 	nce_t	*nce;
373 
374 	ASSERT(ill != NULL && ill->ill_isv6);
375 	mutex_enter(&ndp6.ndp_g_lock);
376 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
377 	nce = nce_lookup_addr(ill, addr, nce);
378 	if (nce == NULL) {
379 		err = ndp_add(ill,
380 		    hw_addr,
381 		    addr,
382 		    mask,
383 		    extract_mask,
384 		    hw_extract_start,
385 		    flags,
386 		    state,
387 		    newnce,
388 		    fp_mp,
389 		    res_mp);
390 	} else {
391 		*newnce = nce;
392 		err = EEXIST;
393 	}
394 	mutex_exit(&ndp6.ndp_g_lock);
395 	return (err);
396 }
397 
398 /*
399  * Remove all the CONDEMNED nces from the appropriate hash table.
400  * We create a private list of NCEs, these may have ires pointing
401  * to them, so the list will be passed through to clean up dependent
402  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
403  */
404 static void
405 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
406 {
407 	nce_t *nce1;
408 	nce_t **ptpn;
409 
410 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
411 	ASSERT(ndp->ndp_g_walker == 0);
412 	for (; nce; nce = nce1) {
413 		nce1 = nce->nce_next;
414 		mutex_enter(&nce->nce_lock);
415 		if (nce->nce_flags & NCE_F_CONDEMNED) {
416 			ptpn = nce->nce_ptpn;
417 			nce1 = nce->nce_next;
418 			if (nce1 != NULL)
419 				nce1->nce_ptpn = ptpn;
420 			*ptpn = nce1;
421 			nce->nce_ptpn = NULL;
422 			nce->nce_next = NULL;
423 			nce->nce_next = *free_nce_list;
424 			*free_nce_list = nce;
425 		}
426 		mutex_exit(&nce->nce_lock);
427 	}
428 }
429 
430 /*
431  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
432  *    will return this NCE. Also no new IREs will be created that
433  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
434  *    be started (See NDP_RESTART_TIMER).
435  * 2. Cancel any currently running timeouts.
436  * 3. If there is an ndp walker, return. The walker will do the cleanup.
437  *    This ensures that walkers see a consistent list of NCEs while walking.
438  * 4. Otherwise remove the NCE from the list of NCEs
439  * 5. Delete all IREs pointing to this NCE.
440  */
441 void
442 ndp_delete(nce_t *nce)
443 {
444 	nce_t	**ptpn;
445 	nce_t	*nce1;
446 	int	ipversion = nce->nce_ipversion;
447 	ndp_g_t *ndp = (ipversion == IPV4_VERSION ? &ndp4 : &ndp6);
448 
449 	/* Serialize deletes */
450 	mutex_enter(&nce->nce_lock);
451 	if (nce->nce_flags & NCE_F_CONDEMNED) {
452 		/* Some other thread is doing the delete */
453 		mutex_exit(&nce->nce_lock);
454 		return;
455 	}
456 	/*
457 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
458 	 * refcnt has to be >= 2
459 	 */
460 	ASSERT(nce->nce_refcnt >= 2);
461 	nce->nce_flags |= NCE_F_CONDEMNED;
462 	mutex_exit(&nce->nce_lock);
463 
464 	nce_fastpath_list_delete(nce);
465 
466 	/*
467 	 * Cancel any running timer. Timeout can't be restarted
468 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
469 	 * Passing invalid timeout id is fine.
470 	 */
471 	if (nce->nce_timeout_id != 0) {
472 		(void) untimeout(nce->nce_timeout_id);
473 		nce->nce_timeout_id = 0;
474 	}
475 
476 	mutex_enter(&ndp->ndp_g_lock);
477 	if (nce->nce_ptpn == NULL) {
478 		/*
479 		 * The last ndp walker has already removed this nce from
480 		 * the list after we marked the nce CONDEMNED and before
481 		 * we grabbed the global lock.
482 		 */
483 		mutex_exit(&ndp->ndp_g_lock);
484 		return;
485 	}
486 	if (ndp->ndp_g_walker > 0) {
487 		/*
488 		 * Can't unlink. The walker will clean up
489 		 */
490 		ndp->ndp_g_walker_cleanup = B_TRUE;
491 		mutex_exit(&ndp->ndp_g_lock);
492 		return;
493 	}
494 
495 	/*
496 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
497 	 * the timer since it is marked CONDEMNED.
498 	 */
499 	ptpn = nce->nce_ptpn;
500 	nce1 = nce->nce_next;
501 	if (nce1 != NULL)
502 		nce1->nce_ptpn = ptpn;
503 	*ptpn = nce1;
504 	nce->nce_ptpn = NULL;
505 	nce->nce_next = NULL;
506 	mutex_exit(&ndp->ndp_g_lock);
507 
508 	nce_ire_delete(nce);
509 }
510 
511 void
512 ndp_inactive(nce_t *nce)
513 {
514 	mblk_t		**mpp;
515 	ill_t		*ill;
516 
517 	ASSERT(nce->nce_refcnt == 0);
518 	ASSERT(MUTEX_HELD(&nce->nce_lock));
519 	ASSERT(nce->nce_fastpath == NULL);
520 
521 	/* Free all nce allocated messages */
522 	mpp = &nce->nce_first_mp_to_free;
523 	do {
524 		while (*mpp != NULL) {
525 			mblk_t  *mp;
526 
527 			mp = *mpp;
528 			*mpp = mp->b_next;
529 
530 			inet_freemsg(mp);
531 		}
532 	} while (mpp++ != &nce->nce_last_mp_to_free);
533 
534 #ifdef NCE_DEBUG
535 	nce_trace_inactive(nce);
536 #endif
537 
538 	ill = nce->nce_ill;
539 	mutex_enter(&ill->ill_lock);
540 	ill->ill_nce_cnt--;
541 	/*
542 	 * If the number of nce's associated with this ill have dropped
543 	 * to zero, check whether we need to restart any operation that
544 	 * is waiting for this to happen.
545 	 */
546 	if (ill->ill_nce_cnt == 0) {
547 		/* ipif_ill_refrele_tail drops the ill_lock */
548 		ipif_ill_refrele_tail(ill);
549 	} else {
550 		mutex_exit(&ill->ill_lock);
551 	}
552 	mutex_destroy(&nce->nce_lock);
553 	if (nce->nce_mp != NULL)
554 		inet_freemsg(nce->nce_mp);
555 }
556 
557 /*
558  * ndp_walk routine.  Delete the nce if it is associated with the ill
559  * that is going away.  Always called as a writer.
560  */
561 void
562 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
563 {
564 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
565 		ndp_delete(nce);
566 	}
567 }
568 
569 /*
570  * Walk a list of to be inactive NCEs and blow away all the ires.
571  */
572 static void
573 nce_ire_delete_list(nce_t *nce)
574 {
575 	nce_t *nce_next;
576 
577 	ASSERT(nce != NULL);
578 	while (nce != NULL) {
579 		nce_next = nce->nce_next;
580 		nce->nce_next = NULL;
581 
582 		/*
583 		 * It is possible for the last ndp walker (this thread)
584 		 * to come here after ndp_delete has marked the nce CONDEMNED
585 		 * and before it has removed the nce from the fastpath list
586 		 * or called untimeout. So we need to do it here. It is safe
587 		 * for both ndp_delete and this thread to do it twice or
588 		 * even simultaneously since each of the threads has a
589 		 * reference on the nce.
590 		 */
591 		nce_fastpath_list_delete(nce);
592 		/*
593 		 * Cancel any running timer. Timeout can't be restarted
594 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
595 		 * Passing invalid timeout id is fine.
596 		 */
597 		if (nce->nce_timeout_id != 0) {
598 			(void) untimeout(nce->nce_timeout_id);
599 			nce->nce_timeout_id = 0;
600 		}
601 		/*
602 		 * We might hit this func thus in the v4 case:
603 		 * ipif_down->ipif_ndp_down->ndp_walk
604 		 */
605 
606 		if (nce->nce_ipversion == IPV4_VERSION) {
607 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
608 			    IRE_CACHE, nce_ire_delete1,
609 			    (char *)nce, nce->nce_ill);
610 		} else {
611 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
612 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
613 			    IRE_CACHE, nce_ire_delete1,
614 			    (char *)nce, nce->nce_ill);
615 		}
616 		NCE_REFRELE_NOTR(nce);
617 		nce = nce_next;
618 	}
619 }
620 
621 /*
622  * Delete an ire when the nce goes away.
623  */
624 /* ARGSUSED */
625 static void
626 nce_ire_delete(nce_t *nce)
627 {
628 	if (nce->nce_ipversion == IPV6_VERSION) {
629 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
630 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
631 		NCE_REFRELE_NOTR(nce);
632 	} else {
633 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
634 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
635 		NCE_REFRELE_NOTR(nce);
636 	}
637 }
638 
639 /*
640  * ire_walk routine used to delete every IRE that shares this nce
641  */
642 static void
643 nce_ire_delete1(ire_t *ire, char *nce_arg)
644 {
645 	nce_t	*nce = (nce_t *)nce_arg;
646 
647 	ASSERT(ire->ire_type == IRE_CACHE);
648 
649 	if (ire->ire_nce == nce) {
650 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
651 		ire_delete(ire);
652 	}
653 }
654 
655 /*
656  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
657  */
658 boolean_t
659 ndp_restart_dad(nce_t *nce)
660 {
661 	boolean_t started;
662 	boolean_t dropped;
663 
664 	if (nce == NULL)
665 		return (B_FALSE);
666 	mutex_enter(&nce->nce_lock);
667 	if (nce->nce_state == ND_PROBE) {
668 		mutex_exit(&nce->nce_lock);
669 		started = B_TRUE;
670 	} else if (nce->nce_state == ND_REACHABLE) {
671 		nce->nce_state = ND_PROBE;
672 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
673 		mutex_exit(&nce->nce_lock);
674 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
675 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
676 		if (dropped) {
677 			mutex_enter(&nce->nce_lock);
678 			nce->nce_pcnt++;
679 			mutex_exit(&nce->nce_lock);
680 		}
681 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
682 		started = B_TRUE;
683 	} else {
684 		mutex_exit(&nce->nce_lock);
685 		started = B_FALSE;
686 	}
687 	return (started);
688 }
689 
690 /*
691  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
692  * If one is found, the refcnt on the nce will be incremented.
693  */
694 nce_t *
695 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
696 {
697 	nce_t	*nce;
698 
699 	ASSERT(ill != NULL && ill->ill_isv6);
700 	if (!caller_holds_lock) {
701 		mutex_enter(&ndp6.ndp_g_lock);
702 	}
703 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
704 	nce = nce_lookup_addr(ill, addr, nce);
705 	if (nce == NULL)
706 		nce = nce_lookup_mapping(ill, addr);
707 	if (!caller_holds_lock)
708 		mutex_exit(&ndp6.ndp_g_lock);
709 	return (nce);
710 }
711 /*
712  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
713  * If one is found, the refcnt on the nce will be incremented.
714  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
715  * so we skip the nce_lookup_mapping call.
716  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
717  */
718 nce_t *
719 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
720 {
721 	nce_t	*nce;
722 	in6_addr_t addr6;
723 
724 	if (!caller_holds_lock) {
725 		mutex_enter(&ndp4.ndp_g_lock);
726 	}
727 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr)); /* head of v6 hash table */
728 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
729 	nce = nce_lookup_addr(ill, &addr6, nce);
730 	if (!caller_holds_lock)
731 		mutex_exit(&ndp4.ndp_g_lock);
732 	return (nce);
733 }
734 
735 /*
736  * Cache entry lookup.  Try to find an nce matching the parameters passed.
737  * Look only for exact entries (no mappings).  If an nce is found, increment
738  * the hold count on that nce. The caller passes in the start of the
739  * appropriate hash table, and must be holding the appropriate global
740  * lock (ndp_g_lock).
741  */
742 static nce_t *
743 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
744 {
745 	ndp_g_t *ndp = (ill->ill_isv6 ? &ndp6 : &ndp4);
746 
747 	ASSERT(ill != NULL);
748 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
749 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
750 		return (NULL);
751 	for (; nce != NULL; nce = nce->nce_next) {
752 		if (nce->nce_ill == ill) {
753 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
754 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
755 			    &ipv6_all_ones)) {
756 				mutex_enter(&nce->nce_lock);
757 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
758 					NCE_REFHOLD_LOCKED(nce);
759 					mutex_exit(&nce->nce_lock);
760 					break;
761 				}
762 				mutex_exit(&nce->nce_lock);
763 			}
764 		}
765 	}
766 	return (nce);
767 }
768 
769 /*
770  * Cache entry lookup.  Try to find an nce matching the parameters passed.
771  * Look only for mappings.
772  */
773 static nce_t *
774 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
775 {
776 	nce_t	*nce;
777 
778 	ASSERT(ill != NULL && ill->ill_isv6);
779 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
780 	if (!IN6_IS_ADDR_MULTICAST(addr))
781 		return (NULL);
782 	nce = ndp6.nce_mask_entries;
783 	for (; nce != NULL; nce = nce->nce_next)
784 		if (nce->nce_ill == ill &&
785 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
786 			mutex_enter(&nce->nce_lock);
787 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
788 				NCE_REFHOLD_LOCKED(nce);
789 				mutex_exit(&nce->nce_lock);
790 				break;
791 			}
792 			mutex_exit(&nce->nce_lock);
793 		}
794 	return (nce);
795 }
796 
797 /*
798  * Process passed in parameters either from an incoming packet or via
799  * user ioctl.
800  */
801 void
802 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
803 {
804 	ill_t	*ill = nce->nce_ill;
805 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
806 	mblk_t	*mp;
807 	boolean_t ll_updated = B_FALSE;
808 	boolean_t ll_changed;
809 
810 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
811 	/*
812 	 * No updates of link layer address or the neighbor state is
813 	 * allowed, when the cache is in NONUD state.  This still
814 	 * allows for responding to reachability solicitation.
815 	 */
816 	mutex_enter(&nce->nce_lock);
817 	if (nce->nce_state == ND_INCOMPLETE) {
818 		if (hw_addr == NULL) {
819 			mutex_exit(&nce->nce_lock);
820 			return;
821 		}
822 		nce_set_ll(nce, hw_addr);
823 		/*
824 		 * Update nce state and send the queued packets
825 		 * back to ip this time ire will be added.
826 		 */
827 		if (flag & ND_NA_FLAG_SOLICITED) {
828 			nce_update(nce, ND_REACHABLE, NULL);
829 		} else {
830 			nce_update(nce, ND_STALE, NULL);
831 		}
832 		mutex_exit(&nce->nce_lock);
833 		nce_fastpath(nce);
834 		mutex_enter(&nce->nce_lock);
835 		mp = nce->nce_qd_mp;
836 		nce->nce_qd_mp = NULL;
837 		mutex_exit(&nce->nce_lock);
838 		while (mp != NULL) {
839 			mblk_t *nxt_mp, *data_mp;
840 
841 			nxt_mp = mp->b_next;
842 			mp->b_next = NULL;
843 
844 			if (mp->b_datap->db_type == M_CTL)
845 				data_mp = mp->b_cont;
846 			else
847 				data_mp = mp;
848 			if (data_mp->b_prev != NULL) {
849 				ill_t   *inbound_ill;
850 				queue_t *fwdq = NULL;
851 				uint_t ifindex;
852 
853 				ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
854 				inbound_ill = ill_lookup_on_ifindex(ifindex,
855 				    B_TRUE, NULL, NULL, NULL, NULL);
856 				if (inbound_ill == NULL) {
857 					data_mp->b_prev = NULL;
858 					freemsg(mp);
859 					return;
860 				} else {
861 					fwdq = inbound_ill->ill_rq;
862 				}
863 				data_mp->b_prev = NULL;
864 				/*
865 				 * Send a forwarded packet back into ip_rput_v6
866 				 * just as in ire_send_v6().
867 				 * Extract the queue from b_prev (set in
868 				 * ip_rput_data_v6).
869 				 */
870 				if (fwdq != NULL) {
871 					/*
872 					 * Forwarded packets hop count will
873 					 * get decremented in ip_rput_data_v6
874 					 */
875 					if (data_mp != mp)
876 						freeb(mp);
877 					put(fwdq, data_mp);
878 				} else {
879 					/*
880 					 * Send locally originated packets back
881 					 * into * ip_wput_v6.
882 					 */
883 					put(ill->ill_wq, mp);
884 				}
885 				ill_refrele(inbound_ill);
886 			} else {
887 				put(ill->ill_wq, mp);
888 			}
889 			mp = nxt_mp;
890 		}
891 		return;
892 	}
893 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
894 	if (!is_adv) {
895 		/* If this is a SOLICITATION request only */
896 		if (ll_changed)
897 			nce_update(nce, ND_STALE, hw_addr);
898 		mutex_exit(&nce->nce_lock);
899 		return;
900 	}
901 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
902 		/* If in any other state than REACHABLE, ignore */
903 		if (nce->nce_state == ND_REACHABLE) {
904 			nce_update(nce, ND_STALE, NULL);
905 		}
906 		mutex_exit(&nce->nce_lock);
907 		return;
908 	} else {
909 		if (ll_changed) {
910 			nce_update(nce, ND_UNCHANGED, hw_addr);
911 			ll_updated = B_TRUE;
912 		}
913 		if (flag & ND_NA_FLAG_SOLICITED) {
914 			nce_update(nce, ND_REACHABLE, NULL);
915 		} else {
916 			if (ll_updated) {
917 				nce_update(nce, ND_STALE, NULL);
918 			}
919 		}
920 		mutex_exit(&nce->nce_lock);
921 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
922 		    NCE_F_ISROUTER)) {
923 			ire_t *ire;
924 
925 			/*
926 			 * Router turned to host.  We need to remove the
927 			 * entry as well as any default route that may be
928 			 * using this as a next hop.  This is required by
929 			 * section 7.2.5 of RFC 2461.
930 			 */
931 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
932 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
933 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
934 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
935 			    MATCH_IRE_DEFAULT);
936 			if (ire != NULL) {
937 				ip_rts_rtmsg(RTM_DELETE, ire, 0);
938 				ire_delete(ire);
939 				ire_refrele(ire);
940 			}
941 			ndp_delete(nce);
942 		}
943 	}
944 }
945 
946 /*
947  * Pass arg1 to the pfi supplied, along with each nce in existence.
948  * ndp_walk() places a REFHOLD on the nce and drops the lock when
949  * walking the hash list.
950  */
951 void
952 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
953     boolean_t trace)
954 {
955 
956 	nce_t	*nce;
957 	nce_t	*nce1;
958 	nce_t	**ncep;
959 	nce_t	*free_nce_list = NULL;
960 
961 	mutex_enter(&ndp->ndp_g_lock);
962 	/* Prevent ndp_delete from unlink and free of NCE */
963 	ndp->ndp_g_walker++;
964 	mutex_exit(&ndp->ndp_g_lock);
965 	for (ncep = ndp->nce_hash_tbl;
966 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
967 		for (nce = *ncep; nce != NULL; nce = nce1) {
968 			nce1 = nce->nce_next;
969 			if (ill == NULL || nce->nce_ill == ill) {
970 				if (trace) {
971 					NCE_REFHOLD(nce);
972 					(*pfi)(nce, arg1);
973 					NCE_REFRELE(nce);
974 				} else {
975 					NCE_REFHOLD_NOTR(nce);
976 					(*pfi)(nce, arg1);
977 					NCE_REFRELE_NOTR(nce);
978 				}
979 			}
980 		}
981 	}
982 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
983 		nce1 = nce->nce_next;
984 		if (ill == NULL || nce->nce_ill == ill) {
985 			if (trace) {
986 				NCE_REFHOLD(nce);
987 				(*pfi)(nce, arg1);
988 				NCE_REFRELE(nce);
989 			} else {
990 				NCE_REFHOLD_NOTR(nce);
991 				(*pfi)(nce, arg1);
992 				NCE_REFRELE_NOTR(nce);
993 			}
994 		}
995 	}
996 	mutex_enter(&ndp->ndp_g_lock);
997 	ndp->ndp_g_walker--;
998 	/*
999 	 * While NCE's are removed from global list they are placed
1000 	 * in a private list, to be passed to nce_ire_delete_list().
1001 	 * The reason is, there may be ires pointing to this nce
1002 	 * which needs to cleaned up.
1003 	 */
1004 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
1005 		/* Time to delete condemned entries */
1006 		for (ncep = ndp->nce_hash_tbl;
1007 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
1008 			nce = *ncep;
1009 			if (nce != NULL) {
1010 				nce_remove(ndp, nce, &free_nce_list);
1011 			}
1012 		}
1013 		nce = ndp->nce_mask_entries;
1014 		if (nce != NULL) {
1015 			nce_remove(ndp, nce, &free_nce_list);
1016 		}
1017 		ndp->ndp_g_walker_cleanup = B_FALSE;
1018 	}
1019 	mutex_exit(&ndp->ndp_g_lock);
1020 
1021 	if (free_nce_list != NULL) {
1022 		nce_ire_delete_list(free_nce_list);
1023 	}
1024 }
1025 
1026 void
1027 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1)
1028 {
1029 	ndp_walk_common(&ndp4, ill, pfi, arg1, B_TRUE);
1030 	ndp_walk_common(&ndp6, ill, pfi, arg1, B_TRUE);
1031 }
1032 
1033 /*
1034  * Process resolve requests.  Handles both mapped entries
1035  * as well as cases that needs to be send out on the wire.
1036  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1037  * or one is created, we defer making ire point to nce until the
1038  * ire is actually added at which point the nce_refcnt on the nce is
1039  * incremented.  This is done primarily to have symmetry between ire_add()
1040  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1041  */
1042 int
1043 ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
1044 {
1045 	nce_t		*nce;
1046 	int		err = 0;
1047 	uint32_t	ms;
1048 	mblk_t		*mp_nce = NULL;
1049 
1050 	ASSERT(ill != NULL);
1051 	ASSERT(ill->ill_isv6);
1052 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1053 		err = nce_set_multicast(ill, dst);
1054 		return (err);
1055 	}
1056 	err = ndp_lookup_then_add(ill,
1057 	    NULL,	/* No hardware address */
1058 	    dst,
1059 	    &ipv6_all_ones,
1060 	    &ipv6_all_zeros,
1061 	    0,
1062 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1063 	    ND_INCOMPLETE,
1064 	    &nce,
1065 	    NULL, /* let ndp_add figure out fastpath mp and dlureq_mp for v6 */
1066 	    NULL);
1067 
1068 	switch (err) {
1069 	case 0:
1070 		/*
1071 		 * New cache entry was created. Make sure that the state
1072 		 * is not ND_INCOMPLETE. It can be in some other state
1073 		 * even before we send out the solicitation as we could
1074 		 * get un-solicited advertisements.
1075 		 *
1076 		 * If this is an XRESOLV interface, simply return 0,
1077 		 * since we don't want to solicit just yet.
1078 		 */
1079 		if (ill->ill_flags & ILLF_XRESOLV) {
1080 			NCE_REFRELE(nce);
1081 			return (0);
1082 		}
1083 		rw_enter(&ill_g_lock, RW_READER);
1084 		mutex_enter(&nce->nce_lock);
1085 		if (nce->nce_state != ND_INCOMPLETE) {
1086 			mutex_exit(&nce->nce_lock);
1087 			rw_exit(&ill_g_lock);
1088 			NCE_REFRELE(nce);
1089 			return (0);
1090 		}
1091 		mp_nce = ip_prepend_zoneid(mp, zoneid);
1092 		if (mp_nce == NULL) {
1093 			/* The caller will free mp */
1094 			mutex_exit(&nce->nce_lock);
1095 			rw_exit(&ill_g_lock);
1096 			ndp_delete(nce);
1097 			NCE_REFRELE(nce);
1098 			return (ENOMEM);
1099 		}
1100 		ms = nce_solicit(nce, mp_nce);
1101 		rw_exit(&ill_g_lock);
1102 		if (ms == 0) {
1103 			/* The caller will free mp */
1104 			if (mp_nce != mp)
1105 				freeb(mp_nce);
1106 			mutex_exit(&nce->nce_lock);
1107 			ndp_delete(nce);
1108 			NCE_REFRELE(nce);
1109 			return (EBUSY);
1110 		}
1111 		mutex_exit(&nce->nce_lock);
1112 		NDP_RESTART_TIMER(nce, (clock_t)ms);
1113 		NCE_REFRELE(nce);
1114 		return (EINPROGRESS);
1115 	case EEXIST:
1116 		/* Resolution in progress just queue the packet */
1117 		mutex_enter(&nce->nce_lock);
1118 		if (nce->nce_state == ND_INCOMPLETE) {
1119 			mp_nce = ip_prepend_zoneid(mp, zoneid);
1120 			if (mp_nce == NULL) {
1121 				err = ENOMEM;
1122 			} else {
1123 				nce_queue_mp(nce, mp_nce);
1124 				err = EINPROGRESS;
1125 			}
1126 		} else {
1127 			/*
1128 			 * Any other state implies we have
1129 			 * a nce but IRE needs to be added ...
1130 			 * ire_add_v6() will take care of the
1131 			 * the case when the nce becomes CONDEMNED
1132 			 * before the ire is added to the table.
1133 			 */
1134 			err = 0;
1135 		}
1136 		mutex_exit(&nce->nce_lock);
1137 		NCE_REFRELE(nce);
1138 		break;
1139 	default:
1140 		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
1141 		break;
1142 	}
1143 	return (err);
1144 }
1145 
1146 /*
1147  * When there is no resolver, the link layer template is passed in
1148  * the IRE.
1149  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1150  * or one is created, we defer making ire point to nce until the
1151  * ire is actually added at which point the nce_refcnt on the nce is
1152  * incremented.  This is done primarily to have symmetry between ire_add()
1153  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1154  */
1155 int
1156 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1157 {
1158 	nce_t		*nce;
1159 	int		err = 0;
1160 
1161 	ASSERT(ill != NULL);
1162 	ASSERT(ill->ill_isv6);
1163 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1164 		err = nce_set_multicast(ill, dst);
1165 		return (err);
1166 	}
1167 
1168 	err = ndp_lookup_then_add(ill,
1169 	    NULL,	/* hardware address */
1170 	    dst,
1171 	    &ipv6_all_ones,
1172 	    &ipv6_all_zeros,
1173 	    0,
1174 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1175 	    ND_REACHABLE,
1176 	    &nce,
1177 	    NULL, /* let ndp_add figure out fp_mp/dlureq_mp for v6 */
1178 	    NULL);
1179 
1180 	switch (err) {
1181 	case 0:
1182 		/*
1183 		 * Cache entry with a proper resolver cookie was
1184 		 * created.
1185 		 */
1186 		NCE_REFRELE(nce);
1187 		break;
1188 	case EEXIST:
1189 		err = 0;
1190 		NCE_REFRELE(nce);
1191 		break;
1192 	default:
1193 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1194 		break;
1195 	}
1196 	return (err);
1197 }
1198 
1199 /*
1200  * For each interface an entry is added for the unspecified multicast group.
1201  * Here that mapping is used to form the multicast cache entry for a particular
1202  * multicast destination.
1203  */
1204 static int
1205 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1206 {
1207 	nce_t		*mnce;	/* Multicast mapping entry */
1208 	nce_t		*nce;
1209 	uchar_t		*hw_addr = NULL;
1210 	int		err = 0;
1211 
1212 	ASSERT(ill != NULL);
1213 	ASSERT(ill->ill_isv6);
1214 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1215 
1216 	mutex_enter(&ndp6.ndp_g_lock);
1217 	nce = *((nce_t **)NCE_HASH_PTR_V6(*dst));
1218 	nce = nce_lookup_addr(ill, dst, nce);
1219 	if (nce != NULL) {
1220 		mutex_exit(&ndp6.ndp_g_lock);
1221 		NCE_REFRELE(nce);
1222 		return (0);
1223 	}
1224 	/* No entry, now lookup for a mapping this should never fail */
1225 	mnce = nce_lookup_mapping(ill, dst);
1226 	if (mnce == NULL) {
1227 		/* Something broken for the interface. */
1228 		mutex_exit(&ndp6.ndp_g_lock);
1229 		return (ESRCH);
1230 	}
1231 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1232 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1233 		/*
1234 		 * For IRE_IF_RESOLVER a hardware mapping can be
1235 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1236 		 * in the ill is copied in ndp_add().
1237 		 */
1238 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1239 		if (hw_addr == NULL) {
1240 			mutex_exit(&ndp6.ndp_g_lock);
1241 			NCE_REFRELE(mnce);
1242 			return (ENOMEM);
1243 		}
1244 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1245 	}
1246 	NCE_REFRELE(mnce);
1247 	/*
1248 	 * IRE_IF_NORESOLVER type simply copies the resolution
1249 	 * cookie passed in.  So no hw_addr is needed.
1250 	 */
1251 	err = ndp_add(ill,
1252 	    hw_addr,
1253 	    dst,
1254 	    &ipv6_all_ones,
1255 	    &ipv6_all_zeros,
1256 	    0,
1257 	    NCE_F_NONUD,
1258 	    ND_REACHABLE,
1259 	    &nce,
1260 	    NULL,
1261 	    NULL);
1262 	mutex_exit(&ndp6.ndp_g_lock);
1263 	if (hw_addr != NULL)
1264 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1265 	if (err != 0) {
1266 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1267 		return (err);
1268 	}
1269 	NCE_REFRELE(nce);
1270 	return (0);
1271 }
1272 
1273 /*
1274  * Return the link layer address, and any flags of a nce.
1275  */
1276 int
1277 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1278 {
1279 	nce_t		*nce;
1280 	in6_addr_t	*addr;
1281 	sin6_t		*sin6;
1282 	dl_unitdata_req_t	*dl;
1283 
1284 	ASSERT(ill != NULL && ill->ill_isv6);
1285 	sin6 = (sin6_t *)&lnr->lnr_addr;
1286 	addr =  &sin6->sin6_addr;
1287 
1288 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1289 	if (nce == NULL)
1290 		return (ESRCH);
1291 	/* If in INCOMPLETE state, no link layer address is available yet */
1292 	if (nce->nce_state == ND_INCOMPLETE)
1293 		goto done;
1294 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1295 	if (ill->ill_flags & ILLF_XRESOLV)
1296 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1297 	else
1298 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1299 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1300 	    sizeof (lnr->lnr_hdw_addr));
1301 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1302 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1303 	if (nce->nce_flags & NCE_F_ISROUTER)
1304 		lnr->lnr_flags = NDF_ISROUTER_ON;
1305 	if (nce->nce_flags & NCE_F_PROXY)
1306 		lnr->lnr_flags |= NDF_PROXY_ON;
1307 	if (nce->nce_flags & NCE_F_ANYCAST)
1308 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1309 done:
1310 	NCE_REFRELE(nce);
1311 	return (0);
1312 }
1313 
1314 /*
1315  * Send Enable/Disable multicast reqs to driver.
1316  */
1317 int
1318 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1319     uint32_t hw_addr_offset, mblk_t *mp)
1320 {
1321 	nce_t		*nce;
1322 	uchar_t		*hw_addr;
1323 
1324 	ASSERT(ill != NULL && ill->ill_isv6);
1325 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1326 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1327 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1328 		freemsg(mp);
1329 		return (EINVAL);
1330 	}
1331 	mutex_enter(&ndp6.ndp_g_lock);
1332 	nce = nce_lookup_mapping(ill, addr);
1333 	if (nce == NULL) {
1334 		mutex_exit(&ndp6.ndp_g_lock);
1335 		freemsg(mp);
1336 		return (ESRCH);
1337 	}
1338 	mutex_exit(&ndp6.ndp_g_lock);
1339 	/*
1340 	 * Update dl_addr_length and dl_addr_offset for primitives that
1341 	 * have physical addresses as opposed to full saps
1342 	 */
1343 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1344 	case DL_ENABMULTI_REQ:
1345 		/* Track the state if this is the first enabmulti */
1346 		if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
1347 			ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
1348 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1349 		break;
1350 	case DL_DISABMULTI_REQ:
1351 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1352 		break;
1353 	default:
1354 		NCE_REFRELE(nce);
1355 		ip1dbg(("ndp_mcastreq: default\n"));
1356 		return (EINVAL);
1357 	}
1358 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1359 	NCE_REFRELE(nce);
1360 	putnext(ill->ill_wq, mp);
1361 	return (0);
1362 }
1363 
1364 /*
1365  * Send a neighbor solicitation.
1366  * Returns number of milliseconds after which we should either rexmit or abort.
1367  * Return of zero means we should abort.
1368  * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
1369  *
1370  * NOTE: This routine drops nce_lock (and later reacquires it) when sending
1371  * the packet.
1372  * NOTE: This routine does not consume mp.
1373  */
1374 uint32_t
1375 nce_solicit(nce_t *nce, mblk_t *mp)
1376 {
1377 	ill_t		*ill;
1378 	ill_t		*src_ill;
1379 	ip6_t		*ip6h;
1380 	in6_addr_t	src;
1381 	in6_addr_t	dst;
1382 	ipif_t		*ipif;
1383 	ip6i_t		*ip6i;
1384 	boolean_t	dropped = B_FALSE;
1385 
1386 	ASSERT(RW_READ_HELD(&ill_g_lock));
1387 	ASSERT(MUTEX_HELD(&nce->nce_lock));
1388 	ill = nce->nce_ill;
1389 	ASSERT(ill != NULL);
1390 
1391 	if (nce->nce_rcnt == 0) {
1392 		return (0);
1393 	}
1394 
1395 	if (mp == NULL) {
1396 		ASSERT(nce->nce_qd_mp != NULL);
1397 		mp = nce->nce_qd_mp;
1398 	} else {
1399 		nce_queue_mp(nce, mp);
1400 	}
1401 
1402 	/* Handle ip_newroute_v6 giving us IPSEC packets */
1403 	if (mp->b_datap->db_type == M_CTL)
1404 		mp = mp->b_cont;
1405 
1406 	ip6h = (ip6_t *)mp->b_rptr;
1407 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
1408 		/*
1409 		 * This message should have been pulled up already in
1410 		 * ip_wput_v6. We can't do pullups here because the message
1411 		 * could be from the nce_qd_mp which could have b_next/b_prev
1412 		 * non-NULL.
1413 		 */
1414 		ip6i = (ip6i_t *)ip6h;
1415 		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
1416 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
1417 		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
1418 	}
1419 	src = ip6h->ip6_src;
1420 	/*
1421 	 * If the src of outgoing packet is one of the assigned interface
1422 	 * addresses use it, otherwise we will pick the source address below.
1423 	 */
1424 	src_ill = ill;
1425 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1426 		if (ill->ill_group != NULL)
1427 			src_ill = ill->ill_group->illgrp_ill;
1428 		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
1429 			for (ipif = src_ill->ill_ipif; ipif != NULL;
1430 			    ipif = ipif->ipif_next) {
1431 				if (IN6_ARE_ADDR_EQUAL(&src,
1432 				    &ipif->ipif_v6lcl_addr)) {
1433 					break;
1434 				}
1435 			}
1436 			if (ipif != NULL)
1437 				break;
1438 		}
1439 		/*
1440 		 * If no relevant ipif can be found, then it's not one of our
1441 		 * addresses.  Reset to :: and let nce_xmit.  If an ipif can be
1442 		 * found, but it's not yet done with DAD verification, then
1443 		 * just postpone this transmission until later.
1444 		 */
1445 		if (src_ill == NULL)
1446 			src = ipv6_all_zeros;
1447 		else if (!ipif->ipif_addr_ready)
1448 			return (ill->ill_reachable_retrans_time);
1449 	}
1450 	dst = nce->nce_addr;
1451 	/*
1452 	 * If source address is unspecified, nce_xmit will choose
1453 	 * one for us and initialize the hardware address also
1454 	 * appropriately.
1455 	 */
1456 	if (IN6_IS_ADDR_UNSPECIFIED(&src))
1457 		src_ill = NULL;
1458 	nce->nce_rcnt--;
1459 	mutex_exit(&nce->nce_lock);
1460 	rw_exit(&ill_g_lock);
1461 	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
1462 	    &dst, 0);
1463 	rw_enter(&ill_g_lock, RW_READER);
1464 	mutex_enter(&nce->nce_lock);
1465 	if (dropped)
1466 		nce->nce_rcnt++;
1467 	return (ill->ill_reachable_retrans_time);
1468 }
1469 
1470 /*
1471  * Attempt to recover an address on an interface that's been marked as a
1472  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1473  * no easy way to just probe the address and have the right thing happen if
1474  * it's no longer in use.  Instead, we just bring it up normally and allow the
1475  * regular interface start-up logic to probe for a remaining duplicate and take
1476  * us back down if necessary.
1477  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1478  * ip_ndp_excl.
1479  */
1480 /* ARGSUSED */
1481 static void
1482 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1483 {
1484 	ill_t	*ill = rq->q_ptr;
1485 	ipif_t	*ipif;
1486 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1487 
1488 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1489 		/*
1490 		 * We do not support recovery of proxy ARP'd interfaces,
1491 		 * because the system lacks a complete proxy ARP mechanism.
1492 		 */
1493 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1494 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1495 			continue;
1496 		}
1497 
1498 		/*
1499 		 * If we have already recovered, then ignore.
1500 		 */
1501 		mutex_enter(&ill->ill_lock);
1502 		if (!(ipif->ipif_flags & IPIF_DUPLICATE)) {
1503 			mutex_exit(&ill->ill_lock);
1504 			continue;
1505 		}
1506 
1507 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1508 		ill->ill_ipif_dup_count--;
1509 		mutex_exit(&ill->ill_lock);
1510 		ipif->ipif_was_dup = B_TRUE;
1511 
1512 		if (ipif_ndp_up(ipif, addr, B_FALSE) != EINPROGRESS)
1513 			(void) ipif_up_done_v6(ipif);
1514 	}
1515 	freeb(mp);
1516 }
1517 
1518 /*
1519  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1520  * As long as someone else holds the address, the interface will stay down.
1521  * When that conflict goes away, the interface is brought back up.  This is
1522  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1523  * server will recover from a failure.
1524  *
1525  * For DHCP and temporary addresses, recovery is not done in the kernel.
1526  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1527  *
1528  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1529  */
1530 static void
1531 ipif6_dup_recovery(void *arg)
1532 {
1533 	ipif_t *ipif = arg;
1534 
1535 	ipif->ipif_recovery_id = 0;
1536 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1537 		return;
1538 
1539 	/* If the link is down, we'll retry this later */
1540 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1541 		return;
1542 
1543 	ndp_do_recovery(ipif);
1544 }
1545 
1546 /*
1547  * Perform interface recovery by forcing the duplicate interfaces up and
1548  * allowing the system to determine which ones should stay up.
1549  *
1550  * Called both by recovery timer expiry and link-up notification.
1551  */
1552 void
1553 ndp_do_recovery(ipif_t *ipif)
1554 {
1555 	ill_t *ill = ipif->ipif_ill;
1556 	mblk_t *mp;
1557 
1558 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1559 	if (mp == NULL) {
1560 		ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1561 		    ipif, MSEC_TO_TICK(ip_dup_recovery));
1562 	} else {
1563 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1564 		    sizeof (ipif->ipif_v6lcl_addr));
1565 		ill_refhold(ill);
1566 		(void) qwriter_ip(NULL, ill, ill->ill_rq, mp, ip_ndp_recover,
1567 		    CUR_OP, B_FALSE);
1568 	}
1569 }
1570 
1571 /*
1572  * Find the solicitation in the given message, and extract printable details
1573  * (MAC and IP addresses) from it.
1574  */
1575 static nd_neighbor_solicit_t *
1576 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1577     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1578 {
1579 	nd_neighbor_solicit_t *ns;
1580 	ip6_t *ip6h;
1581 	uchar_t *addr;
1582 	int alen;
1583 
1584 	alen = 0;
1585 	ip6h = (ip6_t *)mp->b_rptr;
1586 	if (dl_mp == NULL) {
1587 		nd_opt_hdr_t *opt;
1588 		int nslen;
1589 
1590 		/*
1591 		 * If it's from the fast-path, then it can't be a probe
1592 		 * message, and thus must include the source linkaddr option.
1593 		 * Extract that here.
1594 		 */
1595 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1596 		nslen = mp->b_wptr - (uchar_t *)ns;
1597 		if ((nslen -= sizeof (*ns)) > 0) {
1598 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1599 			    ND_OPT_SOURCE_LINKADDR);
1600 			if (opt != NULL &&
1601 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1602 			    ill->ill_nd_lla_len) {
1603 				addr = (uchar_t *)(opt + 1);
1604 				alen = ill->ill_nd_lla_len;
1605 			}
1606 		}
1607 		/*
1608 		 * We cheat a bit here for the sake of printing usable log
1609 		 * messages in the rare case where the reply we got was unicast
1610 		 * without a source linkaddr option, and the interface is in
1611 		 * fastpath mode.  (Sigh.)
1612 		 */
1613 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1614 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1615 			struct ether_header *pether;
1616 
1617 			pether = (struct ether_header *)((char *)ip6h -
1618 			    sizeof (*pether));
1619 			addr = pether->ether_shost.ether_addr_octet;
1620 			alen = ETHERADDRL;
1621 		}
1622 	} else {
1623 		dl_unitdata_ind_t *dlu;
1624 
1625 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1626 		alen = dlu->dl_src_addr_length;
1627 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1628 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1629 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1630 			if (ill->ill_sap_length < 0) {
1631 				alen += ill->ill_sap_length;
1632 			} else {
1633 				addr += ill->ill_sap_length;
1634 				alen -= ill->ill_sap_length;
1635 			}
1636 		}
1637 	}
1638 	if (alen > 0) {
1639 		*haddr = addr;
1640 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1641 	} else {
1642 		*haddr = NULL;
1643 		(void) strcpy(hbuf, "?");
1644 	}
1645 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1646 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1647 	return (ns);
1648 }
1649 
1650 /*
1651  * This is for exclusive changes due to NDP duplicate address detection
1652  * failure.
1653  */
1654 /* ARGSUSED */
1655 static void
1656 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1657 {
1658 	ill_t	*ill = rq->q_ptr;
1659 	ipif_t	*ipif;
1660 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1661 	char hbuf[MAC_STR_LEN];
1662 	char sbuf[INET6_ADDRSTRLEN];
1663 	nd_neighbor_solicit_t *ns;
1664 	mblk_t *dl_mp = NULL;
1665 	uchar_t *haddr;
1666 
1667 	if (DB_TYPE(mp) != M_DATA) {
1668 		dl_mp = mp;
1669 		mp = mp->b_cont;
1670 	}
1671 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1672 	    sizeof (sbuf), &haddr);
1673 	if (haddr != NULL &&
1674 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1675 		/*
1676 		 * Ignore conflicts generated by misbehaving switches that just
1677 		 * reflect our own messages back to us.
1678 		 */
1679 		goto ignore_conflict;
1680 	}
1681 	(void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf));
1682 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1683 
1684 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1685 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1686 		    &ns->nd_ns_target)) {
1687 			continue;
1688 		}
1689 
1690 		/* If it's already marked, then don't do anything. */
1691 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1692 			continue;
1693 
1694 		/*
1695 		 * If this is a failure during duplicate recovery, then don't
1696 		 * complain.  It may take a long time to recover.
1697 		 */
1698 		if (!ipif->ipif_was_dup) {
1699 			if (ipif->ipif_id != 0) {
1700 				(void) snprintf(ibuf + ill->ill_name_length - 1,
1701 				    sizeof (ibuf) - ill->ill_name_length + 1,
1702 				    ":%d", ipif->ipif_id);
1703 			}
1704 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1705 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1706 		}
1707 		mutex_enter(&ill->ill_lock);
1708 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1709 		ipif->ipif_flags |= IPIF_DUPLICATE;
1710 		ill->ill_ipif_dup_count++;
1711 		mutex_exit(&ill->ill_lock);
1712 		(void) ipif_down(ipif, NULL, NULL);
1713 		ipif_down_tail(ipif);
1714 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1715 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1716 		    ip_dup_recovery > 0)
1717 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1718 			    ipif, MSEC_TO_TICK(ip_dup_recovery));
1719 	}
1720 ignore_conflict:
1721 	if (dl_mp != NULL)
1722 		freeb(dl_mp);
1723 	freemsg(mp);
1724 }
1725 
1726 /*
1727  * Handle failure by tearing down the ipifs with the specified address.  Note
1728  * that tearing down the ipif also means deleting the nce through ipif_down, so
1729  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1730  * we start a timer on the ipif.
1731  */
1732 static void
1733 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1734 {
1735 	if ((mp = copymsg(mp)) != NULL) {
1736 		if (dl_mp == NULL)
1737 			dl_mp = mp;
1738 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1739 			dl_mp->b_cont = mp;
1740 		if (dl_mp == NULL) {
1741 			freemsg(mp);
1742 		} else {
1743 			ill_refhold(ill);
1744 			(void) qwriter_ip(NULL, ill, ill->ill_rq, dl_mp,
1745 			    ip_ndp_excl, CUR_OP, B_FALSE);
1746 		}
1747 	}
1748 	ndp_delete(nce);
1749 }
1750 
1751 /*
1752  * Handle a discovered conflict: some other system is advertising that it owns
1753  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1754  * interface.
1755  */
1756 static void
1757 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1758 {
1759 	ipif_t *ipif;
1760 	uint32_t now;
1761 	uint_t maxdefense;
1762 	uint_t defs;
1763 
1764 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1765 	    NULL, NULL);
1766 	if (ipif == NULL)
1767 		return;
1768 	/*
1769 	 * First, figure out if this address is disposable.
1770 	 */
1771 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1772 		maxdefense = ip_max_temp_defend;
1773 	else
1774 		maxdefense = ip_max_defend;
1775 
1776 	/*
1777 	 * Now figure out how many times we've defended ourselves.  Ignore
1778 	 * defenses that happened long in the past.
1779 	 */
1780 	now = gethrestime_sec();
1781 	mutex_enter(&nce->nce_lock);
1782 	if ((defs = nce->nce_defense_count) > 0 &&
1783 	    now - nce->nce_defense_time > ip_defend_interval) {
1784 		nce->nce_defense_count = defs = 0;
1785 	}
1786 	nce->nce_defense_count++;
1787 	nce->nce_defense_time = now;
1788 	mutex_exit(&nce->nce_lock);
1789 	ipif_refrele(ipif);
1790 
1791 	/*
1792 	 * If we've defended ourselves too many times already, then give up and
1793 	 * tear down the interface(s) using this address.  Otherwise, defend by
1794 	 * sending out an unsolicited Neighbor Advertisement.
1795 	 */
1796 	if (defs >= maxdefense) {
1797 		ip_ndp_failure(ill, mp, dl_mp, nce);
1798 	} else {
1799 		char hbuf[MAC_STR_LEN];
1800 		char sbuf[INET6_ADDRSTRLEN];
1801 		uchar_t *haddr;
1802 
1803 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1804 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1805 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1806 		    hbuf, sbuf, ill->ill_name);
1807 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1808 		    &nce->nce_addr, &ipv6_all_hosts_mcast,
1809 		    nce_advert_flags(nce));
1810 	}
1811 }
1812 
1813 static void
1814 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1815 {
1816 	nd_neighbor_solicit_t *ns;
1817 	uint32_t	hlen = ill->ill_nd_lla_len;
1818 	uchar_t		*haddr = NULL;
1819 	icmp6_t		*icmp_nd;
1820 	ip6_t		*ip6h;
1821 	nce_t		*our_nce = NULL;
1822 	in6_addr_t	target;
1823 	in6_addr_t	src;
1824 	int		len;
1825 	int		flag = 0;
1826 	nd_opt_hdr_t	*opt = NULL;
1827 	boolean_t	bad_solicit = B_FALSE;
1828 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1829 
1830 	ip6h = (ip6_t *)mp->b_rptr;
1831 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1832 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1833 	src = ip6h->ip6_src;
1834 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1835 	target = ns->nd_ns_target;
1836 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1837 		if (ip_debug > 2) {
1838 			/* ip1dbg */
1839 			pr_addr_dbg("ndp_input_solicit: Target is"
1840 			    " multicast! %s\n", AF_INET6, &target);
1841 		}
1842 		bad_solicit = B_TRUE;
1843 		goto done;
1844 	}
1845 	if (len > sizeof (nd_neighbor_solicit_t)) {
1846 		/* Options present */
1847 		opt = (nd_opt_hdr_t *)&ns[1];
1848 		len -= sizeof (nd_neighbor_solicit_t);
1849 		if (!ndp_verify_optlen(opt, len)) {
1850 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1851 			bad_solicit = B_TRUE;
1852 			goto done;
1853 		}
1854 	}
1855 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1856 		/* Check to see if this is a valid DAD solicitation */
1857 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1858 			if (ip_debug > 2) {
1859 				/* ip1dbg */
1860 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1861 				    "Destination is not solicited node "
1862 				    "multicast %s\n", AF_INET6,
1863 				    &ip6h->ip6_dst);
1864 			}
1865 			bad_solicit = B_TRUE;
1866 			goto done;
1867 		}
1868 	}
1869 
1870 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1871 	/*
1872 	 * If this is a valid Solicitation, a permanent
1873 	 * entry should exist in the cache
1874 	 */
1875 	if (our_nce == NULL ||
1876 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1877 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1878 		    "ifname=%s ", ill->ill_name));
1879 		if (ip_debug > 2) {
1880 			/* ip1dbg */
1881 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1882 		}
1883 		bad_solicit = B_TRUE;
1884 		goto done;
1885 	}
1886 
1887 	/* At this point we should have a verified NS per spec */
1888 	if (opt != NULL) {
1889 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1890 		if (opt != NULL) {
1891 			haddr = (uchar_t *)&opt[1];
1892 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1893 			    hlen == 0) {
1894 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1895 				bad_solicit = B_TRUE;
1896 				goto done;
1897 			}
1898 		}
1899 	}
1900 
1901 	/* If sending directly to peer, set the unicast flag */
1902 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1903 		flag |= NDP_UNICAST;
1904 
1905 	/*
1906 	 * Create/update the entry for the soliciting node.
1907 	 * or respond to outstanding queries, don't if
1908 	 * the source is unspecified address.
1909 	 */
1910 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1911 		int	err;
1912 		nce_t	*nnce;
1913 
1914 		ASSERT(ill->ill_isv6);
1915 		/*
1916 		 * Regular solicitations *must* include the Source Link-Layer
1917 		 * Address option.  Ignore messages that do not.
1918 		 */
1919 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1920 			ip1dbg(("ndp_input_solicit: source link-layer address "
1921 			    "option missing with a specified source.\n"));
1922 			bad_solicit = B_TRUE;
1923 			goto done;
1924 		}
1925 
1926 		/*
1927 		 * This is a regular solicitation.  If we're still in the
1928 		 * process of verifying the address, then don't respond at all
1929 		 * and don't keep track of the sender.
1930 		 */
1931 		if (our_nce->nce_state == ND_PROBE)
1932 			goto done;
1933 
1934 		/*
1935 		 * If the solicitation doesn't have sender hardware address
1936 		 * (legal for unicast solicitation), then process without
1937 		 * installing the return NCE.  Either we already know it, or
1938 		 * we'll be forced to look it up when (and if) we reply to the
1939 		 * packet.
1940 		 */
1941 		if (haddr == NULL)
1942 			goto no_source;
1943 
1944 		err = ndp_lookup_then_add(ill,
1945 		    haddr,
1946 		    &src,	/* Soliciting nodes address */
1947 		    &ipv6_all_ones,
1948 		    &ipv6_all_zeros,
1949 		    0,
1950 		    0,
1951 		    ND_STALE,
1952 		    &nnce,
1953 		    NULL,
1954 		    NULL);
1955 		switch (err) {
1956 		case 0:
1957 			/* done with this entry */
1958 			NCE_REFRELE(nnce);
1959 			break;
1960 		case EEXIST:
1961 			/*
1962 			 * B_FALSE indicates this is not an
1963 			 * an advertisement.
1964 			 */
1965 			ndp_process(nnce, haddr, 0, B_FALSE);
1966 			NCE_REFRELE(nnce);
1967 			break;
1968 		default:
1969 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1970 			    err));
1971 			goto done;
1972 		}
1973 no_source:
1974 		flag |= NDP_SOLICITED;
1975 	} else {
1976 		/*
1977 		 * No source link layer address option should be present in a
1978 		 * valid DAD request.
1979 		 */
1980 		if (haddr != NULL) {
1981 			ip1dbg(("ndp_input_solicit: source link-layer address "
1982 			    "option present with an unspecified source.\n"));
1983 			bad_solicit = B_TRUE;
1984 			goto done;
1985 		}
1986 		if (our_nce->nce_state == ND_PROBE) {
1987 			/*
1988 			 * Internally looped-back probes won't have DLPI
1989 			 * attached to them.  External ones (which are sent by
1990 			 * multicast) always will.  Just ignore our own
1991 			 * transmissions.
1992 			 */
1993 			if (dl_mp != NULL) {
1994 				/*
1995 				 * If someone else is probing our address, then
1996 				 * we've crossed wires.  Declare failure.
1997 				 */
1998 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
1999 			}
2000 			goto done;
2001 		}
2002 		/*
2003 		 * This is a DAD probe.  Multicast the advertisement to the
2004 		 * all-nodes address.
2005 		 */
2006 		src = ipv6_all_hosts_mcast;
2007 	}
2008 	flag |= nce_advert_flags(our_nce);
2009 	/* Response to a solicitation */
2010 	(void) nce_xmit(ill,
2011 	    ND_NEIGHBOR_ADVERT,
2012 	    ill,	/* ill to be used for extracting ill_nd_lla */
2013 	    B_TRUE,	/* use ill_nd_lla */
2014 	    &target,	/* Source and target of the advertisement pkt */
2015 	    &src,	/* IP Destination (source of original pkt) */
2016 	    flag);
2017 done:
2018 	if (bad_solicit)
2019 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2020 	if (our_nce != NULL)
2021 		NCE_REFRELE(our_nce);
2022 }
2023 
/*
 * Process a received Neighbor Advertisement (NA).
 *
 * ill   - interface the packet was received on.
 * mp    - message holding the IPv6 header immediately followed by the
 *         ICMPv6 neighbor advertisement.  It is not freed here.
 * dl_mp - link layer message that accompanied the packet; may be NULL
 *         (see the comments in the body).
 *
 * The NA is dropped, and ipv6IfIcmpInBadNeighborAdvertisements bumped,
 * when the IPv6 destination is multicast while the solicited flag is set,
 * when the target is multicast, or when the options fail verification.
 * Otherwise every ill in the receiving ill's group (or just the ill itself
 * when it has no group) is searched for a cache entry matching the target,
 * and a matching entry is handled as a failed probe, an address conflict,
 * or a regular update via ndp_process().
 */
2024 void
2025 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2026 {
2027 	nd_neighbor_advert_t *na;
2028 	uint32_t	hlen = ill->ill_nd_lla_len;
2029 	uchar_t		*haddr = NULL;
2030 	icmp6_t		*icmp_nd;
2031 	ip6_t		*ip6h;
2032 	nce_t		*dst_nce = NULL;
2033 	in6_addr_t	target;
2034 	nd_opt_hdr_t	*opt = NULL;
2035 	int		len;
2036 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2037 
2038 	ip6h = (ip6_t *)mp->b_rptr;
2039 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	/* Number of bytes that follow the IPv6 header in this mblk */
2040 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2041 	na = (nd_neighbor_advert_t *)icmp_nd;
	/* Note: the test is on the IPv6 destination, not the ND target */
2042 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2043 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2044 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2045 		    "solicited flag is not zero\n"));
2046 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2047 		return;
2048 	}
2049 	target = na->nd_na_target;
2050 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2051 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2052 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2053 		return;
2054 	}
	/* Anything beyond the fixed-size advertisement is option data */
2055 	if (len > sizeof (nd_neighbor_advert_t)) {
2056 		opt = (nd_opt_hdr_t *)&na[1];
2057 		if (!ndp_verify_optlen(opt,
2058 		    len - sizeof (nd_neighbor_advert_t))) {
2059 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2060 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2061 			return;
2062 		}
2063 		/* At this point we have a verified NA per spec */
2064 		len -= sizeof (nd_neighbor_advert_t);
2065 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2066 		if (opt != NULL) {
			/* The address bytes start right after the option header */
2067 			haddr = (uchar_t *)&opt[1];
			/*
			 * nd_opt_len is scaled by 8 and the option header
			 * size subtracted to get the room available for the
			 * address.  Reject the NA if that is less than this
			 * ill's link layer address length, or if the ill has
			 * a zero-length link layer address.
			 */
2068 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2069 			    hlen == 0) {
2070 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2071 				BUMP_MIB(mib,
2072 				    ipv6IfIcmpInBadNeighborAdvertisements);
2073 				return;
2074 			}
2075 		}
2076 	}
2077 
2078 	/*
2079 	 * If this interface is part of the group look at all the
2080 	 * ills in the group.
2081 	 */
2082 	rw_enter(&ill_g_lock, RW_READER);
2083 	if (ill->ill_group != NULL)
2084 		ill = ill->ill_group->illgrp_ill;
2085 
2086 	for (; ill != NULL; ill = ill->ill_group_next) {
2087 		mutex_enter(&ill->ill_lock);
2088 		if (!ILL_CAN_LOOKUP(ill)) {
2089 			mutex_exit(&ill->ill_lock);
2090 			continue;
2091 		}
		/*
		 * Take a reference on the ill before ill_g_lock is dropped
		 * below; it is released at the bottom of the loop once the
		 * lock has been retaken.
		 */
2092 		ill_refhold_locked(ill);
2093 		mutex_exit(&ill->ill_lock);
2094 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2095 		/* We have to drop the lock since ndp_process calls put* */
2096 		rw_exit(&ill_g_lock);
2097 		if (dst_nce != NULL) {
2098 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2099 			    dst_nce->nce_state == ND_PROBE) {
2100 				/*
2101 				 * Someone else sent an advertisement for an
2102 				 * address that we're trying to configure.
2103 				 * Tear it down.  Note that dl_mp might be NULL
2104 				 * if we're getting a unicast reply.  This
2105 				 * isn't typically done (multicast is the norm
2106 				 * in response to a probe), but ip_ndp_failure
2107 				 * will handle the dl_mp == NULL case as well.
2108 				 */
2109 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2110 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2111 				/*
2112 				 * Someone just announced one of our local
2113 				 * addresses.  If it wasn't us, then this is a
2114 				 * conflict.  Defend the address or shut it
2115 				 * down.
2116 				 */
				/*
				 * A conflict is only raised when dl_mp is
				 * present and either no target link layer
				 * address was supplied or the supplied one
				 * differs from the one stored in the nce
				 * (nce_cmp_ll_addr returns B_TRUE on a
				 * mismatch).
				 */
2117 				if (dl_mp != NULL &&
2118 				    (haddr == NULL ||
2119 				    nce_cmp_ll_addr(dst_nce, haddr,
2120 				    ill->ill_nd_lla_len))) {
2121 					ip_ndp_conflict(ill, mp, dl_mp,
2122 					    dst_nce);
2123 				}
2124 			} else {
				/* Not one of our own addresses: a neighbor entry */
2125 				if (na->nd_na_flags_reserved &
2126 				    ND_NA_FLAG_ROUTER) {
2127 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2128 				}
2129 				/* B_TRUE indicates this an advertisement */
2130 				ndp_process(dst_nce, haddr,
2131 				    na->nd_na_flags_reserved, B_TRUE);
2132 			}
			/* Drop the reference held on the looked-up nce */
2133 			NCE_REFRELE(dst_nce);
2134 		}
		/* Retake ill_g_lock before following ill_group_next */
2135 		rw_enter(&ill_g_lock, RW_READER);
2136 		ill_refrele(ill);
2137 	}
2138 	rw_exit(&ill_g_lock);
2139 }
2140 
2141 /*
2142  * Process NDP neighbor solicitation/advertisement messages.
2143  * The checksum has already checked o.k before reaching here.
2144  */
2145 void
2146 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2147 {
2148 	icmp6_t		*icmp_nd;
2149 	ip6_t		*ip6h;
2150 	int		len;
2151 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2152 
2153 
2154 	if (!pullupmsg(mp, -1)) {
2155 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2156 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
2157 		goto done;
2158 	}
2159 	ip6h = (ip6_t *)mp->b_rptr;
2160 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2161 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2162 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2163 		goto done;
2164 	}
2165 	/*
2166 	 * NDP does not accept any extension headers between the
2167 	 * IP header and the ICMP header since e.g. a routing
2168 	 * header could be dangerous.
2169 	 * This assumes that any AH or ESP headers are removed
2170 	 * by ip prior to passing the packet to ndp_input.
2171 	 */
2172 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2173 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2174 		    ip6h->ip6_nxt));
2175 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2176 		goto done;
2177 	}
2178 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2179 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2180 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2181 	if (icmp_nd->icmp6_code != 0) {
2182 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2183 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2184 		goto done;
2185 	}
2186 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2187 	/*
2188 	 * Make sure packet length is large enough for either
2189 	 * a NS or a NA icmp packet.
2190 	 */
2191 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2192 		ip1dbg(("ndp_input: packet too short\n"));
2193 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2194 		goto done;
2195 	}
2196 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2197 		ndp_input_solicit(ill, mp, dl_mp);
2198 	} else {
2199 		ndp_input_advert(ill, mp, dl_mp);
2200 	}
2201 done:
2202 	freemsg(mp);
2203 }
2204 
2205 /*
2206  * nce_xmit is called to form and transmit a ND solicitation or
2207  * advertisement ICMP packet.
2208  *
2209  * If the source address is unspecified and this isn't a probe (used for
2210  * duplicate address detection), an appropriate source address and link layer
2211  * address will be chosen here.  The link layer address option is included if
2212  * the source is specified (i.e., all non-probe packets), and omitted (per the
2213  * specification) otherwise.
2214  *
2215  * It returns B_FALSE only if it does a successful put() to the
2216  * corresponding ill's ill_wq otherwise returns B_TRUE.
2217  */
/*
 * Arguments, as they are used below:
 *   ill        - ill the packet is sent on; its ill_wq receives the packet
 *                and its phyint ifindex is stored in the ip6i_t.
 *   operation  - ND_NEIGHBOR_SOLICIT or ND_NEIGHBOR_ADVERT; becomes the
 *                ICMPv6 type.
 *   hwaddr_ill - ill whose link layer address goes into the option.  May
 *                only be NULL when sender is the unspecified address.
 *   use_nd_lla - B_TRUE to take ill_nd_lla, B_FALSE to take ill_phys_addr.
 *   sender     - IPv6 source address; for an advertisement it is also the
 *                advertised target.
 *   target     - for a solicitation, the address being solicited; for an
 *                advertisement, the IPv6 destination.
 *   flag       - NDP_* bits; NDP_PROBE, NDP_UNICAST, NDP_ISROUTER,
 *                NDP_SOLICITED and NDP_ORIDE are examined here.
 */
2218 static boolean_t
2219 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2220     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2221     int flag)
2222 {
2223 	uint32_t	len;
2224 	icmp6_t 	*icmp6;
2225 	mblk_t		*mp;
2226 	ip6_t		*ip6h;
2227 	nd_opt_hdr_t	*opt;
2228 	uint_t		plen;
2229 	ip6i_t		*ip6i;
2230 	ipif_t		*src_ipif = NULL;
2231 	uint8_t		*hw_addr;
2232 
2233 	/*
2234 	 * If we have a unspecified source(sender) address, select a
2235 	 * proper source address for the solicitation here itself so
2236 	 * that we can initialize the h/w address correctly. This is
2237 	 * needed for interface groups as source address can come from
2238 	 * the whole group and the h/w address initialized from ill will
2239 	 * be wrong if the source address comes from a different ill.
2240 	 *
2241 	 * Note that the NA never comes here with the unspecified source
2242 	 * address. The following asserts that whenever the source
2243 	 * address is specified, the haddr also should be specified.
2244 	 */
2245 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2246 
2247 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2248 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2249 		/*
2250 		 * Pick a source address for this solicitation, but
2251 		 * restrict the selection to addresses assigned to the
2252 		 * output interface (or interface group).  We do this
2253 		 * because the destination will create a neighbor cache
2254 		 * entry for the source address of this packet, so the
2255 		 * source address had better be a valid neighbor.
2256 		 */
2257 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2258 		    IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
2259 		if (src_ipif == NULL) {
2260 			char buf[INET6_ADDRSTRLEN];
2261 
2262 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2263 			    inet_ntop(AF_INET6, (char *)target, buf,
2264 			    sizeof (buf))));
2265 			return (B_TRUE);
2266 		}
		/*
		 * sender now points into src_ipif, so src_ipif must stay
		 * held until the address has been copied into the packet;
		 * it is released just before the packet is sent.
		 */
2267 		sender = &src_ipif->ipif_v6src_addr;
2268 		hwaddr_ill = src_ipif->ipif_ill;
2269 	}
2270 
2271 	/*
2272 	 * Always make sure that the NS/NA packets don't get load
2273 	 * spread. This is needed so that the probe packets sent
2274 	 * by the in.mpathd daemon can really go out on the desired
2275 	 * interface. Probe packets are made to go out on a desired
2276 	 * interface by including a ip6i with ATTACH_IF flag. As these
2277 	 * packets indirectly end up sending/receiving NS/NA packets
2278 	 * (neighbor doing NUD), we have to make sure that NA
2279 	 * also go out on the same interface.
2280 	 */
	/*
	 * plen is the link layer address option size in 8-byte units,
	 * rounded up.  The buffer is sized for the ip6i_t, the IPv6
	 * header, the fixed ND message (sizeof (nd_neighbor_advert_t) is
	 * used for both operations) and the option; the option is trimmed
	 * off again further down if it turns out not to be needed.
	 *
	 * NOTE(review): plen is derived from ill->ill_nd_lla_len while the
	 * bcopy() below copies hwaddr_ill->ill_nd_lla_len bytes; this looks
	 * like it assumes both ills use the same length - confirm.
	 */
2281 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2282 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2283 	    plen * 8;
2284 	mp = allocb(len,  BPRI_LO);
2285 	if (mp == NULL) {
2286 		if (src_ipif != NULL)
2287 			ipif_refrele(src_ipif);
2288 		return (B_TRUE);
2289 	}
	/* Start from an all-zero packet; only non-zero fields are set below */
2290 	bzero((char *)mp->b_rptr, len);
2291 	mp->b_wptr = mp->b_rptr + len;
2292 
	/* The ip6i_t leads the buffer; the IPv6 header follows it */
2293 	ip6i = (ip6i_t *)mp->b_rptr;
2294 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2295 	ip6i->ip6i_nxt = IPPROTO_RAW;
2296 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2297 	if (flag & NDP_PROBE)
2298 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2299 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2300 
2301 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2302 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2303 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2304 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2305 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2306 	ip6h->ip6_dst = *target;
2307 	icmp6 = (icmp6_t *)&ip6h[1];
2308 
	/* The option sits immediately after the fixed-size ND message */
2309 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2310 	    sizeof (nd_neighbor_advert_t));
2311 
2312 	if (operation == ND_NEIGHBOR_SOLICIT) {
2313 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2314 
2315 		if (!(flag & NDP_PROBE))
2316 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2317 		ip6h->ip6_src = *sender;
2318 		ns->nd_ns_target = *target;
2319 		if (!(flag & NDP_UNICAST)) {
2320 			/* Form multicast address of the target */
2321 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2322 			ip6h->ip6_dst.s6_addr32[3] |=
2323 			    ns->nd_ns_target.s6_addr32[3];
2324 		}
2325 	} else {
2326 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2327 
2328 		ASSERT(!(flag & NDP_PROBE));
2329 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		/* An advertisement announces the sender's own address */
2330 		ip6h->ip6_src = *sender;
2331 		na->nd_na_target = *sender;
2332 		if (flag & NDP_ISROUTER)
2333 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2334 		if (flag & NDP_SOLICITED)
2335 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2336 		if (flag & NDP_ORIDE)
2337 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2338 	}
2339 
2340 	hw_addr = NULL;
2341 	if (!(flag & NDP_PROBE)) {
		/* ill_lock is held while the address is selected and copied */
2342 		mutex_enter(&hwaddr_ill->ill_lock);
2343 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2344 		    hwaddr_ill->ill_phys_addr;
2345 		if (hw_addr != NULL) {
2346 			/* Fill in link layer address and option len */
2347 			opt->nd_opt_len = (uint8_t)plen;
2348 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2349 		}
2350 		mutex_exit(&hwaddr_ill->ill_lock);
2351 	}
	/*
	 * hw_addr is still NULL for probes and when the selected ill has
	 * no address to offer; the packet is shortened in place and the
	 * payload length recomputed.
	 */
2352 	if (hw_addr == NULL) {
2353 		/* If there's no link layer address option, then strip it. */
2354 		len -= plen * 8;
2355 		mp->b_wptr = mp->b_rptr + len;
2356 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2357 	}
2358 
2359 	icmp6->icmp6_type = (uint8_t)operation;
2360 	icmp6->icmp6_code = 0;
2361 	/*
2362 	 * Prepare for checksum by putting icmp length in the icmp
2363 	 * checksum field. The checksum is calculated in ip_wput_v6.
2364 	 */
2365 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2366 
	/* The source address has been copied; the ipif is no longer needed */
2367 	if (src_ipif != NULL)
2368 		ipif_refrele(src_ipif);
	/*
	 * Send only if the write queue can take the message; otherwise
	 * free it and tell the caller that nothing was sent.
	 */
2369 	if (canput(ill->ill_wq)) {
2370 		put(ill->ill_wq, mp);
2371 		return (B_FALSE);
2372 	}
2373 	freemsg(mp);
2374 	return (B_TRUE);
2375 }
2376 
2377 /*
2378  * Make a link layer address (does not include the SAP) from an nce.
2379  * To form the link layer address, use the last four bytes of ipv6
2380  * address passed in and the fixed offset stored in nce.
2381  */
2382 static void
2383 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2384 {
2385 	uchar_t *mask, *to;
2386 	ill_t	*ill = nce->nce_ill;
2387 	int 	len;
2388 
2389 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2390 		return;
2391 	ASSERT(nce->nce_res_mp != NULL);
2392 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2393 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2394 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2395 	ASSERT(addr != NULL);
2396 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2397 	    addrpos, ill->ill_nd_lla_len);
2398 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2399 	    IPV6_ADDR_LEN);
2400 	mask = (uchar_t *)&nce->nce_extract_mask;
2401 	mask += (IPV6_ADDR_LEN - len);
2402 	addr += (IPV6_ADDR_LEN - len);
2403 	to = addrpos + nce->nce_ll_extract_start;
2404 	while (len-- > 0)
2405 		*to++ |= *mask++ & *addr++;
2406 }
2407 
2408 /*
2409  * Pass a cache report back out via NDD.
2410  */
2411 /* ARGSUSED */
2412 int
2413 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
2414 {
2415 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
2416 			"     proto addr/mask");
2417 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp);
2418 	return (0);
2419 }
2420 
2421 /*
2422  * Add a single line to the NDP Cache Entry Report.
2423  */
2424 static void
2425 nce_report1(nce_t *nce, uchar_t *mp_arg)
2426 {
2427 	ill_t		*ill = nce->nce_ill;
2428 	char		local_buf[INET6_ADDRSTRLEN];
2429 	uchar_t		flags_buf[10];
2430 	uint32_t	flags = nce->nce_flags;
2431 	mblk_t		*mp = (mblk_t *)mp_arg;
2432 	uchar_t		*h;
2433 	uchar_t		*m = flags_buf;
2434 	in6_addr_t	v6addr;
2435 
2436 	/*
2437 	 * Lock the nce to protect nce_res_mp from being changed
2438 	 * if an external resolver address resolution completes
2439 	 * while nce_res_mp is being accessed here.
2440 	 *
2441 	 * Deal with all address formats, not just Ethernet-specific
2442 	 * In addition, make sure that the mblk has enough space
2443 	 * before writing to it. If is doesn't, allocate a new one.
2444 	 */
2445 	if (nce->nce_ipversion == IPV4_VERSION)
2446 		/* Don't include v4 nce_ts in NDP cache entry report */
2447 		return;
2448 
2449 	ASSERT(ill != NULL);
2450 	v6addr = nce->nce_mask;
2451 	if (flags & NCE_F_PERMANENT)
2452 		*m++ = 'P';
2453 	if (flags & NCE_F_ISROUTER)
2454 		*m++ = 'R';
2455 	if (flags & NCE_F_MAPPING)
2456 		*m++ = 'M';
2457 	*m = '\0';
2458 
2459 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
2460 		size_t		addrlen;
2461 		char		*addr_buf;
2462 		dl_unitdata_req_t	*dl;
2463 
2464 		mutex_enter(&nce->nce_lock);
2465 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2466 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
2467 		if (ill->ill_flags & ILLF_XRESOLV)
2468 			addrlen = (3 * (dl->dl_dest_addr_length));
2469 		else
2470 			addrlen = (3 * (ill->ill_nd_lla_len));
2471 		if (addrlen <= 0) {
2472 			mutex_exit(&nce->nce_lock);
2473 			(void) mi_mpprintf(mp,
2474 			    "%8s %9s %5s %s/%d",
2475 			    ill->ill_name,
2476 			    "None",
2477 			    (uchar_t *)&flags_buf,
2478 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2479 				(char *)local_buf, sizeof (local_buf)),
2480 				ip_mask_to_plen_v6(&v6addr));
2481 		} else {
2482 			/*
2483 			 * Convert the hardware/lla address to ascii
2484 			 */
2485 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
2486 			if (addr_buf == NULL) {
2487 				mutex_exit(&nce->nce_lock);
2488 				return;
2489 			}
2490 			(void) mac_colon_addr((uint8_t *)h,
2491 			    (ill->ill_flags & ILLF_XRESOLV) ?
2492 			    dl->dl_dest_addr_length : ill->ill_nd_lla_len,
2493 			    addr_buf, addrlen);
2494 			mutex_exit(&nce->nce_lock);
2495 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
2496 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
2497 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2498 				(char *)local_buf, sizeof (local_buf)),
2499 				ip_mask_to_plen_v6(&v6addr));
2500 			kmem_free(addr_buf, addrlen);
2501 		}
2502 	} else {
2503 		(void) mi_mpprintf(mp,
2504 		    "%8s %9s %5s %s/%d",
2505 		    ill->ill_name,
2506 		    "None",
2507 		    (uchar_t *)&flags_buf,
2508 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2509 			(char *)local_buf, sizeof (local_buf)),
2510 			ip_mask_to_plen_v6(&v6addr));
2511 	}
2512 }
2513 
2514 mblk_t *
2515 nce_udreq_alloc(ill_t *ill)
2516 {
2517 	mblk_t	*template_mp = NULL;
2518 	dl_unitdata_req_t *dlur;
2519 	int	sap_length;
2520 
2521 	ASSERT(ill->ill_isv6);
2522 
2523 	sap_length = ill->ill_sap_length;
2524 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2525 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2526 	if (template_mp == NULL)
2527 		return (NULL);
2528 
2529 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2530 	dlur->dl_priority.dl_min = 0;
2531 	dlur->dl_priority.dl_max = 0;
2532 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2533 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2534 
2535 	/* Copy in the SAP value. */
2536 	NCE_LL_SAP_COPY(ill, template_mp);
2537 
2538 	return (template_mp);
2539 }
2540 
2541 /*
2542  * NDP retransmit timer.
2543  * This timer goes off when:
2544  * a. It is time to retransmit NS for resolver.
2545  * b. It is time to send reachability probes.
2546  */
/*
 * arg is the nce_t the timer was armed for.  The function takes its own
 * reference on the nce and clears nce_timeout_id under nce_lock, then acts
 * on nce_state.  Every path through the switch drops ill_g_lock and
 * nce_lock and releases the reference taken here before returning.
 */
2547 void
2548 ndp_timer(void *arg)
2549 {
2550 	nce_t		*nce = arg;
2551 	ill_t		*ill = nce->nce_ill;
2552 	uint32_t	ms;
2553 	char		addrbuf[INET6_ADDRSTRLEN];
2554 	mblk_t		*mp;
2555 	boolean_t	dropped = B_FALSE;
2556 
2557 	/*
2558 	 * The timer has to be cancelled by ndp_delete before doing the final
2559 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2560 	 * until it clears the timeout_id. Before clearing the timeout_id
2561 	 * bump up the refcnt so that we can continue to use the nce
2562 	 */
2563 	ASSERT(nce != NULL);
2564 
2565 	/*
2566 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2567 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2568 	 */
2569 	rw_enter(&ill_g_lock, RW_READER);
2570 	mutex_enter(&nce->nce_lock);
2571 	NCE_REFHOLD_LOCKED(nce);
2572 	nce->nce_timeout_id = 0;
2573 
2574 	/*
2575 	 * Check the reachability state first.
2576 	 */
2577 	switch (nce->nce_state) {
2578 	case ND_DELAY:
		/*
		 * The delay period ran out: switch to PROBE and send a
		 * unicast solicitation.  No source address is supplied, so
		 * nce_xmit picks one.
		 */
2579 		rw_exit(&ill_g_lock);
2580 		nce->nce_state = ND_PROBE;
2581 		mutex_exit(&nce->nce_lock);
2582 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2583 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2584 		if (ip_debug > 3) {
2585 			/* ip2dbg */
2586 			pr_addr_dbg("ndp_timer: state for %s changed "
2587 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2588 		}
2589 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2590 		NCE_REFRELE(nce);
2591 		return;
2592 	case ND_PROBE:
2593 		/* must be retransmit timer */
2594 		rw_exit(&ill_g_lock);
		/*
		 * nce_pcnt counts the probes still to be sent.  After the
		 * decrement: > 0 sends another probe, 0 means the last
		 * probe has gone out (non-PERMANENT entries wait one more
		 * interval, PERMANENT ones are declared usable), and < 0
		 * means that final wait also expired.
		 */
2595 		nce->nce_pcnt--;
2596 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2597 		    nce->nce_pcnt >= -1);
2598 		if (nce->nce_pcnt > 0) {
2599 			/*
2600 			 * As per RFC2461, the nce gets deleted after
2601 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2602 			 * Note that the first unicast solicitation is sent
2603 			 * during the DELAY state.
2604 			 */
2605 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2606 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2607 			    addrbuf, sizeof (addrbuf))));
2608 			mutex_exit(&nce->nce_lock);
			/*
			 * PERMANENT entries are our own addresses and are
			 * probed with NDP_PROBE; other entries get a unicast
			 * solicitation.
			 */
2609 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2610 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2611 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2612 			    NDP_UNICAST);
			/*
			 * nce_xmit returns B_TRUE when nothing was sent;
			 * undo the decrement so the attempt is not counted.
			 */
2613 			if (dropped) {
2614 				mutex_enter(&nce->nce_lock);
2615 				nce->nce_pcnt++;
2616 				mutex_exit(&nce->nce_lock);
2617 			}
2618 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2619 		} else if (nce->nce_pcnt < 0) {
2620 			/* No hope, delete the nce */
2621 			nce->nce_state = ND_UNREACHABLE;
2622 			mutex_exit(&nce->nce_lock);
2623 			if (ip_debug > 2) {
2624 				/* ip1dbg */
2625 				pr_addr_dbg("ndp_timer: Delete IRE for"
2626 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2627 			}
2628 			ndp_delete(nce);
2629 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2630 			/* Wait RetransTimer, before deleting the entry */
2631 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2632 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2633 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2634 			mutex_exit(&nce->nce_lock);
2635 			/* Wait one interval before killing */
2636 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2637 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2638 			ipif_t *ipif;
2639 
2640 			/*
2641 			 * We're done probing, and we can now declare this
2642 			 * address to be usable.  Let IP know that it's ok to
2643 			 * use.
2644 			 */
2645 			nce->nce_state = ND_REACHABLE;
2646 			mutex_exit(&nce->nce_lock);
2647 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2648 			    ALL_ZONES, NULL, NULL, NULL, NULL);
2649 			if (ipif != NULL) {
				/*
				 * The address had earlier been marked as a
				 * duplicate; clear the mark and log that it
				 * has been recovered.
				 */
2650 				if (ipif->ipif_was_dup) {
2651 					char ibuf[LIFNAMSIZ + 10];
2652 					char sbuf[INET6_ADDRSTRLEN];
2653 
2654 					ipif->ipif_was_dup = B_FALSE;
2655 					(void) strlcpy(ibuf, ill->ill_name,
2656 					    sizeof (ibuf));
2657 					(void) inet_ntop(AF_INET6,
2658 					    &ipif->ipif_v6lcl_addr,
2659 					    sbuf, sizeof (sbuf));
					/* Append ":<id>" for a non-zero ipif_id */
2660 					if (ipif->ipif_id != 0) {
2661 						(void) snprintf(ibuf +
2662 						    ill->ill_name_length - 1,
2663 						    sizeof (ibuf) -
2664 						    ill->ill_name_length + 1,
2665 						    ":%d", ipif->ipif_id);
2666 					}
2667 					cmn_err(CE_NOTE, "recovered address "
2668 					    "%s on %s", sbuf, ibuf);
2669 				}
				/*
				 * Send the notifications only the first time
				 * the address becomes ready on an UP ipif.
				 */
2670 				if ((ipif->ipif_flags & IPIF_UP) &&
2671 				    !ipif->ipif_addr_ready) {
2672 					ip_rts_ifmsg(ipif);
2673 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2674 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2675 				}
2676 				ipif->ipif_addr_ready = 1;
2677 				ipif_refrele(ipif);
2678 			}
2679 			/* Begin defending our new address */
2680 			nce->nce_unsolicit_count = 0;
2681 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2682 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2683 			    nce_advert_flags(nce));
			/*
			 * If the advertisement could not be sent, schedule
			 * one retry after the unsolicited interval;
			 * otherwise arm the periodic defense timer when it
			 * is enabled.
			 */
2684 			if (dropped) {
2685 				nce->nce_unsolicit_count = 1;
2686 				NDP_RESTART_TIMER(nce,
2687 				    ip_ndp_unsolicit_interval);
2688 			} else if (ip_ndp_defense_interval != 0) {
2689 				NDP_RESTART_TIMER(nce, ip_ndp_defense_interval);
2690 			}
2691 		} else {
2692 			/*
2693 			 * This is an address we're probing to be our own, but
2694 			 * the ill is down.  Wait until it comes back before
2695 			 * doing anything, but switch to reachable state so
2696 			 * that the restart will work.
2697 			 */
2698 			nce->nce_state = ND_REACHABLE;
2699 			mutex_exit(&nce->nce_lock);
2700 		}
2701 		NCE_REFRELE(nce);
2702 		return;
2703 	case ND_INCOMPLETE:
2704 		/*
2705 		 * Must be resolvers retransmit timer.
2706 		 */
2707 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2708 			ip6i_t	*ip6i;
2709 			ip6_t	*ip6h;
2710 			mblk_t *data_mp;
2711 
2712 			/*
2713 			 * Walk the list of packets queued, and see if there
2714 			 * are any multipathing probe packets. Such packets
2715 			 * are always queued at the head. Since this is a
2716 			 * retransmit timer firing, mark such packets as
2717 			 * delayed in ND resolution. This info will be used
2718 			 * in ip_wput_v6(). Multipathing probe packets will
2719 			 * always have an ip6i_t. Once we hit a packet without
2720 			 * it, we can break out of this loop.
2721 			 */
			/* Skip over a leading M_CTL block to reach the data */
2722 			if (mp->b_datap->db_type == M_CTL)
2723 				data_mp = mp->b_cont;
2724 			else
2725 				data_mp = mp;
2726 
			/* A next-header of IPPROTO_RAW identifies an ip6i_t */
2727 			ip6h = (ip6_t *)data_mp->b_rptr;
2728 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2729 				break;
2730 
2731 			/*
2732 			 * This message should have been pulled up already in
2733 			 * ip_wput_v6. We can't do pullups here because the
2734 			 * b_next/b_prev is non-NULL.
2735 			 */
2736 			ip6i = (ip6i_t *)ip6h;
2737 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2738 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2739 
2740 			/* Mark this packet as delayed due to ND resolution */
2741 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2742 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2743 		}
		/* Only keep soliciting while packets are waiting on the entry */
2744 		if (nce->nce_qd_mp != NULL) {
			/*
			 * A zero return from nce_solicit is handled below as
			 * "no further retransmission"; a non-zero value is
			 * used as the next timer interval.
			 */
2745 			ms = nce_solicit(nce, NULL);
2746 			rw_exit(&ill_g_lock);
2747 			if (ms == 0) {
				/*
				 * Unless the entry has become REACHABLE in
				 * the meantime, resolution has failed: fail
				 * the queued packets and delete the entry.
				 */
2748 				if (nce->nce_state != ND_REACHABLE) {
2749 					mutex_exit(&nce->nce_lock);
2750 					nce_resolv_failed(nce);
2751 					ndp_delete(nce);
2752 				} else {
2753 					mutex_exit(&nce->nce_lock);
2754 				}
2755 			} else {
2756 				mutex_exit(&nce->nce_lock);
2757 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2758 			}
2759 			NCE_REFRELE(nce);
2760 			return;
2761 		}
2762 		mutex_exit(&nce->nce_lock);
2763 		rw_exit(&ill_g_lock);
2764 		NCE_REFRELE(nce);
2765 		break;
2766 	case ND_REACHABLE :
2767 		rw_exit(&ill_g_lock);
		/*
		 * Send an advertisement when unsolicited advertisements are
		 * still owed for this entry, or when it is one of our own
		 * (PERMANENT) addresses and periodic defense is enabled.
		 */
2768 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2769 		    nce->nce_unsolicit_count != 0) ||
2770 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2771 		    ip_ndp_defense_interval != 0)) {
2772 			if (nce->nce_unsolicit_count > 0)
2773 				nce->nce_unsolicit_count--;
2774 			mutex_exit(&nce->nce_lock);
2775 			dropped = nce_xmit(ill,
2776 			    ND_NEIGHBOR_ADVERT,
2777 			    ill,	/* ill to be used for hw addr */
2778 			    B_FALSE,	/* use ill_phys_addr */
2779 			    &nce->nce_addr,
2780 			    &ipv6_all_hosts_mcast,
2781 			    nce_advert_flags(nce));
			/* Not sent: put the advertisement back on the count */
2782 			if (dropped) {
2783 				mutex_enter(&nce->nce_lock);
2784 				nce->nce_unsolicit_count++;
2785 				mutex_exit(&nce->nce_lock);
2786 			}
			/*
			 * Use the shorter unsolicited interval while
			 * advertisements remain to be sent, the defense
			 * interval otherwise.
			 */
2787 			if (nce->nce_unsolicit_count != 0) {
2788 				NDP_RESTART_TIMER(nce,
2789 				    ip_ndp_unsolicit_interval);
2790 			} else {
2791 				NDP_RESTART_TIMER(nce,
2792 				    ip_ndp_defense_interval);
2793 			}
2794 		} else {
2795 			mutex_exit(&nce->nce_lock);
2796 		}
2797 		NCE_REFRELE(nce);
2798 		break;
2799 	default:
		/* Nothing to do in any other state; just undo the setup */
2800 		rw_exit(&ill_g_lock);
2801 		mutex_exit(&nce->nce_lock);
2802 		NCE_REFRELE(nce);
2803 		break;
2804 	}
2805 }
2806 
2807 /*
2808  * Set a link layer address from the ll_addr passed in.
2809  * Copy SAP from ill.
2810  */
2811 static void
2812 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2813 {
2814 	ill_t	*ill = nce->nce_ill;
2815 	uchar_t	*woffset;
2816 
2817 	ASSERT(ll_addr != NULL);
2818 	/* Always called before fast_path_probe */
2819 	ASSERT(nce->nce_fp_mp == NULL);
2820 	if (ill->ill_sap_length != 0) {
2821 		/*
2822 		 * Copy the SAP type specified in the
2823 		 * request into the xmit template.
2824 		 */
2825 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2826 	}
2827 	if (ill->ill_phys_addr_length > 0) {
2828 		/*
2829 		 * The bcopy() below used to be called for the physical address
2830 		 * length rather than the link layer address length. For
2831 		 * ethernet and many other media, the phys_addr and lla are
2832 		 * identical.
2833 		 * However, with xresolv interfaces being introduced, the
2834 		 * phys_addr and lla are no longer the same, and the physical
2835 		 * address may not have any useful meaning, so we use the lla
2836 		 * for IPv6 address resolution and destination addressing.
2837 		 *
2838 		 * For PPP or other interfaces with a zero length
2839 		 * physical address, don't do anything here.
2840 		 * The bcopy() with a zero phys_addr length was previously
2841 		 * a no-op for interfaces with a zero-length physical address.
2842 		 * Using the lla for them would change the way they operate.
2843 		 * Doing nothing in such cases preserves expected behavior.
2844 		 */
2845 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2846 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2847 	}
2848 }
2849 
2850 static boolean_t
2851 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2852 {
2853 	ill_t	*ill = nce->nce_ill;
2854 	uchar_t	*ll_offset;
2855 
2856 	ASSERT(nce->nce_res_mp != NULL);
2857 	if (ll_addr == NULL)
2858 		return (B_FALSE);
2859 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2860 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2861 		return (B_TRUE);
2862 	return (B_FALSE);
2863 }
2864 
2865 /*
2866  * Updates the link layer address or the reachability state of
2867  * a cache entry.  Reset probe counter if needed.
2868  */
2869 static void
2870 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2871 {
2872 	ill_t	*ill = nce->nce_ill;
2873 	boolean_t need_stop_timer = B_FALSE;
2874 	boolean_t need_fastpath_update = B_FALSE;
2875 
2876 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2877 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2878 	/*
2879 	 * If this interface does not do NUD, there is no point
2880 	 * in allowing an update to the cache entry.  Although
2881 	 * we will respond to NS.
2882 	 * The only time we accept an update for a resolver when
2883 	 * NUD is turned off is when it has just been created.
2884 	 * Non-Resolvers will always be created as REACHABLE.
2885 	 */
2886 	if (new_state != ND_UNCHANGED) {
2887 		if ((nce->nce_flags & NCE_F_NONUD) &&
2888 		    (nce->nce_state != ND_INCOMPLETE))
2889 			return;
2890 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2891 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2892 		need_stop_timer = B_TRUE;
2893 		if (new_state == ND_REACHABLE)
2894 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2895 		else {
2896 			/* We force NUD in this case */
2897 			nce->nce_last = 0;
2898 		}
2899 		nce->nce_state = new_state;
2900 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2901 	}
2902 	/*
2903 	 * In case of fast path we need to free the the fastpath
2904 	 * M_DATA and do another probe.  Otherwise we can just
2905 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2906 	 * whatever packets that happens to be transmitting at the time.
2907 	 */
2908 	if (new_ll_addr != NULL) {
2909 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2910 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2911 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2912 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2913 		if (nce->nce_fp_mp != NULL) {
2914 			freemsg(nce->nce_fp_mp);
2915 			nce->nce_fp_mp = NULL;
2916 		}
2917 		need_fastpath_update = B_TRUE;
2918 	}
2919 	mutex_exit(&nce->nce_lock);
2920 	if (need_stop_timer) {
2921 		(void) untimeout(nce->nce_timeout_id);
2922 		nce->nce_timeout_id = 0;
2923 	}
2924 	if (need_fastpath_update)
2925 		nce_fastpath(nce);
2926 	mutex_enter(&nce->nce_lock);
2927 }
2928 
2929 void
2930 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2931 {
2932 	uint_t	count = 0;
2933 	mblk_t  **mpp;
2934 
2935 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2936 
2937 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2938 	    mpp = &(*mpp)->b_next) {
2939 		if (++count >
2940 		    nce->nce_ill->ill_max_buf) {
2941 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2942 
2943 			nce->nce_qd_mp->b_next = NULL;
2944 			nce->nce_qd_mp->b_prev = NULL;
2945 			freemsg(nce->nce_qd_mp);
2946 			nce->nce_qd_mp = tmp;
2947 		}
2948 	}
2949 	/* put this on the list */
2950 	if (head_insert) {
2951 		mp->b_next = nce->nce_qd_mp;
2952 		nce->nce_qd_mp = mp;
2953 	} else {
2954 		*mpp = mp;
2955 	}
2956 }
2957 
2958 static void
2959 nce_queue_mp(nce_t *nce, mblk_t *mp)
2960 {
2961 	boolean_t head_insert = B_FALSE;
2962 	ip6_t	*ip6h;
2963 	ip6i_t	*ip6i;
2964 	mblk_t *data_mp;
2965 
2966 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2967 
2968 	if (mp->b_datap->db_type == M_CTL)
2969 		data_mp = mp->b_cont;
2970 	else
2971 		data_mp = mp;
2972 	ip6h = (ip6_t *)data_mp->b_rptr;
2973 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2974 		/*
2975 		 * This message should have been pulled up already in
2976 		 * ip_wput_v6. We can't do pullups here because the message
2977 		 * could be from the nce_qd_mp which could have b_next/b_prev
2978 		 * non-NULL.
2979 		 */
2980 		ip6i = (ip6i_t *)ip6h;
2981 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2982 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2983 		/*
2984 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2985 		 * This has 2 aspects mentioned below.
2986 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2987 		 * This ensures that next retransmit of ND solicitation
2988 		 * will use the interface specified by the probe packet,
2989 		 * for both NS and NA. This corresponds to the src address
2990 		 * in the IPv6 packet. If we insert at tail, we will be
2991 		 * depending on the packet at the head for successful
2992 		 * ND resolution. This is not reliable, because the interface
2993 		 * on which the NA arrives could be different from the interface
2994 		 * on which the NS was sent, and if the receiving interface is
2995 		 * failed, it will appear that the sending interface is also
2996 		 * failed, causing in.mpathd to misdiagnose this as link
2997 		 * failure.
2998 		 * 2. Drop the original packet, if the ND resolution did not
2999 		 * succeed in the first attempt. However we will create the
3000 		 * nce and the ire, as soon as the ND resolution succeeds.
3001 		 * We don't gain anything by queueing multiple probe packets
3002 		 * and sending them back-to-back once resolution succeeds.
3003 		 * It is sufficient to send just 1 packet after ND resolution
3004 		 * succeeds. Since mpathd is sending down probe packets at a
3005 		 * constant rate, we don't need to send the queued packet. We
3006 		 * need to queue it only for NDP resolution. The benefit of
3007 		 * dropping the probe packets that were delayed in ND
3008 		 * resolution, is that in.mpathd will not see inflated
3009 		 * RTT. If the ND resolution does not succeed within
3010 		 * in.mpathd's failure detection time, mpathd may detect
3011 		 * a failure, and it does not matter whether the packet
3012 		 * was queued or dropped.
3013 		 */
3014 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
3015 			head_insert = B_TRUE;
3016 	}
3017 
3018 	nce_queue_mp_common(nce, mp, head_insert);
3019 }
3020 
3021 /*
3022  * Called when address resolution failed due to a timeout.
3023  * Send an ICMP unreachable in response to all queued packets.
3024  */
3025 void
3026 nce_resolv_failed(nce_t *nce)
3027 {
3028 	mblk_t	*mp, *nxt_mp, *first_mp;
3029 	char	buf[INET6_ADDRSTRLEN];
3030 	ip6_t *ip6h;
3031 	zoneid_t zoneid = GLOBAL_ZONEID;
3032 
3033 	ip1dbg(("nce_resolv_failed: dst %s\n",
3034 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3035 	mutex_enter(&nce->nce_lock);
3036 	mp = nce->nce_qd_mp;
3037 	nce->nce_qd_mp = NULL;
3038 	mutex_exit(&nce->nce_lock);
3039 	while (mp != NULL) {
3040 		nxt_mp = mp->b_next;
3041 		mp->b_next = NULL;
3042 		mp->b_prev = NULL;
3043 
3044 		first_mp = mp;
3045 		if (mp->b_datap->db_type == M_CTL) {
3046 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3047 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3048 			zoneid = io->ipsec_out_zoneid;
3049 			ASSERT(zoneid != ALL_ZONES);
3050 			mp = mp->b_cont;
3051 		}
3052 
3053 		ip6h = (ip6_t *)mp->b_rptr;
3054 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3055 			ip6i_t *ip6i;
3056 			/*
3057 			 * This message should have been pulled up already
3058 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3059 			 * the header is pulled up.
3060 			 */
3061 			ip6i = (ip6i_t *)ip6h;
3062 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3063 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3064 			mp->b_rptr += sizeof (ip6i_t);
3065 		}
3066 		/*
3067 		 * Ignore failure since icmp_unreachable_v6 will silently
3068 		 * drop packets with an unspecified source address.
3069 		 */
3070 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
3071 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3072 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid);
3073 		mp = nxt_mp;
3074 	}
3075 }
3076 
3077 /*
3078  * Called by SIOCSNDP* ioctl to add/change an nce entry
3079  * and the corresponding attributes.
3080  * Disallow states other than ND_REACHABLE or ND_STALE.
3081  */
3082 int
3083 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3084 {
3085 	sin6_t		*sin6;
3086 	in6_addr_t	*addr;
3087 	nce_t		*nce;
3088 	int		err;
3089 	uint16_t	new_flags = 0;
3090 	uint16_t	old_flags = 0;
3091 	int		inflags = lnr->lnr_flags;
3092 
3093 	ASSERT(ill->ill_isv6);
3094 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3095 	    (lnr->lnr_state_create != ND_STALE))
3096 		return (EINVAL);
3097 
3098 	sin6 = (sin6_t *)&lnr->lnr_addr;
3099 	addr = &sin6->sin6_addr;
3100 
3101 	mutex_enter(&ndp6.ndp_g_lock);
3102 	/* We know it can not be mapping so just look in the hash table */
3103 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr));
3104 	nce = nce_lookup_addr(ill, addr, nce);
3105 	if (nce != NULL)
3106 		new_flags = nce->nce_flags;
3107 
3108 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3109 	case NDF_ISROUTER_ON:
3110 		new_flags |= NCE_F_ISROUTER;
3111 		break;
3112 	case NDF_ISROUTER_OFF:
3113 		new_flags &= ~NCE_F_ISROUTER;
3114 		break;
3115 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3116 		mutex_exit(&ndp6.ndp_g_lock);
3117 		if (nce != NULL)
3118 			NCE_REFRELE(nce);
3119 		return (EINVAL);
3120 	}
3121 
3122 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3123 	case NDF_ANYCAST_ON:
3124 		new_flags |= NCE_F_ANYCAST;
3125 		break;
3126 	case NDF_ANYCAST_OFF:
3127 		new_flags &= ~NCE_F_ANYCAST;
3128 		break;
3129 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3130 		mutex_exit(&ndp6.ndp_g_lock);
3131 		if (nce != NULL)
3132 			NCE_REFRELE(nce);
3133 		return (EINVAL);
3134 	}
3135 
3136 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
3137 	case NDF_PROXY_ON:
3138 		new_flags |= NCE_F_PROXY;
3139 		break;
3140 	case NDF_PROXY_OFF:
3141 		new_flags &= ~NCE_F_PROXY;
3142 		break;
3143 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
3144 		mutex_exit(&ndp6.ndp_g_lock);
3145 		if (nce != NULL)
3146 			NCE_REFRELE(nce);
3147 		return (EINVAL);
3148 	}
3149 
3150 	if (nce == NULL) {
3151 		err = ndp_add(ill,
3152 		    (uchar_t *)lnr->lnr_hdw_addr,
3153 		    addr,
3154 		    &ipv6_all_ones,
3155 		    &ipv6_all_zeros,
3156 		    0,
3157 		    new_flags,
3158 		    lnr->lnr_state_create,
3159 		    &nce,
3160 		    NULL,
3161 		    NULL);
3162 		if (err != 0) {
3163 			mutex_exit(&ndp6.ndp_g_lock);
3164 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3165 			return (err);
3166 		}
3167 	}
3168 	old_flags = nce->nce_flags;
3169 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3170 		/*
3171 		 * Router turned to host, delete all ires.
3172 		 * XXX Just delete the entry, but we need to add too.
3173 		 */
3174 		nce->nce_flags &= ~NCE_F_ISROUTER;
3175 		mutex_exit(&ndp6.ndp_g_lock);
3176 		ndp_delete(nce);
3177 		NCE_REFRELE(nce);
3178 		return (0);
3179 	}
3180 	mutex_exit(&ndp6.ndp_g_lock);
3181 
3182 	mutex_enter(&nce->nce_lock);
3183 	nce->nce_flags = new_flags;
3184 	mutex_exit(&nce->nce_lock);
3185 	/*
3186 	 * Note that we ignore the state at this point, which
3187 	 * should be either STALE or REACHABLE.  Instead we let
3188 	 * the link layer address passed in to determine the state
3189 	 * much like incoming packets.
3190 	 */
3191 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3192 	NCE_REFRELE(nce);
3193 	return (0);
3194 }
3195 
3196 /*
3197  * If the device driver supports it, we make nce_fp_mp to have
3198  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3199  * The caller insures there is hold on nce for this function.
3200  * Note that since ill_fastpath_probe() copies the mblk there is
3201  * no need for the hold beyond this function.
3202  */
3203 static void
3204 nce_fastpath(nce_t *nce)
3205 {
3206 	ill_t	*ill = nce->nce_ill;
3207 	int res;
3208 
3209 	ASSERT(ill != NULL);
3210 	if (nce->nce_fp_mp != NULL) {
3211 		/* Already contains fastpath info */
3212 		return;
3213 	}
3214 	if (nce->nce_res_mp != NULL) {
3215 		nce_fastpath_list_add(nce);
3216 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3217 		/*
3218 		 * EAGAIN is an indication of a transient error
3219 		 * i.e. allocation failure etc. leave the nce in the list it
3220 		 * will be updated when another probe happens for another ire
3221 		 * if not it will be taken out of the list when the ire is
3222 		 * deleted.
3223 		 */
3224 
3225 		if (res != 0 && res != EAGAIN)
3226 			nce_fastpath_list_delete(nce);
3227 	}
3228 }
3229 
3230 /*
3231  * Drain the list of nce's waiting for fastpath response.
3232  */
3233 void
3234 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3235     void *arg)
3236 {
3237 
3238 	nce_t *next_nce;
3239 	nce_t *current_nce;
3240 	nce_t *first_nce;
3241 	nce_t *prev_nce = NULL;
3242 
3243 	ASSERT(ill != NULL && ill->ill_isv6);
3244 
3245 	mutex_enter(&ill->ill_lock);
3246 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3247 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3248 		next_nce = current_nce->nce_fastpath;
3249 		/*
3250 		 * Take it off the list if we're flushing, or if the callback
3251 		 * routine tells us to do so.  Otherwise, leave the nce in the
3252 		 * fastpath list to handle any pending response from the lower
3253 		 * layer.  We can't drain the list when the callback routine
3254 		 * comparison failed, because the response is asynchronous in
3255 		 * nature, and may not arrive in the same order as the list
3256 		 * insertion.
3257 		 */
3258 		if (func == NULL || func(current_nce, arg)) {
3259 			current_nce->nce_fastpath = NULL;
3260 			if (current_nce == first_nce)
3261 				ill->ill_fastpath_list = first_nce = next_nce;
3262 			else
3263 				prev_nce->nce_fastpath = next_nce;
3264 		} else {
3265 			/* previous element that is still in the list */
3266 			prev_nce = current_nce;
3267 		}
3268 		current_nce = next_nce;
3269 	}
3270 	mutex_exit(&ill->ill_lock);
3271 }
3272 
3273 /*
3274  * Add nce to the nce fastpath list.
3275  */
3276 void
3277 nce_fastpath_list_add(nce_t *nce)
3278 {
3279 	ill_t *ill;
3280 
3281 	ill = nce->nce_ill;
3282 	ASSERT(ill != NULL && ill->ill_isv6);
3283 
3284 	mutex_enter(&ill->ill_lock);
3285 	mutex_enter(&nce->nce_lock);
3286 
3287 	/*
3288 	 * if nce has not been deleted and
3289 	 * is not already in the list add it.
3290 	 */
3291 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3292 	    (nce->nce_fastpath == NULL)) {
3293 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3294 		ill->ill_fastpath_list = nce;
3295 	}
3296 
3297 	mutex_exit(&nce->nce_lock);
3298 	mutex_exit(&ill->ill_lock);
3299 }
3300 
3301 /*
3302  * remove nce from the nce fastpath list.
3303  */
3304 void
3305 nce_fastpath_list_delete(nce_t *nce)
3306 {
3307 	nce_t *nce_ptr;
3308 
3309 	ill_t *ill;
3310 
3311 	ill = nce->nce_ill;
3312 	ASSERT(ill != NULL);
3313 	if (!ill->ill_isv6)  {
3314 		/*
3315 		 * v4 nce_t's do not have nce_fastpath set.
3316 		 */
3317 		return;
3318 	}
3319 
3320 	mutex_enter(&ill->ill_lock);
3321 	if (nce->nce_fastpath == NULL)
3322 		goto done;
3323 
3324 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3325 
3326 	if (ill->ill_fastpath_list == nce) {
3327 		ill->ill_fastpath_list = nce->nce_fastpath;
3328 	} else {
3329 		nce_ptr = ill->ill_fastpath_list;
3330 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3331 			if (nce_ptr->nce_fastpath == nce) {
3332 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3333 				break;
3334 			}
3335 			nce_ptr = nce_ptr->nce_fastpath;
3336 		}
3337 	}
3338 
3339 	nce->nce_fastpath = NULL;
3340 done:
3341 	mutex_exit(&ill->ill_lock);
3342 }
3343 
3344 /*
3345  * Update all NCE's that are not in fastpath mode and
3346  * have an nce_fp_mp that matches mp. mp->b_cont contains
3347  * the fastpath header.
3348  *
3349  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3350  */
3351 boolean_t
3352 ndp_fastpath_update(nce_t *nce, void *arg)
3353 {
3354 	mblk_t 	*mp, *fp_mp;
3355 	uchar_t	*mp_rptr, *ud_mp_rptr;
3356 	mblk_t	*ud_mp = nce->nce_res_mp;
3357 	ptrdiff_t	cmplen;
3358 
3359 	if (nce->nce_flags & NCE_F_MAPPING)
3360 		return (B_TRUE);
3361 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3362 		return (B_TRUE);
3363 
3364 	ip2dbg(("ndp_fastpath_update: trying\n"));
3365 	mp = (mblk_t *)arg;
3366 	mp_rptr = mp->b_rptr;
3367 	cmplen = mp->b_wptr - mp_rptr;
3368 	ASSERT(cmplen >= 0);
3369 	ud_mp_rptr = ud_mp->b_rptr;
3370 	/*
3371 	 * The nce is locked here to prevent any other threads
3372 	 * from accessing and changing nce_res_mp when the IPv6 address
3373 	 * becomes resolved to an lla while we're in the middle
3374 	 * of looking at and comparing the hardware address (lla).
3375 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3376 	 * from examining nce_res_mp atthe same time.
3377 	 */
3378 	mutex_enter(&nce->nce_lock);
3379 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3380 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3381 		mutex_exit(&nce->nce_lock);
3382 		/*
3383 		 * Don't take the ire off the fastpath list yet,
3384 		 * since the response may come later.
3385 		 */
3386 		return (B_FALSE);
3387 	}
3388 	/* Matched - install mp as the fastpath mp */
3389 	ip1dbg(("ndp_fastpath_update: match\n"));
3390 	fp_mp = dupb(mp->b_cont);
3391 	if (fp_mp != NULL) {
3392 		nce->nce_fp_mp = fp_mp;
3393 	}
3394 	mutex_exit(&nce->nce_lock);
3395 	return (B_TRUE);
3396 }
3397 
3398 /*
3399  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3400  * driver.  Note that it assumes IP is exclusive...
3401  */
3402 /* ARGSUSED */
3403 void
3404 ndp_fastpath_flush(nce_t *nce, char *arg)
3405 {
3406 	if (nce->nce_flags & NCE_F_MAPPING)
3407 		return;
3408 	/* No fastpath info? */
3409 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3410 		return;
3411 
3412 	/* Just delete the NCE... */
3413 	ndp_delete(nce);
3414 }
3415 
3416 /*
3417  * Return a pointer to a given option in the packet.
3418  * Assumes that option part of the packet have already been validated.
3419  */
3420 nd_opt_hdr_t *
3421 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3422 {
3423 	while (optlen > 0) {
3424 		if (opt->nd_opt_type == opt_type)
3425 			return (opt);
3426 		optlen -= 8 * opt->nd_opt_len;
3427 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3428 	}
3429 	return (NULL);
3430 }
3431 
3432 /*
3433  * Verify all option lengths present are > 0, also check to see
3434  * if the option lengths and packet length are consistent.
3435  */
3436 boolean_t
3437 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3438 {
3439 	ASSERT(opt != NULL);
3440 	while (optlen > 0) {
3441 		if (opt->nd_opt_len == 0)
3442 			return (B_FALSE);
3443 		optlen -= 8 * opt->nd_opt_len;
3444 		if (optlen < 0)
3445 			return (B_FALSE);
3446 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3447 	}
3448 	return (B_TRUE);
3449 }
3450 
3451 /*
3452  * ndp_walk function.
3453  * Free a fraction of the NCE cache entries.
3454  * A fraction of zero means to not free any in that category.
3455  */
3456 void
3457 ndp_cache_reclaim(nce_t *nce, char *arg)
3458 {
3459 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3460 	uint_t	rand;
3461 
3462 	if (nce->nce_flags & NCE_F_PERMANENT)
3463 		return;
3464 
3465 	rand = (uint_t)lbolt +
3466 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3467 	if (ncr->ncr_host != 0 &&
3468 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3469 		ndp_delete(nce);
3470 		return;
3471 	}
3472 }
3473 
3474 /*
3475  * ndp_walk function.
3476  * Count the number of NCEs that can be deleted.
3477  * These would be hosts but not routers.
3478  */
3479 void
3480 ndp_cache_count(nce_t *nce, char *arg)
3481 {
3482 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3483 
3484 	if (nce->nce_flags & NCE_F_PERMANENT)
3485 		return;
3486 
3487 	ncc->ncc_total++;
3488 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3489 		ncc->ncc_host++;
3490 }
3491 
3492 #ifdef NCE_DEBUG
3493 th_trace_t *
3494 th_trace_nce_lookup(nce_t *nce)
3495 {
3496 	int bucket_id;
3497 	th_trace_t *th_trace;
3498 
3499 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3500 
3501 	bucket_id = IP_TR_HASH(curthread);
3502 	ASSERT(bucket_id < IP_TR_HASH_MAX);
3503 
3504 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
3505 	    th_trace = th_trace->th_next) {
3506 		if (th_trace->th_id == curthread)
3507 			return (th_trace);
3508 	}
3509 	return (NULL);
3510 }
3511 
3512 void
3513 nce_trace_ref(nce_t *nce)
3514 {
3515 	int bucket_id;
3516 	th_trace_t *th_trace;
3517 
3518 	/*
3519 	 * Attempt to locate the trace buffer for the curthread.
3520 	 * If it does not exist, then allocate a new trace buffer
3521 	 * and link it in list of trace bufs for this ipif, at the head
3522 	 */
3523 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3524 
3525 	if (nce->nce_trace_disable == B_TRUE)
3526 		return;
3527 
3528 	th_trace = th_trace_nce_lookup(nce);
3529 	if (th_trace == NULL) {
3530 		bucket_id = IP_TR_HASH(curthread);
3531 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
3532 		    KM_NOSLEEP);
3533 		if (th_trace == NULL) {
3534 			nce->nce_trace_disable = B_TRUE;
3535 			nce_trace_inactive(nce);
3536 			return;
3537 		}
3538 		th_trace->th_id = curthread;
3539 		th_trace->th_next = nce->nce_trace[bucket_id];
3540 		th_trace->th_prev = &nce->nce_trace[bucket_id];
3541 		if (th_trace->th_next != NULL)
3542 			th_trace->th_next->th_prev = &th_trace->th_next;
3543 		nce->nce_trace[bucket_id] = th_trace;
3544 	}
3545 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
3546 	th_trace->th_refcnt++;
3547 	th_trace_rrecord(th_trace);
3548 }
3549 
3550 void
3551 nce_untrace_ref(nce_t *nce)
3552 {
3553 	th_trace_t *th_trace;
3554 
3555 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3556 
3557 	if (nce->nce_trace_disable == B_TRUE)
3558 		return;
3559 
3560 	th_trace = th_trace_nce_lookup(nce);
3561 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
3562 
3563 	th_trace_rrecord(th_trace);
3564 	th_trace->th_refcnt--;
3565 }
3566 
3567 void
3568 nce_trace_inactive(nce_t *nce)
3569 {
3570 	th_trace_t *th_trace;
3571 	int i;
3572 
3573 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3574 
3575 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
3576 		while (nce->nce_trace[i] != NULL) {
3577 			th_trace = nce->nce_trace[i];
3578 
3579 			/* unlink th_trace and free it */
3580 			nce->nce_trace[i] = th_trace->th_next;
3581 			if (th_trace->th_next != NULL)
3582 				th_trace->th_next->th_prev =
3583 				    &nce->nce_trace[i];
3584 
3585 			th_trace->th_next = NULL;
3586 			th_trace->th_prev = NULL;
3587 			kmem_free(th_trace, sizeof (th_trace_t));
3588 		}
3589 	}
3590 
3591 }
3592 
3593 /* ARGSUSED */
3594 int
3595 nce_thread_exit(nce_t *nce, caddr_t arg)
3596 {
3597 	th_trace_t	*th_trace;
3598 
3599 	mutex_enter(&nce->nce_lock);
3600 	th_trace = th_trace_nce_lookup(nce);
3601 
3602 	if (th_trace == NULL) {
3603 		mutex_exit(&nce->nce_lock);
3604 		return (0);
3605 	}
3606 
3607 	ASSERT(th_trace->th_refcnt == 0);
3608 
3609 	/* unlink th_trace and free it */
3610 	*th_trace->th_prev = th_trace->th_next;
3611 	if (th_trace->th_next != NULL)
3612 		th_trace->th_next->th_prev = th_trace->th_prev;
3613 	th_trace->th_next = NULL;
3614 	th_trace->th_prev = NULL;
3615 	kmem_free(th_trace, sizeof (th_trace_t));
3616 	mutex_exit(&nce->nce_lock);
3617 	return (0);
3618 }
3619 #endif
3620 
3621 /*
3622  * Called when address resolution fails due to a timeout.
3623  * Send an ICMP unreachable in response to all queued packets.
3624  */
3625 void
3626 arp_resolv_failed(nce_t *nce)
3627 {
3628 	mblk_t	*mp, *nxt_mp, *first_mp;
3629 	char	buf[INET6_ADDRSTRLEN];
3630 	zoneid_t zoneid = GLOBAL_ZONEID;
3631 	struct in_addr ipv4addr;
3632 
3633 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3634 	ip3dbg(("arp_resolv_failed: dst %s\n",
3635 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3636 	mutex_enter(&nce->nce_lock);
3637 	mp = nce->nce_qd_mp;
3638 	nce->nce_qd_mp = NULL;
3639 	mutex_exit(&nce->nce_lock);
3640 
3641 	while (mp != NULL) {
3642 		nxt_mp = mp->b_next;
3643 		mp->b_next = NULL;
3644 		mp->b_prev = NULL;
3645 
3646 		first_mp = mp;
3647 		/*
3648 		 * Send icmp unreachable messages
3649 		 * to the hosts.
3650 		 */
3651 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid);
3652 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3653 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3654 		    ICMP_HOST_UNREACHABLE, zoneid);
3655 		mp = nxt_mp;
3656 	}
3657 }
3658 
3659 static int
3660 ndp_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3661     const in_addr_t *mask, const in_addr_t *extract_mask,
3662     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3663     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3664 {
3665 	int	err = 0;
3666 	nce_t	*nce;
3667 	in6_addr_t addr6;
3668 
3669 	mutex_enter(&ndp4.ndp_g_lock);
3670 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr));
3671 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3672 	nce = nce_lookup_addr(ill, &addr6, nce);
3673 	if (nce == NULL) {
3674 		err = ndp_add_v4(ill,
3675 		    hw_addr,
3676 		    addr,
3677 		    mask,
3678 		    extract_mask,
3679 		    hw_extract_start,
3680 		    flags,
3681 		    state,
3682 		    newnce,
3683 		    fp_mp,
3684 		    res_mp);
3685 	} else {
3686 		*newnce = nce;
3687 		err = EEXIST;
3688 	}
3689 	mutex_exit(&ndp4.ndp_g_lock);
3690 	return (err);
3691 }
3692 
3693 /*
3694  * NDP Cache Entry creation routine for IPv4.
3695  * Mapped entries are handled in arp.
3696  * This routine must always be called with ndp4.ndp_g_lock held.
3697  * Prior to return, nce_refcnt is incremented.
3698  */
3699 static int
3700 ndp_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3701     const in_addr_t *mask, const in_addr_t *extract_mask,
3702     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3703     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3704 {
3705 	static	nce_t		nce_nil;
3706 	nce_t		*nce;
3707 	mblk_t		*mp;
3708 	mblk_t		*template;
3709 	nce_t		**ncep;
3710 
3711 	ASSERT(MUTEX_HELD(&ndp4.ndp_g_lock));
3712 	ASSERT(ill != NULL);
3713 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
3714 		return (EINVAL);
3715 	}
3716 	ASSERT((flags & NCE_F_MAPPING) == 0);
3717 	ASSERT(extract_mask == NULL);
3718 	/*
3719 	 * Allocate the mblk to hold the nce.
3720 	 */
3721 	mp = allocb(sizeof (nce_t), BPRI_MED);
3722 	if (mp == NULL)
3723 		return (ENOMEM);
3724 
3725 	nce = (nce_t *)mp->b_rptr;
3726 	mp->b_wptr = (uchar_t *)&nce[1];
3727 	*nce = nce_nil;
3728 
3729 	/*
3730 	 * This one holds link layer address; if res_mp has been provided
3731 	 * by the caller, accept it without any further checks. Otherwise,
3732 	 * for V4, we fill it up with ill_resolver_mp here, then in
3733 	 * in ire_arpresolve(), we fill it up with the ARP query
3734 	 * once its formulated.
3735 	 */
3736 	if (res_mp != NULL) {
3737 		template = res_mp;
3738 	} else  {
3739 		template = copyb(ill->ill_resolver_mp);
3740 	}
3741 	if (template == NULL) {
3742 		freeb(mp);
3743 		return (ENOMEM);
3744 	}
3745 	nce->nce_ill = ill;
3746 	nce->nce_ipversion = IPV4_VERSION;
3747 	nce->nce_flags = flags;
3748 	nce->nce_state = state;
3749 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3750 	nce->nce_rcnt = ill->ill_xmit_count;
3751 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3752 	if (*mask == IP_HOST_MASK) {
3753 		nce->nce_mask = ipv6_all_ones;
3754 	} else  {
3755 		IN6_IPADDR_TO_V4MAPPED(*mask, &nce->nce_mask);
3756 	}
3757 	nce->nce_extract_mask = ipv6_all_zeros;
3758 	nce->nce_ll_extract_start = hw_extract_start;
3759 	nce->nce_fp_mp = (fp_mp? fp_mp : NULL);
3760 	nce->nce_res_mp = template;
3761 	if (state == ND_REACHABLE)
3762 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3763 	else
3764 		nce->nce_last = 0;
3765 	nce->nce_qd_mp = NULL;
3766 	nce->nce_mp = mp;
3767 	if (hw_addr != NULL)
3768 		nce_set_ll(nce, hw_addr);
3769 	/* This one is for nce getting created */
3770 	nce->nce_refcnt = 1;
3771 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3772 	ncep = ((nce_t **)NCE_HASH_PTR_V4(*addr));
3773 
3774 #ifdef NCE_DEBUG
3775 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
3776 #endif
3777 	/*
3778 	 * Atomically ensure that the ill is not CONDEMNED, before
3779 	 * adding the NCE.
3780 	 */
3781 	mutex_enter(&ill->ill_lock);
3782 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3783 		mutex_exit(&ill->ill_lock);
3784 		freeb(mp);
3785 		if (res_mp == NULL) {
3786 			/*
3787 			 * template was locally allocated. need to free it.
3788 			 */
3789 			freeb(template);
3790 		}
3791 		return (EINVAL);
3792 	}
3793 	if ((nce->nce_next = *ncep) != NULL)
3794 		nce->nce_next->nce_ptpn = &nce->nce_next;
3795 	*ncep = nce;
3796 	nce->nce_ptpn = ncep;
3797 	*newnce = nce;
3798 	/* This one is for nce being used by an active thread */
3799 	NCE_REFHOLD(*newnce);
3800 
3801 	/* Bump up the number of nce's referencing this ill */
3802 	ill->ill_nce_cnt++;
3803 	mutex_exit(&ill->ill_lock);
3804 	return (0);
3805 }
3806 
3807 void
3808 ndp_flush_qd_mp(nce_t *nce)
3809 {
3810 	mblk_t *qd_mp, *qd_next;
3811 
3812 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3813 	qd_mp = nce->nce_qd_mp;
3814 	nce->nce_qd_mp = NULL;
3815 	while (qd_mp != NULL) {
3816 		qd_next = qd_mp->b_next;
3817 		qd_mp->b_next = NULL;
3818 		qd_mp->b_prev = NULL;
3819 		freemsg(qd_mp);
3820 		qd_mp = qd_next;
3821 	}
3822 }
3823 
3824 nce_t *
3825 nce_reinit(nce_t *nce)
3826 {
3827 	nce_t *newnce = NULL;
3828 	in_addr_t nce_addr, nce_mask;
3829 
3830 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3831 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_mask, nce_mask);
3832 	/*
3833 	 * delete the old one. this will get rid of any ire's pointing
3834 	 * at this nce.
3835 	 */
3836 	ndp_delete(nce);
3837 	/*
3838 	 * create a new nce with the same addr and mask.
3839 	 */
3840 	mutex_enter(&ndp4.ndp_g_lock);
3841 	(void) ndp_add_v4(nce->nce_ill, NULL, &nce_addr, &nce_mask, NULL, 0, 0,
3842 	    ND_INITIAL, &newnce, NULL, NULL);
3843 	mutex_exit(&ndp4.ndp_g_lock);
3844 	/*
3845 	 * refrele the old nce.
3846 	 */
3847 	NCE_REFRELE(nce);
3848 	return (newnce);
3849 }
3850 
3851 /*
3852  * ndp_walk routine to delete all entries that have a given destination or
3853  * gateway address and cached link layer (MAC) address.  This is used when ARP
3854  * informs us that a network-to-link-layer mapping may have changed.
3855  */
3856 void
3857 nce_delete_hw_changed(nce_t *nce, void *arg)
3858 {
3859 	nce_hw_map_t *hwm = arg;
3860 	mblk_t *mp;
3861 	dl_unitdata_req_t *dlu;
3862 	uchar_t *macaddr;
3863 	ill_t *ill;
3864 	int saplen;
3865 	ipaddr_t nce_addr;
3866 
3867 	if (nce->nce_state != ND_REACHABLE)
3868 		return;
3869 
3870 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3871 	if (nce_addr != hwm->hwm_addr)
3872 		return;
3873 
3874 	mutex_enter(&nce->nce_lock);
3875 	if ((mp = nce->nce_res_mp) == NULL) {
3876 		mutex_exit(&nce->nce_lock);
3877 		return;
3878 	}
3879 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3880 	macaddr = (uchar_t *)(dlu + 1);
3881 	ill = nce->nce_ill;
3882 	if ((saplen = ill->ill_sap_length) > 0)
3883 		macaddr += saplen;
3884 	else
3885 		saplen = -saplen;
3886 
3887 	/*
3888 	 * If the hardware address is unchanged, then leave this one alone.
3889 	 * Note that saplen == abs(saplen) now.
3890 	 */
3891 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3892 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3893 		mutex_exit(&nce->nce_lock);
3894 		return;
3895 	}
3896 	mutex_exit(&nce->nce_lock);
3897 
3898 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3899 	ndp_delete(nce);
3900 }
3901 
3902 /*
3903  * This function verifies whether a given IPv4 address is potentially known to
3904  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3905  * so that it can continue to look for hardware changes on that address.
3906  */
3907 boolean_t
3908 ndp_lookup_ipaddr(in_addr_t addr)
3909 {
3910 	nce_t		*nce;
3911 	struct in_addr	nceaddr;
3912 
3913 	if (addr == INADDR_ANY)
3914 		return (B_FALSE);
3915 
3916 	mutex_enter(&ndp4.ndp_g_lock);
3917 	nce = *(nce_t **)NCE_HASH_PTR_V4(addr);
3918 	for (; nce != NULL; nce = nce->nce_next) {
3919 		/* Note that only v4 mapped entries are in the table. */
3920 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3921 		if (addr == nceaddr.s_addr &&
3922 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3923 			/* Single flag check; no lock needed */
3924 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3925 				break;
3926 		}
3927 	}
3928 	mutex_exit(&ndp4.ndp_g_lock);
3929 	return (nce != NULL);
3930 }
3931