xref: /titanic_44/usr/src/uts/common/inet/ip/ip_ndp.c (revision bfec485cb59a4a6ed2a407550a0aef666cbc4a1f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/stropts.h>
31 #include <sys/strsun.h>
32 #include <sys/sysmacros.h>
33 #include <sys/errno.h>
34 #include <sys/dlpi.h>
35 #include <sys/socket.h>
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/cmn_err.h>
39 #include <sys/debug.h>
40 #include <sys/vtrace.h>
41 #include <sys/kmem.h>
42 #include <sys/zone.h>
43 #include <sys/ethernet.h>
44 #include <sys/sdt.h>
45 
46 #include <net/if.h>
47 #include <net/if_types.h>
48 #include <net/if_dl.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/ip6.h>
52 #include <netinet/icmp6.h>
53 
54 #include <inet/common.h>
55 #include <inet/mi.h>
56 #include <inet/mib2.h>
57 #include <inet/nd.h>
58 #include <inet/ip.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/ipsec_impl.h>
65 #include <inet/ipsec_info.h>
66 #include <inet/sctp_ip.h>
67 
68 /*
69  * Function names with nce_ prefix are static while function
70  * names with ndp_ prefix are used by rest of the IP.
71  *
72  * Lock ordering:
73  *
74  *	ndp_g_lock -> ill_lock -> nce_lock
75  *
76  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
77  * nce_next.  Nce_lock protects the contents of the NCE (particularly
78  * nce_refcnt).
79  */
80 
81 static	boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
82     uint32_t ll_addr_len);
83 static	void	nce_fastpath(nce_t *nce);
84 static	void	nce_ire_delete(nce_t *nce);
85 static	void	nce_ire_delete1(ire_t *ire, char *nce_arg);
86 static	void 	nce_set_ll(nce_t *nce, uchar_t *ll_addr);
87 static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *);
88 static	nce_t	*nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr);
89 static	void	nce_make_mapping(nce_t *nce, uchar_t *addrpos,
90     uchar_t *addr);
91 static	int	nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
92 static	void	nce_queue_mp(nce_t *nce, mblk_t *mp);
93 static	void	nce_report1(nce_t *nce, uchar_t *mp_arg);
94 static	mblk_t	*nce_udreq_alloc(ill_t *ill);
95 static	void	nce_update(nce_t *nce, uint16_t new_state,
96     uchar_t *new_ll_addr);
97 static	uint32_t	nce_solicit(nce_t *nce, mblk_t *mp);
98 static	boolean_t	nce_xmit(ill_t *ill, uint32_t operation,
99     ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender,
100     const in6_addr_t *target, int flag);
101 extern void	th_trace_rrecord(th_trace_t *);
102 static	int	ndp_lookup_then_add_v6(ill_t *, uchar_t *,
103     const in6_addr_t *, const in6_addr_t *, const in6_addr_t *,
104     uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
105 static	int	ndp_lookup_then_add_v4(ill_t *, uchar_t *,
106     const in_addr_t *, const in_addr_t *, const in_addr_t *,
107     uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *);
108 static	int	ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
109     const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
110     nce_t **);
111 static	int	ndp_add_v4(ill_t *, uchar_t *, const in_addr_t *,
112     const in_addr_t *, const in_addr_t *, uint32_t, uint16_t, uint16_t,
113     nce_t **, mblk_t *, mblk_t *);
114 
115 
116 #ifdef NCE_DEBUG
117 void	nce_trace_inactive(nce_t *);
118 #endif
119 
120 ndp_g_t ndp4, ndp6;
121 
122 #define	NCE_HASH_PTR_V4(addr) \
123 	(&(ndp4.nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))
124 
125 #define	NCE_HASH_PTR_V6(addr) \
126 	(&(ndp6.nce_hash_tbl[NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)]))
127 
128 int
129 ndp_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
130     const void *mask, const void *extract_mask,
131     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
132     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
133 {
134 	int status;
135 
136 	if (ill->ill_isv6)
137 		status = ndp_add_v6(ill, hw_addr, (in6_addr_t *)addr,
138 		    (in6_addr_t *)mask, (in6_addr_t *)extract_mask,
139 		    hw_extract_start, flags, state, newnce);
140 	else
141 		status = ndp_add_v4(ill, hw_addr, (in_addr_t *)addr,
142 		    (in_addr_t *)mask, (in_addr_t *)extract_mask,
143 		    hw_extract_start, flags, state, newnce, fp_mp, res_mp);
144 	return (status);
145 }
146 
/*
 * Non-tunable probe interval, chosen from the link's capabilities:
 * 150 when ill_note_link is set, 1500 otherwise.  The value is handed
 * to NDP_RESTART_TIMER (presumably milliseconds -- TODO confirm).
 */
#define	ILL_PROBE_INTERVAL(ill)	(!(ill)->ill_note_link ? 1500 : 150)
149 
150 /*
151  * NDP Cache Entry creation routine.
152  * Mapped entries will never do NUD .
153  * This routine must always be called with ndp6.ndp_g_lock held.
154  * Prior to return, nce_refcnt is incremented.
155  */
156 static int
157 ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
158     const in6_addr_t *mask, const in6_addr_t *extract_mask,
159     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
160     nce_t **newnce)
161 {
162 	static	nce_t		nce_nil;
163 	nce_t		*nce;
164 	mblk_t		*mp;
165 	mblk_t		*template;
166 	nce_t		**ncep;
167 	int		err;
168 	boolean_t	dropped = B_FALSE;
169 
170 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
171 	ASSERT(ill != NULL && ill->ill_isv6);
172 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
173 		ip0dbg(("ndp_add: no addr\n"));
174 		return (EINVAL);
175 	}
176 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
177 		ip0dbg(("ndp_add: flags = %x\n", (int)flags));
178 		return (EINVAL);
179 	}
180 	if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
181 	    (flags & NCE_F_MAPPING)) {
182 		ip0dbg(("ndp_add: extract mask zero for mapping"));
183 		return (EINVAL);
184 	}
185 	/*
186 	 * Allocate the mblk to hold the nce.
187 	 *
188 	 * XXX This can come out of a separate cache - nce_cache.
189 	 * We don't need the mp anymore as there are no more
190 	 * "qwriter"s
191 	 */
192 	mp = allocb(sizeof (nce_t), BPRI_MED);
193 	if (mp == NULL)
194 		return (ENOMEM);
195 
196 	nce = (nce_t *)mp->b_rptr;
197 	mp->b_wptr = (uchar_t *)&nce[1];
198 	*nce = nce_nil;
199 
200 	/*
201 	 * This one holds link layer address
202 	 */
203 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
204 		template = nce_udreq_alloc(ill);
205 	} else {
206 		ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
207 		ASSERT((ill->ill_resolver_mp != NULL));
208 		template = copyb(ill->ill_resolver_mp);
209 	}
210 	if (template == NULL) {
211 		freeb(mp);
212 		return (ENOMEM);
213 	}
214 	nce->nce_ill = ill;
215 	nce->nce_ipversion = IPV6_VERSION;
216 	nce->nce_flags = flags;
217 	nce->nce_state = state;
218 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
219 	nce->nce_rcnt = ill->ill_xmit_count;
220 	nce->nce_addr = *addr;
221 	nce->nce_mask = *mask;
222 	nce->nce_extract_mask = *extract_mask;
223 	nce->nce_ll_extract_start = hw_extract_start;
224 	nce->nce_fp_mp = NULL;
225 	nce->nce_res_mp = template;
226 	if (state == ND_REACHABLE)
227 		nce->nce_last = TICK_TO_MSEC(lbolt64);
228 	else
229 		nce->nce_last = 0;
230 	nce->nce_qd_mp = NULL;
231 	nce->nce_mp = mp;
232 	if (hw_addr != NULL)
233 		nce_set_ll(nce, hw_addr);
234 	/* This one is for nce getting created */
235 	nce->nce_refcnt = 1;
236 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
237 	if (nce->nce_flags & NCE_F_MAPPING) {
238 		ASSERT(IN6_IS_ADDR_MULTICAST(addr));
239 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
240 		ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
241 		ncep = &ndp6.nce_mask_entries;
242 	} else {
243 		ncep = ((nce_t **)NCE_HASH_PTR_V6(*addr));
244 	}
245 
246 #ifdef NCE_DEBUG
247 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
248 #endif
249 	/*
250 	 * Atomically ensure that the ill is not CONDEMNED, before
251 	 * adding the NCE.
252 	 */
253 	mutex_enter(&ill->ill_lock);
254 	if (ill->ill_state_flags & ILL_CONDEMNED) {
255 		mutex_exit(&ill->ill_lock);
256 		freeb(mp);
257 		freeb(template);
258 		return (EINVAL);
259 	}
260 	if ((nce->nce_next = *ncep) != NULL)
261 		nce->nce_next->nce_ptpn = &nce->nce_next;
262 	*ncep = nce;
263 	nce->nce_ptpn = ncep;
264 	*newnce = nce;
265 	/* This one is for nce being used by an active thread */
266 	NCE_REFHOLD(*newnce);
267 
268 	/* Bump up the number of nce's referencing this ill */
269 	ill->ill_nce_cnt++;
270 	mutex_exit(&ill->ill_lock);
271 
272 	err = 0;
273 	if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
274 		mutex_enter(&nce->nce_lock);
275 		mutex_exit(&ndp6.ndp_g_lock);
276 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
277 		mutex_exit(&nce->nce_lock);
278 		dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
279 		    &ipv6_all_zeros, addr, NDP_PROBE);
280 		if (dropped) {
281 			mutex_enter(&nce->nce_lock);
282 			nce->nce_pcnt++;
283 			mutex_exit(&nce->nce_lock);
284 		}
285 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
286 		mutex_enter(&ndp6.ndp_g_lock);
287 		err = EINPROGRESS;
288 	} else if (flags & NCE_F_UNSOL_ADV) {
289 		/*
290 		 * We account for the transmit below by assigning one
291 		 * less than the ndd variable. Subsequent decrements
292 		 * are done in ndp_timer.
293 		 */
294 		mutex_enter(&nce->nce_lock);
295 		mutex_exit(&ndp6.ndp_g_lock);
296 		nce->nce_unsolicit_count = ip_ndp_unsolicit_count - 1;
297 		mutex_exit(&nce->nce_lock);
298 		dropped = nce_xmit(ill,
299 		    ND_NEIGHBOR_ADVERT,
300 		    ill,	/* ill to be used for extracting ill_nd_lla */
301 		    B_TRUE,	/* use ill_nd_lla */
302 		    addr,	/* Source and target of the advertisement pkt */
303 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
304 		    NDP_ORIDE);
305 		mutex_enter(&nce->nce_lock);
306 		if (dropped)
307 			nce->nce_unsolicit_count++;
308 		if (nce->nce_unsolicit_count != 0) {
309 			nce->nce_timeout_id = timeout(ndp_timer, nce,
310 			    MSEC_TO_TICK(ip_ndp_unsolicit_interval));
311 		}
312 		mutex_exit(&nce->nce_lock);
313 		mutex_enter(&ndp6.ndp_g_lock);
314 	}
315 	/*
316 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
317 	 * we call nce_fastpath as soon as the nce is resolved in ndp_process.
318 	 * We call nce_fastpath from nce_update if the link layer address of
319 	 * the peer changes from nce_update
320 	 */
321 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
322 		nce_fastpath(nce);
323 	return (err);
324 }
325 
326 int
327 ndp_lookup_then_add(ill_t *ill, uchar_t *hw_addr, const void *addr,
328     const void *mask, const void *extract_mask,
329     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
330     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
331 {
332 	int status;
333 
334 	if (ill->ill_isv6) {
335 		status = ndp_lookup_then_add_v6(ill, hw_addr,
336 		    (in6_addr_t *)addr, (in6_addr_t *)mask,
337 		    (in6_addr_t *)extract_mask, hw_extract_start, flags,
338 		    state, newnce, fp_mp, res_mp);
339 	} else  {
340 		status = ndp_lookup_then_add_v4(ill, hw_addr,
341 		    (in_addr_t *)addr, (in_addr_t *)mask,
342 		    (in_addr_t *)extract_mask, hw_extract_start, flags,
343 		    state, newnce, fp_mp, res_mp);
344 	}
345 
346 	return (status);
347 }
348 
349 static int
350 ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
351     const in6_addr_t *mask, const in6_addr_t *extract_mask,
352     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
353     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
354 {
355 	int	err = 0;
356 	nce_t	*nce;
357 
358 	ASSERT(ill != NULL && ill->ill_isv6);
359 	mutex_enter(&ndp6.ndp_g_lock);
360 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
361 	nce = nce_lookup_addr(ill, addr, nce);
362 	if (nce == NULL) {
363 		err = ndp_add(ill,
364 		    hw_addr,
365 		    addr,
366 		    mask,
367 		    extract_mask,
368 		    hw_extract_start,
369 		    flags,
370 		    state,
371 		    newnce,
372 		    fp_mp,
373 		    res_mp);
374 	} else {
375 		*newnce = nce;
376 		err = EEXIST;
377 	}
378 	mutex_exit(&ndp6.ndp_g_lock);
379 	return (err);
380 }
381 
382 /*
383  * Remove all the CONDEMNED nces from the appropriate hash table.
384  * We create a private list of NCEs, these may have ires pointing
385  * to them, so the list will be passed through to clean up dependent
386  * ires and only then we can do NCE_REFRELE which can make NCE inactive.
387  */
388 static void
389 nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
390 {
391 	nce_t *nce1;
392 	nce_t **ptpn;
393 
394 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
395 	ASSERT(ndp->ndp_g_walker == 0);
396 	for (; nce; nce = nce1) {
397 		nce1 = nce->nce_next;
398 		mutex_enter(&nce->nce_lock);
399 		if (nce->nce_flags & NCE_F_CONDEMNED) {
400 			ptpn = nce->nce_ptpn;
401 			nce1 = nce->nce_next;
402 			if (nce1 != NULL)
403 				nce1->nce_ptpn = ptpn;
404 			*ptpn = nce1;
405 			nce->nce_ptpn = NULL;
406 			nce->nce_next = NULL;
407 			nce->nce_next = *free_nce_list;
408 			*free_nce_list = nce;
409 		}
410 		mutex_exit(&nce->nce_lock);
411 	}
412 }
413 
414 /*
415  * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
416  *    will return this NCE. Also no new IREs will be created that
417  *    point to this NCE (See ire_add_v6).  Also no new timeouts will
418  *    be started (See NDP_RESTART_TIMER).
419  * 2. Cancel any currently running timeouts.
420  * 3. If there is an ndp walker, return. The walker will do the cleanup.
421  *    This ensures that walkers see a consistent list of NCEs while walking.
422  * 4. Otherwise remove the NCE from the list of NCEs
423  * 5. Delete all IREs pointing to this NCE.
424  */
425 void
426 ndp_delete(nce_t *nce)
427 {
428 	nce_t	**ptpn;
429 	nce_t	*nce1;
430 	int	ipversion = nce->nce_ipversion;
431 	ndp_g_t *ndp = (ipversion == IPV4_VERSION ? &ndp4 : &ndp6);
432 
433 	/* Serialize deletes */
434 	mutex_enter(&nce->nce_lock);
435 	if (nce->nce_flags & NCE_F_CONDEMNED) {
436 		/* Some other thread is doing the delete */
437 		mutex_exit(&nce->nce_lock);
438 		return;
439 	}
440 	/*
441 	 * Caller has a refhold. Also 1 ref for being in the list. Thus
442 	 * refcnt has to be >= 2
443 	 */
444 	ASSERT(nce->nce_refcnt >= 2);
445 	nce->nce_flags |= NCE_F_CONDEMNED;
446 	mutex_exit(&nce->nce_lock);
447 
448 	nce_fastpath_list_delete(nce);
449 
450 	/*
451 	 * Cancel any running timer. Timeout can't be restarted
452 	 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
453 	 * Passing invalid timeout id is fine.
454 	 */
455 	if (nce->nce_timeout_id != 0) {
456 		(void) untimeout(nce->nce_timeout_id);
457 		nce->nce_timeout_id = 0;
458 	}
459 
460 	mutex_enter(&ndp->ndp_g_lock);
461 	if (nce->nce_ptpn == NULL) {
462 		/*
463 		 * The last ndp walker has already removed this nce from
464 		 * the list after we marked the nce CONDEMNED and before
465 		 * we grabbed the global lock.
466 		 */
467 		mutex_exit(&ndp->ndp_g_lock);
468 		return;
469 	}
470 	if (ndp->ndp_g_walker > 0) {
471 		/*
472 		 * Can't unlink. The walker will clean up
473 		 */
474 		ndp->ndp_g_walker_cleanup = B_TRUE;
475 		mutex_exit(&ndp->ndp_g_lock);
476 		return;
477 	}
478 
479 	/*
480 	 * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
481 	 * the timer since it is marked CONDEMNED.
482 	 */
483 	ptpn = nce->nce_ptpn;
484 	nce1 = nce->nce_next;
485 	if (nce1 != NULL)
486 		nce1->nce_ptpn = ptpn;
487 	*ptpn = nce1;
488 	nce->nce_ptpn = NULL;
489 	nce->nce_next = NULL;
490 	mutex_exit(&ndp->ndp_g_lock);
491 
492 	nce_ire_delete(nce);
493 }
494 
495 void
496 ndp_inactive(nce_t *nce)
497 {
498 	mblk_t		**mpp;
499 	ill_t		*ill;
500 
501 	ASSERT(nce->nce_refcnt == 0);
502 	ASSERT(MUTEX_HELD(&nce->nce_lock));
503 	ASSERT(nce->nce_fastpath == NULL);
504 
505 	/* Free all nce allocated messages */
506 	mpp = &nce->nce_first_mp_to_free;
507 	do {
508 		while (*mpp != NULL) {
509 			mblk_t  *mp;
510 
511 			mp = *mpp;
512 			*mpp = mp->b_next;
513 			mp->b_next = NULL;
514 			mp->b_prev = NULL;
515 			freemsg(mp);
516 		}
517 	} while (mpp++ != &nce->nce_last_mp_to_free);
518 
519 #ifdef NCE_DEBUG
520 	nce_trace_inactive(nce);
521 #endif
522 
523 	ill = nce->nce_ill;
524 	mutex_enter(&ill->ill_lock);
525 	ill->ill_nce_cnt--;
526 	/*
527 	 * If the number of nce's associated with this ill have dropped
528 	 * to zero, check whether we need to restart any operation that
529 	 * is waiting for this to happen.
530 	 */
531 	if (ill->ill_nce_cnt == 0) {
532 		/* ipif_ill_refrele_tail drops the ill_lock */
533 		ipif_ill_refrele_tail(ill);
534 	} else {
535 		mutex_exit(&ill->ill_lock);
536 	}
537 	mutex_destroy(&nce->nce_lock);
538 	freeb(nce->nce_mp);
539 }
540 
541 /*
542  * ndp_walk routine.  Delete the nce if it is associated with the ill
543  * that is going away.  Always called as a writer.
544  */
545 void
546 ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
547 {
548 	if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
549 		ndp_delete(nce);
550 	}
551 }
552 
553 /*
554  * Walk a list of to be inactive NCEs and blow away all the ires.
555  */
556 static void
557 nce_ire_delete_list(nce_t *nce)
558 {
559 	nce_t *nce_next;
560 
561 	ASSERT(nce != NULL);
562 	while (nce != NULL) {
563 		nce_next = nce->nce_next;
564 		nce->nce_next = NULL;
565 
566 		/*
567 		 * It is possible for the last ndp walker (this thread)
568 		 * to come here after ndp_delete has marked the nce CONDEMNED
569 		 * and before it has removed the nce from the fastpath list
570 		 * or called untimeout. So we need to do it here. It is safe
571 		 * for both ndp_delete and this thread to do it twice or
572 		 * even simultaneously since each of the threads has a
573 		 * reference on the nce.
574 		 */
575 		nce_fastpath_list_delete(nce);
576 		/*
577 		 * Cancel any running timer. Timeout can't be restarted
578 		 * since CONDEMNED is set. Can't hold nce_lock across untimeout.
579 		 * Passing invalid timeout id is fine.
580 		 */
581 		if (nce->nce_timeout_id != 0) {
582 			(void) untimeout(nce->nce_timeout_id);
583 			nce->nce_timeout_id = 0;
584 		}
585 		/*
586 		 * We might hit this func thus in the v4 case:
587 		 * ipif_down->ipif_ndp_down->ndp_walk
588 		 */
589 
590 		if (nce->nce_ipversion == IPV4_VERSION) {
591 			ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
592 			    IRE_CACHE, nce_ire_delete1,
593 			    (char *)nce, nce->nce_ill);
594 		} else {
595 			ASSERT(nce->nce_ipversion == IPV6_VERSION);
596 			ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
597 			    IRE_CACHE, nce_ire_delete1,
598 			    (char *)nce, nce->nce_ill);
599 		}
600 		NCE_REFRELE_NOTR(nce);
601 		nce = nce_next;
602 	}
603 }
604 
605 /*
606  * Delete an ire when the nce goes away.
607  */
608 /* ARGSUSED */
609 static void
610 nce_ire_delete(nce_t *nce)
611 {
612 	if (nce->nce_ipversion == IPV6_VERSION) {
613 		ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
614 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
615 		NCE_REFRELE_NOTR(nce);
616 	} else {
617 		ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
618 		    nce_ire_delete1, (char *)nce, nce->nce_ill);
619 		NCE_REFRELE_NOTR(nce);
620 	}
621 }
622 
623 /*
624  * ire_walk routine used to delete every IRE that shares this nce
625  */
626 static void
627 nce_ire_delete1(ire_t *ire, char *nce_arg)
628 {
629 	nce_t	*nce = (nce_t *)nce_arg;
630 
631 	ASSERT(ire->ire_type == IRE_CACHE);
632 
633 	if (ire->ire_nce == nce) {
634 		ASSERT(ire->ire_ipversion == nce->nce_ipversion);
635 		ire_delete(ire);
636 	}
637 }
638 
639 /*
640  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
641  */
642 boolean_t
643 ndp_restart_dad(nce_t *nce)
644 {
645 	boolean_t started;
646 	boolean_t dropped;
647 
648 	if (nce == NULL)
649 		return (B_FALSE);
650 	mutex_enter(&nce->nce_lock);
651 	if (nce->nce_state == ND_PROBE) {
652 		mutex_exit(&nce->nce_lock);
653 		started = B_TRUE;
654 	} else if (nce->nce_state == ND_REACHABLE) {
655 		nce->nce_state = ND_PROBE;
656 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
657 		mutex_exit(&nce->nce_lock);
658 		dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL,
659 		    B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE);
660 		if (dropped) {
661 			mutex_enter(&nce->nce_lock);
662 			nce->nce_pcnt++;
663 			mutex_exit(&nce->nce_lock);
664 		}
665 		NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
666 		started = B_TRUE;
667 	} else {
668 		mutex_exit(&nce->nce_lock);
669 		started = B_FALSE;
670 	}
671 	return (started);
672 }
673 
674 /*
675  * IPv6 Cache entry lookup.  Try to find an nce matching the parameters passed.
676  * If one is found, the refcnt on the nce will be incremented.
677  */
678 nce_t *
679 ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock)
680 {
681 	nce_t	*nce;
682 
683 	ASSERT(ill != NULL && ill->ill_isv6);
684 	if (!caller_holds_lock) {
685 		mutex_enter(&ndp6.ndp_g_lock);
686 	}
687 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr)); /* head of v6 hash table */
688 	nce = nce_lookup_addr(ill, addr, nce);
689 	if (nce == NULL)
690 		nce = nce_lookup_mapping(ill, addr);
691 	if (!caller_holds_lock)
692 		mutex_exit(&ndp6.ndp_g_lock);
693 	return (nce);
694 }
695 /*
696  * IPv4 Cache entry lookup.  Try to find an nce matching the parameters passed.
697  * If one is found, the refcnt on the nce will be incremented.
698  * Since multicast mappings are handled in arp, there are no nce_mcast_entries
699  * so we skip the nce_lookup_mapping call.
700  * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
701  */
702 nce_t *
703 ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
704 {
705 	nce_t	*nce;
706 	in6_addr_t addr6;
707 
708 	if (!caller_holds_lock) {
709 		mutex_enter(&ndp4.ndp_g_lock);
710 	}
711 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr)); /* head of v6 hash table */
712 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
713 	nce = nce_lookup_addr(ill, &addr6, nce);
714 	if (!caller_holds_lock)
715 		mutex_exit(&ndp4.ndp_g_lock);
716 	return (nce);
717 }
718 
719 /*
720  * Cache entry lookup.  Try to find an nce matching the parameters passed.
721  * Look only for exact entries (no mappings).  If an nce is found, increment
722  * the hold count on that nce. The caller passes in the start of the
723  * appropriate hash table, and must be holding the appropriate global
724  * lock (ndp_g_lock).
725  */
726 static nce_t *
727 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce)
728 {
729 	ndp_g_t *ndp = (ill->ill_isv6 ? &ndp6 : &ndp4);
730 
731 	ASSERT(ill != NULL);
732 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
733 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
734 		return (NULL);
735 	for (; nce != NULL; nce = nce->nce_next) {
736 		if (nce->nce_ill == ill) {
737 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
738 			    IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
739 			    &ipv6_all_ones)) {
740 				mutex_enter(&nce->nce_lock);
741 				if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
742 					NCE_REFHOLD_LOCKED(nce);
743 					mutex_exit(&nce->nce_lock);
744 					break;
745 				}
746 				mutex_exit(&nce->nce_lock);
747 			}
748 		}
749 	}
750 	return (nce);
751 }
752 
753 /*
754  * Cache entry lookup.  Try to find an nce matching the parameters passed.
755  * Look only for mappings.
756  */
757 static nce_t *
758 nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
759 {
760 	nce_t	*nce;
761 
762 	ASSERT(ill != NULL && ill->ill_isv6);
763 	ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock));
764 	if (!IN6_IS_ADDR_MULTICAST(addr))
765 		return (NULL);
766 	nce = ndp6.nce_mask_entries;
767 	for (; nce != NULL; nce = nce->nce_next)
768 		if (nce->nce_ill == ill &&
769 		    (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
770 			mutex_enter(&nce->nce_lock);
771 			if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
772 				NCE_REFHOLD_LOCKED(nce);
773 				mutex_exit(&nce->nce_lock);
774 				break;
775 			}
776 			mutex_exit(&nce->nce_lock);
777 		}
778 	return (nce);
779 }
780 
781 /*
782  * Process passed in parameters either from an incoming packet or via
783  * user ioctl.
784  */
785 void
786 ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
787 {
788 	ill_t	*ill = nce->nce_ill;
789 	uint32_t hw_addr_len = ill->ill_nd_lla_len;
790 	mblk_t	*mp;
791 	boolean_t ll_updated = B_FALSE;
792 	boolean_t ll_changed;
793 
794 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
795 	/*
796 	 * No updates of link layer address or the neighbor state is
797 	 * allowed, when the cache is in NONUD state.  This still
798 	 * allows for responding to reachability solicitation.
799 	 */
800 	mutex_enter(&nce->nce_lock);
801 	if (nce->nce_state == ND_INCOMPLETE) {
802 		if (hw_addr == NULL) {
803 			mutex_exit(&nce->nce_lock);
804 			return;
805 		}
806 		nce_set_ll(nce, hw_addr);
807 		/*
808 		 * Update nce state and send the queued packets
809 		 * back to ip this time ire will be added.
810 		 */
811 		if (flag & ND_NA_FLAG_SOLICITED) {
812 			nce_update(nce, ND_REACHABLE, NULL);
813 		} else {
814 			nce_update(nce, ND_STALE, NULL);
815 		}
816 		mutex_exit(&nce->nce_lock);
817 		nce_fastpath(nce);
818 		mutex_enter(&nce->nce_lock);
819 		mp = nce->nce_qd_mp;
820 		nce->nce_qd_mp = NULL;
821 		mutex_exit(&nce->nce_lock);
822 		while (mp != NULL) {
823 			mblk_t *nxt_mp;
824 
825 			nxt_mp = mp->b_next;
826 			mp->b_next = NULL;
827 			if (mp->b_prev != NULL) {
828 				ill_t   *inbound_ill;
829 				queue_t *fwdq = NULL;
830 				uint_t ifindex;
831 
832 				ifindex = (uint_t)(uintptr_t)mp->b_prev;
833 				inbound_ill = ill_lookup_on_ifindex(ifindex,
834 				    B_TRUE, NULL, NULL, NULL, NULL);
835 				if (inbound_ill == NULL) {
836 					mp->b_prev = NULL;
837 					freemsg(mp);
838 					return;
839 				} else {
840 					fwdq = inbound_ill->ill_rq;
841 				}
842 				mp->b_prev = NULL;
843 				/*
844 				 * Send a forwarded packet back into ip_rput_v6
845 				 * just as in ire_send_v6().
846 				 * Extract the queue from b_prev (set in
847 				 * ip_rput_data_v6).
848 				 */
849 				if (fwdq != NULL) {
850 					/*
851 					 * Forwarded packets hop count will
852 					 * get decremented in ip_rput_data_v6
853 					 */
854 					put(fwdq, mp);
855 				} else {
856 					/*
857 					 * Send locally originated packets back
858 					 * into * ip_wput_v6.
859 					 */
860 					put(ill->ill_wq, mp);
861 				}
862 				ill_refrele(inbound_ill);
863 			} else {
864 				put(ill->ill_wq, mp);
865 			}
866 			mp = nxt_mp;
867 		}
868 		return;
869 	}
870 	ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
871 	if (!is_adv) {
872 		/* If this is a SOLICITATION request only */
873 		if (ll_changed)
874 			nce_update(nce, ND_STALE, hw_addr);
875 		mutex_exit(&nce->nce_lock);
876 		return;
877 	}
878 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
879 		/* If in any other state than REACHABLE, ignore */
880 		if (nce->nce_state == ND_REACHABLE) {
881 			nce_update(nce, ND_STALE, NULL);
882 		}
883 		mutex_exit(&nce->nce_lock);
884 		return;
885 	} else {
886 		if (ll_changed) {
887 			nce_update(nce, ND_UNCHANGED, hw_addr);
888 			ll_updated = B_TRUE;
889 		}
890 		if (flag & ND_NA_FLAG_SOLICITED) {
891 			nce_update(nce, ND_REACHABLE, NULL);
892 		} else {
893 			if (ll_updated) {
894 				nce_update(nce, ND_STALE, NULL);
895 			}
896 		}
897 		mutex_exit(&nce->nce_lock);
898 		if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
899 		    NCE_F_ISROUTER)) {
900 			ire_t *ire;
901 
902 			/*
903 			 * Router turned to host.  We need to remove the
904 			 * entry as well as any default route that may be
905 			 * using this as a next hop.  This is required by
906 			 * section 7.2.5 of RFC 2461.
907 			 */
908 			ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
909 			    &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
910 			    nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
911 			    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
912 			    MATCH_IRE_DEFAULT);
913 			if (ire != NULL) {
914 				ip_rts_rtmsg(RTM_DELETE, ire, 0);
915 				ire_delete(ire);
916 				ire_refrele(ire);
917 			}
918 			ndp_delete(nce);
919 		}
920 	}
921 }
922 
923 /*
924  * Pass arg1 to the pfi supplied, along with each nce in existence.
925  * ndp_walk() places a REFHOLD on the nce and drops the lock when
926  * walking the hash list.
927  */
928 void
929 ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
930     boolean_t trace)
931 {
932 
933 	nce_t	*nce;
934 	nce_t	*nce1;
935 	nce_t	**ncep;
936 	nce_t	*free_nce_list = NULL;
937 
938 	mutex_enter(&ndp->ndp_g_lock);
939 	/* Prevent ndp_delete from unlink and free of NCE */
940 	ndp->ndp_g_walker++;
941 	mutex_exit(&ndp->ndp_g_lock);
942 	for (ncep = ndp->nce_hash_tbl;
943 	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
944 		for (nce = *ncep; nce != NULL; nce = nce1) {
945 			nce1 = nce->nce_next;
946 			if (ill == NULL || nce->nce_ill == ill) {
947 				if (trace) {
948 					NCE_REFHOLD(nce);
949 					(*pfi)(nce, arg1);
950 					NCE_REFRELE(nce);
951 				} else {
952 					NCE_REFHOLD_NOTR(nce);
953 					(*pfi)(nce, arg1);
954 					NCE_REFRELE_NOTR(nce);
955 				}
956 			}
957 		}
958 	}
959 	for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
960 		nce1 = nce->nce_next;
961 		if (ill == NULL || nce->nce_ill == ill) {
962 			if (trace) {
963 				NCE_REFHOLD(nce);
964 				(*pfi)(nce, arg1);
965 				NCE_REFRELE(nce);
966 			} else {
967 				NCE_REFHOLD_NOTR(nce);
968 				(*pfi)(nce, arg1);
969 				NCE_REFRELE_NOTR(nce);
970 			}
971 		}
972 	}
973 	mutex_enter(&ndp->ndp_g_lock);
974 	ndp->ndp_g_walker--;
975 	/*
976 	 * While NCE's are removed from global list they are placed
977 	 * in a private list, to be passed to nce_ire_delete_list().
978 	 * The reason is, there may be ires pointing to this nce
979 	 * which needs to cleaned up.
980 	 */
981 	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
982 		/* Time to delete condemned entries */
983 		for (ncep = ndp->nce_hash_tbl;
984 		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
985 			nce = *ncep;
986 			if (nce != NULL) {
987 				nce_remove(ndp, nce, &free_nce_list);
988 			}
989 		}
990 		nce = ndp->nce_mask_entries;
991 		if (nce != NULL) {
992 			nce_remove(ndp, nce, &free_nce_list);
993 		}
994 		ndp->ndp_g_walker_cleanup = B_FALSE;
995 	}
996 	mutex_exit(&ndp->ndp_g_lock);
997 
998 	if (free_nce_list != NULL) {
999 		nce_ire_delete_list(free_nce_list);
1000 	}
1001 }
1002 
1003 void
1004 ndp_walk(ill_t *ill, pfi_t pfi, void *arg1)
1005 {
1006 	ndp_walk_common(&ndp4, ill, pfi, arg1, B_TRUE);
1007 	ndp_walk_common(&ndp6, ill, pfi, arg1, B_TRUE);
1008 }
1009 
1010 /*
1011  * Prepend the zoneid using an ipsec_out_t for later use by functions like
1012  * ip_rput_v6() after neighbor discovery has taken place.  If the message
1013  * block already has a M_CTL at the front of it, then simply set the zoneid
1014  * appropriately.
1015  */
1016 static mblk_t *
1017 ndp_prepend_zone(mblk_t *mp, zoneid_t zoneid)
1018 {
1019 	mblk_t		*first_mp;
1020 	ipsec_out_t	*io;
1021 
1022 	ASSERT(zoneid != ALL_ZONES);
1023 	if (mp->b_datap->db_type == M_CTL) {
1024 		io = (ipsec_out_t *)mp->b_rptr;
1025 		ASSERT(io->ipsec_out_type == IPSEC_OUT);
1026 		io->ipsec_out_zoneid = zoneid;
1027 		return (mp);
1028 	}
1029 
1030 	first_mp = ipsec_alloc_ipsec_out();
1031 	if (first_mp == NULL)
1032 		return (NULL);
1033 	io = (ipsec_out_t *)first_mp->b_rptr;
1034 	/* This is not a secure packet */
1035 	io->ipsec_out_secure = B_FALSE;
1036 	io->ipsec_out_zoneid = zoneid;
1037 	first_mp->b_cont = mp;
1038 	return (first_mp);
1039 }
1040 
/*
 * Process resolve requests.  Handles both mapped entries
 * as well as cases that need to be sent out on the wire.
 * Lookup a NCE for a given IRE.  Regardless of whether one exists
 * or one is created, we defer making ire point to nce until the
 * ire is actually added at which point the nce_refcnt on the nce is
 * incremented.  This is done primarily to have symmetry between ire_add()
 * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
 *
 * Return values produced below:
 *	(multicast dst)	whatever nce_set_multicast() returns
 *	0		a new entry was created on an XRESOLV ill, or the
 *			(new or existing) entry is no longer ND_INCOMPLETE
 *	EINPROGRESS	a solicitation was started for a new entry, or mp
 *			was queued on an existing ND_INCOMPLETE entry
 *	ENOMEM		ndp_prepend_zone() failed
 *	EBUSY		nce_solicit() returned 0 for a new entry
 *	other		error from ndp_lookup_then_add(), passed through
 *
 * On the ENOMEM and EBUSY paths for a new entry the entry is deleted again
 * and mp is left for the caller to free.
 */
int
ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
{
	nce_t		*nce;
	int		err = 0;
	uint32_t	ms;
	mblk_t		*mp_nce = NULL;

	ASSERT(ill != NULL);
	ASSERT(ill->ill_isv6);
	/* Multicast destinations are mapped, not resolved on the wire. */
	if (IN6_IS_ADDR_MULTICAST(dst)) {
		err = nce_set_multicast(ill, dst);
		return (err);
	}
	err = ndp_lookup_then_add(ill,
	    NULL,	/* No hardware address */
	    dst,
	    &ipv6_all_ones,
	    &ipv6_all_zeros,
	    0,
	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
	    ND_INCOMPLETE,
	    &nce,
	    NULL, /* let ndp_add figure out fastpath mp and dlureq_mp for v6 */
	    NULL);

	switch (err) {
	case 0:
		/*
		 * New cache entry was created.  Check whether the state
		 * is still ND_INCOMPLETE.  It can be in some other state
		 * even before we send out the solicitation as we could
		 * get un-solicited advertisements.
		 *
		 * If this is an XRESOLV interface, simply return 0,
		 * since we don't want to solicit just yet.
		 */
		if (ill->ill_flags & ILLF_XRESOLV) {
			NCE_REFRELE(nce);
			return (0);
		}
		/* nce_solicit() asserts both of these locks are held. */
		rw_enter(&ill_g_lock, RW_READER);
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state != ND_INCOMPLETE) {
			/* Already moved on; nothing to solicit. */
			mutex_exit(&nce->nce_lock);
			rw_exit(&ill_g_lock);
			NCE_REFRELE(nce);
			return (0);
		}
		mp_nce = ndp_prepend_zone(mp, zoneid);
		if (mp_nce == NULL) {
			/* The caller will free mp */
			mutex_exit(&nce->nce_lock);
			rw_exit(&ill_g_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (ENOMEM);
		}
		ms = nce_solicit(nce, mp_nce);
		rw_exit(&ill_g_lock);
		if (ms == 0) {
			/*
			 * The caller will free mp.  Only the block that
			 * ndp_prepend_zone() put in front (if any) is
			 * released here.
			 */
			if (mp_nce != mp)
				freeb(mp_nce);
			mutex_exit(&nce->nce_lock);
			ndp_delete(nce);
			NCE_REFRELE(nce);
			return (EBUSY);
		}
		mutex_exit(&nce->nce_lock);
		/* ms is the interval returned by nce_solicit(). */
		NDP_RESTART_TIMER(nce, (clock_t)ms);
		NCE_REFRELE(nce);
		return (EINPROGRESS);
	case EEXIST:
		/* Resolution in progress just queue the packet */
		mutex_enter(&nce->nce_lock);
		if (nce->nce_state == ND_INCOMPLETE) {
			mp_nce = ndp_prepend_zone(mp, zoneid);
			if (mp_nce == NULL) {
				err = ENOMEM;
			} else {
				nce_queue_mp(nce, mp_nce);
				err = EINPROGRESS;
			}
		} else {
			/*
			 * Any other state implies we have
			 * a nce but IRE needs to be added ...
			 * ire_add_v6() will take care of the
			 * the case when the nce becomes CONDEMNED
			 * before the ire is added to the table.
			 */
			err = 0;
		}
		mutex_exit(&nce->nce_lock);
		NCE_REFRELE(nce);
		break;
	default:
		/* No nce reference to drop on this path. */
		ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
		break;
	}
	return (err);
}
1153 
1154 /*
1155  * When there is no resolver, the link layer template is passed in
1156  * the IRE.
1157  * Lookup a NCE for a given IRE.  Regardless of whether one exists
1158  * or one is created, we defer making ire point to nce until the
1159  * ire is actually added at which point the nce_refcnt on the nce is
1160  * incremented.  This is done primarily to have symmetry between ire_add()
1161  * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
1162  */
1163 int
1164 ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
1165 {
1166 	nce_t		*nce;
1167 	int		err = 0;
1168 
1169 	ASSERT(ill != NULL);
1170 	ASSERT(ill->ill_isv6);
1171 	if (IN6_IS_ADDR_MULTICAST(dst)) {
1172 		err = nce_set_multicast(ill, dst);
1173 		return (err);
1174 	}
1175 
1176 	err = ndp_lookup_then_add(ill,
1177 	    NULL,	/* hardware address */
1178 	    dst,
1179 	    &ipv6_all_ones,
1180 	    &ipv6_all_zeros,
1181 	    0,
1182 	    (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
1183 	    ND_REACHABLE,
1184 	    &nce,
1185 	    NULL, /* let ndp_add figure out fp_mp/dlureq_mp for v6 */
1186 	    NULL);
1187 
1188 	switch (err) {
1189 	case 0:
1190 		/*
1191 		 * Cache entry with a proper resolver cookie was
1192 		 * created.
1193 		 */
1194 		NCE_REFRELE(nce);
1195 		break;
1196 	case EEXIST:
1197 		err = 0;
1198 		NCE_REFRELE(nce);
1199 		break;
1200 	default:
1201 		ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
1202 		break;
1203 	}
1204 	return (err);
1205 }
1206 
1207 /*
1208  * For each interface an entry is added for the unspecified multicast group.
1209  * Here that mapping is used to form the multicast cache entry for a particular
1210  * multicast destination.
1211  */
1212 static int
1213 nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
1214 {
1215 	nce_t		*mnce;	/* Multicast mapping entry */
1216 	nce_t		*nce;
1217 	uchar_t		*hw_addr = NULL;
1218 	int		err = 0;
1219 
1220 	ASSERT(ill != NULL);
1221 	ASSERT(ill->ill_isv6);
1222 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1223 
1224 	mutex_enter(&ndp6.ndp_g_lock);
1225 	nce = *((nce_t **)NCE_HASH_PTR_V6(*dst));
1226 	nce = nce_lookup_addr(ill, dst, nce);
1227 	if (nce != NULL) {
1228 		mutex_exit(&ndp6.ndp_g_lock);
1229 		NCE_REFRELE(nce);
1230 		return (0);
1231 	}
1232 	/* No entry, now lookup for a mapping this should never fail */
1233 	mnce = nce_lookup_mapping(ill, dst);
1234 	if (mnce == NULL) {
1235 		/* Something broken for the interface. */
1236 		mutex_exit(&ndp6.ndp_g_lock);
1237 		return (ESRCH);
1238 	}
1239 	ASSERT(mnce->nce_flags & NCE_F_MAPPING);
1240 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1241 		/*
1242 		 * For IRE_IF_RESOLVER a hardware mapping can be
1243 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
1244 		 * in the ill is copied in ndp_add().
1245 		 */
1246 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1247 		if (hw_addr == NULL) {
1248 			mutex_exit(&ndp6.ndp_g_lock);
1249 			NCE_REFRELE(mnce);
1250 			return (ENOMEM);
1251 		}
1252 		nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
1253 	}
1254 	NCE_REFRELE(mnce);
1255 	/*
1256 	 * IRE_IF_NORESOLVER type simply copies the resolution
1257 	 * cookie passed in.  So no hw_addr is needed.
1258 	 */
1259 	err = ndp_add(ill,
1260 	    hw_addr,
1261 	    dst,
1262 	    &ipv6_all_ones,
1263 	    &ipv6_all_zeros,
1264 	    0,
1265 	    NCE_F_NONUD,
1266 	    ND_REACHABLE,
1267 	    &nce,
1268 	    NULL,
1269 	    NULL);
1270 	mutex_exit(&ndp6.ndp_g_lock);
1271 	if (hw_addr != NULL)
1272 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1273 	if (err != 0) {
1274 		ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
1275 		return (err);
1276 	}
1277 	NCE_REFRELE(nce);
1278 	return (0);
1279 }
1280 
1281 /*
1282  * Return the link layer address, and any flags of a nce.
1283  */
1284 int
1285 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1286 {
1287 	nce_t		*nce;
1288 	in6_addr_t	*addr;
1289 	sin6_t		*sin6;
1290 	dl_unitdata_req_t	*dl;
1291 
1292 	ASSERT(ill != NULL && ill->ill_isv6);
1293 	sin6 = (sin6_t *)&lnr->lnr_addr;
1294 	addr =  &sin6->sin6_addr;
1295 
1296 	nce = ndp_lookup_v6(ill, addr, B_FALSE);
1297 	if (nce == NULL)
1298 		return (ESRCH);
1299 	/* If in INCOMPLETE state, no link layer address is available yet */
1300 	if (nce->nce_state == ND_INCOMPLETE)
1301 		goto done;
1302 	dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
1303 	if (ill->ill_flags & ILLF_XRESOLV)
1304 		lnr->lnr_hdw_len = dl->dl_dest_addr_length;
1305 	else
1306 		lnr->lnr_hdw_len = ill->ill_nd_lla_len;
1307 	ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
1308 	    sizeof (lnr->lnr_hdw_addr));
1309 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
1310 	    (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
1311 	if (nce->nce_flags & NCE_F_ISROUTER)
1312 		lnr->lnr_flags = NDF_ISROUTER_ON;
1313 	if (nce->nce_flags & NCE_F_PROXY)
1314 		lnr->lnr_flags |= NDF_PROXY_ON;
1315 	if (nce->nce_flags & NCE_F_ANYCAST)
1316 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1317 done:
1318 	NCE_REFRELE(nce);
1319 	return (0);
1320 }
1321 
1322 /*
1323  * Send Enable/Disable multicast reqs to driver.
1324  */
1325 int
1326 ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
1327     uint32_t hw_addr_offset, mblk_t *mp)
1328 {
1329 	nce_t		*nce;
1330 	uchar_t		*hw_addr;
1331 
1332 	ASSERT(ill != NULL && ill->ill_isv6);
1333 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1334 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1335 	if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
1336 		freemsg(mp);
1337 		return (EINVAL);
1338 	}
1339 	mutex_enter(&ndp6.ndp_g_lock);
1340 	nce = nce_lookup_mapping(ill, addr);
1341 	if (nce == NULL) {
1342 		mutex_exit(&ndp6.ndp_g_lock);
1343 		freemsg(mp);
1344 		return (ESRCH);
1345 	}
1346 	mutex_exit(&ndp6.ndp_g_lock);
1347 	/*
1348 	 * Update dl_addr_length and dl_addr_offset for primitives that
1349 	 * have physical addresses as opposed to full saps
1350 	 */
1351 	switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
1352 	case DL_ENABMULTI_REQ:
1353 		/* Track the state if this is the first enabmulti */
1354 		if (ill->ill_dlpi_multicast_state == IDMS_UNKNOWN)
1355 			ill->ill_dlpi_multicast_state = IDMS_INPROGRESS;
1356 		ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
1357 		break;
1358 	case DL_DISABMULTI_REQ:
1359 		ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
1360 		break;
1361 	default:
1362 		NCE_REFRELE(nce);
1363 		ip1dbg(("ndp_mcastreq: default\n"));
1364 		return (EINVAL);
1365 	}
1366 	nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
1367 	NCE_REFRELE(nce);
1368 	putnext(ill->ill_wq, mp);
1369 	return (0);
1370 }
1371 
/*
 * Send a neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt, and also
 * holds ill_g_lock as reader (both are asserted below).
 *
 * If mp is NULL the packet already queued on nce_qd_mp is used to pick the
 * source address; otherwise mp is first queued on the nce with
 * nce_queue_mp().
 *
 * NOTE: This routine drops nce_lock and ill_g_lock (and later reacquires
 * them) when sending the packet.
 * NOTE: This routine does not consume mp.
 */
uint32_t
nce_solicit(nce_t *nce, mblk_t *mp)
{
	ill_t		*ill;
	ill_t		*src_ill;
	ip6_t		*ip6h;
	in6_addr_t	src;
	in6_addr_t	dst;
	ipif_t		*ipif;
	ip6i_t		*ip6i;
	boolean_t	dropped = B_FALSE;

	ASSERT(RW_READ_HELD(&ill_g_lock));
	ASSERT(MUTEX_HELD(&nce->nce_lock));
	ill = nce->nce_ill;
	ASSERT(ill != NULL);

	/* No retransmissions left: tell the caller to abort. */
	if (nce->nce_rcnt == 0) {
		return (0);
	}

	if (mp == NULL) {
		ASSERT(nce->nce_qd_mp != NULL);
		mp = nce->nce_qd_mp;
	} else {
		nce_queue_mp(nce, mp);
	}

	/* Handle ip_newroute_v6 giving us IPSEC packets */
	if (mp->b_datap->db_type == M_CTL)
		mp = mp->b_cont;

	ip6h = (ip6_t *)mp->b_rptr;
	if (ip6h->ip6_nxt == IPPROTO_RAW) {
		/*
		 * This message should have been pulled up already in
		 * ip_wput_v6. We can't do pullups here because the message
		 * could be from the nce_qd_mp which could have b_next/b_prev
		 * non-NULL.
		 */
		ip6i = (ip6i_t *)ip6h;
		ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
			    sizeof (ip6i_t) + IPV6_HDR_LEN);
		/* The real IPv6 header follows the ip6i_t. */
		ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
	}
	src = ip6h->ip6_src;
	/*
	 * If the src of outgoing packet is one of the assigned interface
	 * addresses use it, otherwise we will pick the source address below.
	 */
	src_ill = ill;
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* When the ill is in a group, search every ill of the group */
		if (ill->ill_group != NULL)
			src_ill = ill->ill_group->illgrp_ill;
		for (; src_ill != NULL; src_ill = src_ill->ill_group_next) {
			for (ipif = src_ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				if (IN6_ARE_ADDR_EQUAL(&src,
				    &ipif->ipif_v6lcl_addr)) {
					break;
				}
			}
			/* Non-NULL ipif here means the inner loop matched */
			if (ipif != NULL)
				break;
		}
		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and let nce_xmit choose a source.
		 * If an ipif can be found, but it's not yet done with DAD
		 * verification, then just postpone this transmission until
		 * later.
		 */
		if (src_ill == NULL)
			src = ipv6_all_zeros;
		else if (!ipif->ipif_addr_ready)
			return (ill->ill_reachable_retrans_time);
	}
	dst = nce->nce_addr;
	/*
	 * If source address is unspecified, nce_xmit will choose
	 * one for us and initialize the hardware address also
	 * appropriately.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&src))
		src_ill = NULL;
	/* Charge one retransmission before dropping the locks to send. */
	nce->nce_rcnt--;
	mutex_exit(&nce->nce_lock);
	rw_exit(&ill_g_lock);
	dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src,
	    &dst, 0);
	rw_enter(&ill_g_lock, RW_READER);
	mutex_enter(&nce->nce_lock);
	/* A dropped transmission does not count against the retry budget. */
	if (dropped)
		nce->nce_rcnt++;
	return (ill->ill_reachable_retrans_time);
}
1477 
1478 /*
1479  * Attempt to recover an address on an interface that's been marked as a
1480  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1481  * no easy way to just probe the address and have the right thing happen if
1482  * it's no longer in use.  Instead, we just bring it up normally and allow the
1483  * regular interface start-up logic to probe for a remaining duplicate and take
1484  * us back down if necessary.
1485  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1486  * ip_ndp_excl.
1487  */
1488 /* ARGSUSED */
1489 static void
1490 ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1491 {
1492 	ill_t	*ill = rq->q_ptr;
1493 	ipif_t	*ipif;
1494 	in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
1495 
1496 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1497 		/*
1498 		 * We do not support recovery of proxy ARP'd interfaces,
1499 		 * because the system lacks a complete proxy ARP mechanism.
1500 		 */
1501 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1502 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
1503 			continue;
1504 		}
1505 
1506 		/*
1507 		 * If we have already recovered, then ignore.
1508 		 */
1509 		mutex_enter(&ill->ill_lock);
1510 		if (!(ipif->ipif_flags & IPIF_DUPLICATE)) {
1511 			mutex_exit(&ill->ill_lock);
1512 			continue;
1513 		}
1514 
1515 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1516 		ill->ill_ipif_dup_count--;
1517 		mutex_exit(&ill->ill_lock);
1518 		ipif->ipif_was_dup = B_TRUE;
1519 
1520 		if (ipif_ndp_up(ipif, addr, B_FALSE) != EINPROGRESS)
1521 			(void) ipif_up_done_v6(ipif);
1522 	}
1523 	freeb(mp);
1524 }
1525 
1526 /*
1527  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1528  * As long as someone else holds the address, the interface will stay down.
1529  * When that conflict goes away, the interface is brought back up.  This is
1530  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1531  * server will recover from a failure.
1532  *
1533  * For DHCP and temporary addresses, recovery is not done in the kernel.
1534  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1535  *
1536  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1537  */
1538 static void
1539 ipif6_dup_recovery(void *arg)
1540 {
1541 	ipif_t *ipif = arg;
1542 
1543 	ipif->ipif_recovery_id = 0;
1544 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1545 		return;
1546 
1547 	/* If the link is down, we'll retry this later */
1548 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1549 		return;
1550 
1551 	ndp_do_recovery(ipif);
1552 }
1553 
1554 /*
1555  * Perform interface recovery by forcing the duplicate interfaces up and
1556  * allowing the system to determine which ones should stay up.
1557  *
1558  * Called both by recovery timer expiry and link-up notification.
1559  */
1560 void
1561 ndp_do_recovery(ipif_t *ipif)
1562 {
1563 	ill_t *ill = ipif->ipif_ill;
1564 	mblk_t *mp;
1565 
1566 	mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
1567 	if (mp == NULL) {
1568 		ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1569 		    ipif, MSEC_TO_TICK(ip_dup_recovery));
1570 	} else {
1571 		bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1572 		    sizeof (ipif->ipif_v6lcl_addr));
1573 		ill_refhold(ill);
1574 		(void) qwriter_ip(NULL, ill, ill->ill_rq, mp, ip_ndp_recover,
1575 		    CUR_OP, B_FALSE);
1576 	}
1577 }
1578 
1579 /*
1580  * Find the solicitation in the given message, and extract printable details
1581  * (MAC and IP addresses) from it.
1582  */
1583 static nd_neighbor_solicit_t *
1584 ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf,
1585     size_t hlen, char *sbuf, size_t slen, uchar_t **haddr)
1586 {
1587 	nd_neighbor_solicit_t *ns;
1588 	ip6_t *ip6h;
1589 	uchar_t *addr;
1590 	int alen;
1591 
1592 	alen = 0;
1593 	ip6h = (ip6_t *)mp->b_rptr;
1594 	if (dl_mp == NULL) {
1595 		nd_opt_hdr_t *opt;
1596 		int nslen;
1597 
1598 		/*
1599 		 * If it's from the fast-path, then it can't be a probe
1600 		 * message, and thus must include the source linkaddr option.
1601 		 * Extract that here.
1602 		 */
1603 		ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1604 		nslen = mp->b_wptr - (uchar_t *)ns;
1605 		if ((nslen -= sizeof (*ns)) > 0) {
1606 			opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen,
1607 			    ND_OPT_SOURCE_LINKADDR);
1608 			if (opt != NULL &&
1609 			    opt->nd_opt_len * 8 - sizeof (*opt) >=
1610 			    ill->ill_nd_lla_len) {
1611 				addr = (uchar_t *)(opt + 1);
1612 				alen = ill->ill_nd_lla_len;
1613 			}
1614 		}
1615 		/*
1616 		 * We cheat a bit here for the sake of printing usable log
1617 		 * messages in the rare case where the reply we got was unicast
1618 		 * without a source linkaddr option, and the interface is in
1619 		 * fastpath mode.  (Sigh.)
1620 		 */
1621 		if (alen == 0 && ill->ill_type == IFT_ETHER &&
1622 		    MBLKHEAD(mp) >= sizeof (struct ether_header)) {
1623 			struct ether_header *pether;
1624 
1625 			pether = (struct ether_header *)((char *)ip6h -
1626 			    sizeof (*pether));
1627 			addr = pether->ether_shost.ether_addr_octet;
1628 			alen = ETHERADDRL;
1629 		}
1630 	} else {
1631 		dl_unitdata_ind_t *dlu;
1632 
1633 		dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
1634 		alen = dlu->dl_src_addr_length;
1635 		if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
1636 		    dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
1637 			addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
1638 			if (ill->ill_sap_length < 0) {
1639 				alen += ill->ill_sap_length;
1640 			} else {
1641 				addr += ill->ill_sap_length;
1642 				alen -= ill->ill_sap_length;
1643 			}
1644 		}
1645 	}
1646 	if (alen > 0) {
1647 		*haddr = addr;
1648 		(void) mac_colon_addr(addr, alen, hbuf, hlen);
1649 	} else {
1650 		*haddr = NULL;
1651 		(void) strcpy(hbuf, "?");
1652 	}
1653 	ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN);
1654 	(void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen);
1655 	return (ns);
1656 }
1657 
1658 /*
1659  * This is for exclusive changes due to NDP duplicate address detection
1660  * failure.
1661  */
1662 /* ARGSUSED */
1663 static void
1664 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1665 {
1666 	ill_t	*ill = rq->q_ptr;
1667 	ipif_t	*ipif;
1668 	char ibuf[LIFNAMSIZ + 10];	/* 10 digits for logical i/f number */
1669 	char hbuf[MAC_STR_LEN];
1670 	char sbuf[INET6_ADDRSTRLEN];
1671 	nd_neighbor_solicit_t *ns;
1672 	mblk_t *dl_mp = NULL;
1673 	uchar_t *haddr;
1674 
1675 	if (DB_TYPE(mp) != M_DATA) {
1676 		dl_mp = mp;
1677 		mp = mp->b_cont;
1678 	}
1679 	ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf,
1680 	    sizeof (sbuf), &haddr);
1681 	if (haddr != NULL &&
1682 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1683 		/*
1684 		 * Ignore conflicts generated by misbehaving switches that just
1685 		 * reflect our own messages back to us.
1686 		 */
1687 		goto ignore_conflict;
1688 	}
1689 	(void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf));
1690 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1691 
1692 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
1693 		    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1694 		    &ns->nd_ns_target)) {
1695 			continue;
1696 		}
1697 
1698 		/* If it's already marked, then don't do anything. */
1699 		if (ipif->ipif_flags & IPIF_DUPLICATE)
1700 			continue;
1701 
1702 		/*
1703 		 * If this is a failure during duplicate recovery, then don't
1704 		 * complain.  It may take a long time to recover.
1705 		 */
1706 		if (!ipif->ipif_was_dup) {
1707 			if (ipif->ipif_id != 0) {
1708 				(void) snprintf(ibuf + ill->ill_name_length - 1,
1709 				    sizeof (ibuf) - ill->ill_name_length + 1,
1710 				    ":%d", ipif->ipif_id);
1711 			}
1712 			cmn_err(CE_WARN, "%s has duplicate address %s (in "
1713 			    "use by %s); disabled", ibuf, sbuf, hbuf);
1714 		}
1715 		mutex_enter(&ill->ill_lock);
1716 		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1717 		ipif->ipif_flags |= IPIF_DUPLICATE;
1718 		ill->ill_ipif_dup_count++;
1719 		mutex_exit(&ill->ill_lock);
1720 		(void) ipif_down(ipif, NULL, NULL);
1721 		ipif_down_tail(ipif);
1722 		if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1723 		    ill->ill_net_type == IRE_IF_RESOLVER &&
1724 		    ip_dup_recovery > 0)
1725 			ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
1726 			    ipif, MSEC_TO_TICK(ip_dup_recovery));
1727 	}
1728 ignore_conflict:
1729 	if (dl_mp != NULL)
1730 		freeb(dl_mp);
1731 	freemsg(mp);
1732 }
1733 
1734 /*
1735  * Handle failure by tearing down the ipifs with the specified address.  Note
1736  * that tearing down the ipif also means deleting the nce through ipif_down, so
1737  * it's not possible to do recovery by just restarting the nce timer.  Instead,
1738  * we start a timer on the ipif.
1739  */
1740 static void
1741 ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1742 {
1743 	if ((mp = copymsg(mp)) != NULL) {
1744 		if (dl_mp == NULL)
1745 			dl_mp = mp;
1746 		else if ((dl_mp = copyb(dl_mp)) != NULL)
1747 			dl_mp->b_cont = mp;
1748 		if (dl_mp == NULL) {
1749 			freemsg(mp);
1750 		} else {
1751 			ill_refhold(ill);
1752 			(void) qwriter_ip(NULL, ill, ill->ill_rq, dl_mp,
1753 			    ip_ndp_excl, CUR_OP, B_FALSE);
1754 		}
1755 	}
1756 	ndp_delete(nce);
1757 }
1758 
1759 /*
1760  * Handle a discovered conflict: some other system is advertising that it owns
1761  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1762  * interface.
1763  */
1764 static void
1765 ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
1766 {
1767 	ipif_t *ipif;
1768 	uint32_t now;
1769 	uint_t maxdefense;
1770 	uint_t defs;
1771 
1772 	ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
1773 	    NULL, NULL);
1774 	if (ipif == NULL)
1775 		return;
1776 	/*
1777 	 * First, figure out if this address is disposable.
1778 	 */
1779 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1780 		maxdefense = ip_max_temp_defend;
1781 	else
1782 		maxdefense = ip_max_defend;
1783 
1784 	/*
1785 	 * Now figure out how many times we've defended ourselves.  Ignore
1786 	 * defenses that happened long in the past.
1787 	 */
1788 	now = gethrestime_sec();
1789 	mutex_enter(&nce->nce_lock);
1790 	if ((defs = nce->nce_defense_count) > 0 &&
1791 	    now - nce->nce_defense_time > ip_defend_interval) {
1792 		nce->nce_defense_count = defs = 0;
1793 	}
1794 	nce->nce_defense_count++;
1795 	nce->nce_defense_time = now;
1796 	mutex_exit(&nce->nce_lock);
1797 	ipif_refrele(ipif);
1798 
1799 	/*
1800 	 * If we've defended ourselves too many times already, then give up and
1801 	 * tear down the interface(s) using this address.  Otherwise, defend by
1802 	 * sending out an unsolicited Neighbor Advertisement.
1803 	 */
1804 	if (defs >= maxdefense) {
1805 		ip_ndp_failure(ill, mp, dl_mp, nce);
1806 	} else {
1807 		char hbuf[MAC_STR_LEN];
1808 		char sbuf[INET6_ADDRSTRLEN];
1809 		uchar_t *haddr;
1810 
1811 		(void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf,
1812 		    sizeof (hbuf), sbuf, sizeof (sbuf), &haddr);
1813 		cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
1814 		    hbuf, sbuf, ill->ill_name);
1815 		(void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE,
1816 		    &nce->nce_addr, &ipv6_all_hosts_mcast, NDP_ORIDE);
1817 	}
1818 }
1819 
1820 static void
1821 ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
1822 {
1823 	nd_neighbor_solicit_t *ns;
1824 	uint32_t	hlen = ill->ill_nd_lla_len;
1825 	uchar_t		*haddr = NULL;
1826 	icmp6_t		*icmp_nd;
1827 	ip6_t		*ip6h;
1828 	nce_t		*our_nce = NULL;
1829 	in6_addr_t	target;
1830 	in6_addr_t	src;
1831 	int		len;
1832 	int		flag = 0;
1833 	nd_opt_hdr_t	*opt = NULL;
1834 	boolean_t	bad_solicit = B_FALSE;
1835 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1836 
1837 	ip6h = (ip6_t *)mp->b_rptr;
1838 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1839 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1840 	src = ip6h->ip6_src;
1841 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1842 	target = ns->nd_ns_target;
1843 	if (IN6_IS_ADDR_MULTICAST(&target)) {
1844 		if (ip_debug > 2) {
1845 			/* ip1dbg */
1846 			pr_addr_dbg("ndp_input_solicit: Target is"
1847 			    " multicast! %s\n", AF_INET6, &target);
1848 		}
1849 		bad_solicit = B_TRUE;
1850 		goto done;
1851 	}
1852 	if (len > sizeof (nd_neighbor_solicit_t)) {
1853 		/* Options present */
1854 		opt = (nd_opt_hdr_t *)&ns[1];
1855 		len -= sizeof (nd_neighbor_solicit_t);
1856 		if (!ndp_verify_optlen(opt, len)) {
1857 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1858 			bad_solicit = B_TRUE;
1859 			goto done;
1860 		}
1861 	}
1862 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1863 		/* Check to see if this is a valid DAD solicitation */
1864 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1865 			if (ip_debug > 2) {
1866 				/* ip1dbg */
1867 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1868 				    "Destination is not solicited node "
1869 				    "multicast %s\n", AF_INET6,
1870 				    &ip6h->ip6_dst);
1871 			}
1872 			bad_solicit = B_TRUE;
1873 			goto done;
1874 		}
1875 	}
1876 
1877 	our_nce = ndp_lookup_v6(ill, &target, B_FALSE);
1878 	/*
1879 	 * If this is a valid Solicitation, a permanent
1880 	 * entry should exist in the cache
1881 	 */
1882 	if (our_nce == NULL ||
1883 	    !(our_nce->nce_flags & NCE_F_PERMANENT)) {
1884 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1885 		    "ifname=%s ", ill->ill_name));
1886 		if (ip_debug > 2) {
1887 			/* ip1dbg */
1888 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1889 		}
1890 		bad_solicit = B_TRUE;
1891 		goto done;
1892 	}
1893 
1894 	/* At this point we should have a verified NS per spec */
1895 	if (opt != NULL) {
1896 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1897 		if (opt != NULL) {
1898 			haddr = (uchar_t *)&opt[1];
1899 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1900 			    hlen == 0) {
1901 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1902 				bad_solicit = B_TRUE;
1903 				goto done;
1904 			}
1905 		}
1906 	}
1907 
1908 	/* Set override flag, it will be reset later if need be. */
1909 	flag |= NDP_ORIDE;
1910 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1911 		flag |= NDP_UNICAST;
1912 	}
1913 
1914 	/*
1915 	 * Create/update the entry for the soliciting node.
1916 	 * or respond to outstanding queries, don't if
1917 	 * the source is unspecified address.
1918 	 */
1919 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1920 		int	err;
1921 		nce_t	*nnce;
1922 
1923 		ASSERT(ill->ill_isv6);
1924 		/*
1925 		 * Regular solicitations *must* include the Source Link-Layer
1926 		 * Address option.  Ignore messages that do not.
1927 		 */
1928 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1929 			ip1dbg(("ndp_input_solicit: source link-layer address "
1930 			    "option missing with a specified source.\n"));
1931 			bad_solicit = B_TRUE;
1932 			goto done;
1933 		}
1934 
1935 		/*
1936 		 * This is a regular solicitation.  If we're still in the
1937 		 * process of verifying the address, then don't respond at all
1938 		 * and don't keep track of the sender.
1939 		 */
1940 		if (our_nce->nce_state == ND_PROBE)
1941 			goto done;
1942 
1943 		/*
1944 		 * If the solicitation doesn't have sender hardware address
1945 		 * (legal for unicast solicitation), then process without
1946 		 * installing the return NCE.  Either we already know it, or
1947 		 * we'll be forced to look it up when (and if) we reply to the
1948 		 * packet.
1949 		 */
1950 		if (haddr == NULL)
1951 			goto no_source;
1952 
1953 		err = ndp_lookup_then_add(ill,
1954 		    haddr,
1955 		    &src,	/* Soliciting nodes address */
1956 		    &ipv6_all_ones,
1957 		    &ipv6_all_zeros,
1958 		    0,
1959 		    0,
1960 		    ND_STALE,
1961 		    &nnce,
1962 		    NULL,
1963 		    NULL);
1964 		switch (err) {
1965 		case 0:
1966 			/* done with this entry */
1967 			NCE_REFRELE(nnce);
1968 			break;
1969 		case EEXIST:
1970 			/*
1971 			 * B_FALSE indicates this is not an
1972 			 * an advertisement.
1973 			 */
1974 			ndp_process(nnce, haddr, 0, B_FALSE);
1975 			NCE_REFRELE(nnce);
1976 			break;
1977 		default:
1978 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1979 			    err));
1980 			goto done;
1981 		}
1982 no_source:
1983 		flag |= NDP_SOLICITED;
1984 	} else {
1985 		/*
1986 		 * No source link layer address option should be present in a
1987 		 * valid DAD request.
1988 		 */
1989 		if (haddr != NULL) {
1990 			ip1dbg(("ndp_input_solicit: source link-layer address "
1991 			    "option present with an unspecified source.\n"));
1992 			bad_solicit = B_TRUE;
1993 			goto done;
1994 		}
1995 		if (our_nce->nce_state == ND_PROBE) {
1996 			/*
1997 			 * Internally looped-back probes won't have DLPI
1998 			 * attached to them.  External ones (which are sent by
1999 			 * multicast) always will.  Just ignore our own
2000 			 * transmissions.
2001 			 */
2002 			if (dl_mp != NULL) {
2003 				/*
2004 				 * If someone else is probing our address, then
2005 				 * we've crossed wires.  Declare failure.
2006 				 */
2007 				ip_ndp_failure(ill, mp, dl_mp, our_nce);
2008 			}
2009 			goto done;
2010 		}
2011 		/*
2012 		 * This is a DAD probe.  Multicast the advertisement to the
2013 		 * all-nodes address.
2014 		 */
2015 		src = ipv6_all_hosts_mcast;
2016 	}
2017 	if (our_nce->nce_flags & NCE_F_ISROUTER)
2018 		flag |= NDP_ISROUTER;
2019 	if (our_nce->nce_flags & NCE_F_PROXY)
2020 		flag &= ~NDP_ORIDE;
2021 	/* Response to a solicitation */
2022 	(void) nce_xmit(ill,
2023 	    ND_NEIGHBOR_ADVERT,
2024 	    ill,	/* ill to be used for extracting ill_nd_lla */
2025 	    B_TRUE,	/* use ill_nd_lla */
2026 	    &target,	/* Source and target of the advertisement pkt */
2027 	    &src,	/* IP Destination (source of original pkt) */
2028 	    flag);
2029 done:
2030 	if (bad_solicit)
2031 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
2032 	if (our_nce != NULL)
2033 		NCE_REFRELE(our_nce);
2034 }
2035 
2036 void
2037 ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2038 {
2039 	nd_neighbor_advert_t *na;
2040 	uint32_t	hlen = ill->ill_nd_lla_len;
2041 	uchar_t		*haddr = NULL;
2042 	icmp6_t		*icmp_nd;
2043 	ip6_t		*ip6h;
2044 	nce_t		*dst_nce = NULL;
2045 	in6_addr_t	target;
2046 	nd_opt_hdr_t	*opt = NULL;
2047 	int		len;
2048 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2049 
2050 	ip6h = (ip6_t *)mp->b_rptr;
2051 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2052 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2053 	na = (nd_neighbor_advert_t *)icmp_nd;
2054 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
2055 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
2056 		ip1dbg(("ndp_input_advert: Target is multicast but the "
2057 		    "solicited flag is not zero\n"));
2058 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2059 		return;
2060 	}
2061 	target = na->nd_na_target;
2062 	if (IN6_IS_ADDR_MULTICAST(&target)) {
2063 		ip1dbg(("ndp_input_advert: Target is multicast!\n"));
2064 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2065 		return;
2066 	}
2067 	if (len > sizeof (nd_neighbor_advert_t)) {
2068 		opt = (nd_opt_hdr_t *)&na[1];
2069 		if (!ndp_verify_optlen(opt,
2070 		    len - sizeof (nd_neighbor_advert_t))) {
2071 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
2072 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
2073 			return;
2074 		}
2075 		/* At this point we have a verified NA per spec */
2076 		len -= sizeof (nd_neighbor_advert_t);
2077 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
2078 		if (opt != NULL) {
2079 			haddr = (uchar_t *)&opt[1];
2080 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
2081 			    hlen == 0) {
2082 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
2083 				BUMP_MIB(mib,
2084 				    ipv6IfIcmpInBadNeighborAdvertisements);
2085 				return;
2086 			}
2087 		}
2088 	}
2089 
2090 	/*
2091 	 * If this interface is part of the group look at all the
2092 	 * ills in the group.
2093 	 */
2094 	rw_enter(&ill_g_lock, RW_READER);
2095 	if (ill->ill_group != NULL)
2096 		ill = ill->ill_group->illgrp_ill;
2097 
2098 	for (; ill != NULL; ill = ill->ill_group_next) {
2099 		mutex_enter(&ill->ill_lock);
2100 		if (!ILL_CAN_LOOKUP(ill)) {
2101 			mutex_exit(&ill->ill_lock);
2102 			continue;
2103 		}
2104 		ill_refhold_locked(ill);
2105 		mutex_exit(&ill->ill_lock);
2106 		dst_nce = ndp_lookup_v6(ill, &target, B_FALSE);
2107 		/* We have to drop the lock since ndp_process calls put* */
2108 		rw_exit(&ill_g_lock);
2109 		if (dst_nce != NULL) {
2110 			if ((dst_nce->nce_flags & NCE_F_PERMANENT) &&
2111 			    dst_nce->nce_state == ND_PROBE) {
2112 				/*
2113 				 * Someone else sent an advertisement for an
2114 				 * address that we're trying to configure.
2115 				 * Tear it down.  Note that dl_mp might be NULL
2116 				 * if we're getting a unicast reply.  This
2117 				 * isn't typically done (multicast is the norm
2118 				 * in response to a probe), but ip_ndp_failure
2119 				 * will handle the dl_mp == NULL case as well.
2120 				 */
2121 				ip_ndp_failure(ill, mp, dl_mp, dst_nce);
2122 			} else if (dst_nce->nce_flags & NCE_F_PERMANENT) {
2123 				/*
2124 				 * Someone just announced one of our local
2125 				 * addresses.  If it wasn't us, then this is a
2126 				 * conflict.  Defend the address or shut it
2127 				 * down.
2128 				 */
2129 				if (dl_mp != NULL &&
2130 				    (haddr == NULL ||
2131 				    nce_cmp_ll_addr(dst_nce, haddr,
2132 				    ill->ill_nd_lla_len))) {
2133 					ip_ndp_conflict(ill, mp, dl_mp,
2134 					    dst_nce);
2135 				}
2136 			} else {
2137 				if (na->nd_na_flags_reserved &
2138 				    ND_NA_FLAG_ROUTER) {
2139 					dst_nce->nce_flags |= NCE_F_ISROUTER;
2140 				}
2141 				/* B_TRUE indicates this an advertisement */
2142 				ndp_process(dst_nce, haddr,
2143 				    na->nd_na_flags_reserved, B_TRUE);
2144 			}
2145 			NCE_REFRELE(dst_nce);
2146 		}
2147 		rw_enter(&ill_g_lock, RW_READER);
2148 		ill_refrele(ill);
2149 	}
2150 	rw_exit(&ill_g_lock);
2151 }
2152 
2153 /*
2154  * Process NDP neighbor solicitation/advertisement messages.
2155  * The checksum has already checked o.k before reaching here.
2156  */
2157 void
2158 ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
2159 {
2160 	icmp6_t		*icmp_nd;
2161 	ip6_t		*ip6h;
2162 	int		len;
2163 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2164 
2165 
2166 	if (!pullupmsg(mp, -1)) {
2167 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2168 		BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
2169 		goto done;
2170 	}
2171 	ip6h = (ip6_t *)mp->b_rptr;
2172 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2173 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2174 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2175 		goto done;
2176 	}
2177 	/*
2178 	 * NDP does not accept any extension headers between the
2179 	 * IP header and the ICMP header since e.g. a routing
2180 	 * header could be dangerous.
2181 	 * This assumes that any AH or ESP headers are removed
2182 	 * by ip prior to passing the packet to ndp_input.
2183 	 */
2184 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2185 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2186 		    ip6h->ip6_nxt));
2187 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2188 		goto done;
2189 	}
2190 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2191 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2192 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2193 	if (icmp_nd->icmp6_code != 0) {
2194 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2195 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2196 		goto done;
2197 	}
2198 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2199 	/*
2200 	 * Make sure packet length is large enough for either
2201 	 * a NS or a NA icmp packet.
2202 	 */
2203 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2204 		ip1dbg(("ndp_input: packet too short\n"));
2205 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2206 		goto done;
2207 	}
2208 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2209 		ndp_input_solicit(ill, mp, dl_mp);
2210 	} else {
2211 		ndp_input_advert(ill, mp, dl_mp);
2212 	}
2213 done:
2214 	freemsg(mp);
2215 }
2216 
2217 /*
2218  * nce_xmit is called to form and transmit a ND solicitation or
2219  * advertisement ICMP packet.
2220  *
2221  * If the source address is unspecified and this isn't a probe (used for
2222  * duplicate address detection), an appropriate source address and link layer
2223  * address will be chosen here.  The link layer address option is included if
2224  * the source is specified (i.e., all non-probe packets), and omitted (per the
2225  * specification) otherwise.
2226  *
2227  * It returns B_FALSE only if it does a successful put() to the
2228  * corresponding ill's ill_wq otherwise returns B_TRUE.
2229  */
2230 static boolean_t
2231 nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill,
2232     boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target,
2233     int flag)
2234 {
2235 	uint32_t	len;
2236 	icmp6_t 	*icmp6;
2237 	mblk_t		*mp;
2238 	ip6_t		*ip6h;
2239 	nd_opt_hdr_t	*opt;
2240 	uint_t		plen;
2241 	ip6i_t		*ip6i;
2242 	ipif_t		*src_ipif = NULL;
2243 	uint8_t		*hw_addr;
2244 
2245 	/*
2246 	 * If we have a unspecified source(sender) address, select a
2247 	 * proper source address for the solicitation here itself so
2248 	 * that we can initialize the h/w address correctly. This is
2249 	 * needed for interface groups as source address can come from
2250 	 * the whole group and the h/w address initialized from ill will
2251 	 * be wrong if the source address comes from a different ill.
2252 	 *
2253 	 * Note that the NA never comes here with the unspecified source
2254 	 * address. The following asserts that whenever the source
2255 	 * address is specified, the haddr also should be specified.
2256 	 */
2257 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL));
2258 
2259 	if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
2260 		ASSERT(operation != ND_NEIGHBOR_ADVERT);
2261 		/*
2262 		 * Pick a source address for this solicitation, but
2263 		 * restrict the selection to addresses assigned to the
2264 		 * output interface (or interface group).  We do this
2265 		 * because the destination will create a neighbor cache
2266 		 * entry for the source address of this packet, so the
2267 		 * source address had better be a valid neighbor.
2268 		 */
2269 		src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL,
2270 		    IPV6_PREFER_SRC_DEFAULT, GLOBAL_ZONEID);
2271 		if (src_ipif == NULL) {
2272 			char buf[INET6_ADDRSTRLEN];
2273 
2274 			ip1dbg(("nce_xmit: No source ipif for dst %s\n",
2275 			    inet_ntop(AF_INET6, (char *)target, buf,
2276 			    sizeof (buf))));
2277 			return (B_TRUE);
2278 		}
2279 		sender = &src_ipif->ipif_v6src_addr;
2280 		hwaddr_ill = src_ipif->ipif_ill;
2281 	}
2282 
2283 	/*
2284 	 * Always make sure that the NS/NA packets don't get load
2285 	 * spread. This is needed so that the probe packets sent
2286 	 * by the in.mpathd daemon can really go out on the desired
2287 	 * interface. Probe packets are made to go out on a desired
2288 	 * interface by including a ip6i with ATTACH_IF flag. As these
2289 	 * packets indirectly end up sending/receiving NS/NA packets
2290 	 * (neighbor doing NUD), we have to make sure that NA
2291 	 * also go out on the same interface.
2292 	 */
2293 	plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8;
2294 	len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
2295 	    plen * 8;
2296 	mp = allocb(len,  BPRI_LO);
2297 	if (mp == NULL) {
2298 		if (src_ipif != NULL)
2299 			ipif_refrele(src_ipif);
2300 		return (B_TRUE);
2301 	}
2302 	bzero((char *)mp->b_rptr, len);
2303 	mp->b_wptr = mp->b_rptr + len;
2304 
2305 	ip6i = (ip6i_t *)mp->b_rptr;
2306 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2307 	ip6i->ip6i_nxt = IPPROTO_RAW;
2308 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2309 	if (flag & NDP_PROBE)
2310 		ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
2311 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2312 
2313 	ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
2314 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2315 	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2316 	ip6h->ip6_nxt = IPPROTO_ICMPV6;
2317 	ip6h->ip6_hops = IPV6_MAX_HOPS;
2318 	ip6h->ip6_dst = *target;
2319 	icmp6 = (icmp6_t *)&ip6h[1];
2320 
2321 	opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2322 	    sizeof (nd_neighbor_advert_t));
2323 
2324 	if (operation == ND_NEIGHBOR_SOLICIT) {
2325 		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2326 
2327 		if (!(flag & NDP_PROBE))
2328 			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2329 		ip6h->ip6_src = *sender;
2330 		ns->nd_ns_target = *target;
2331 		if (!(flag & NDP_UNICAST)) {
2332 			/* Form multicast address of the target */
2333 			ip6h->ip6_dst = ipv6_solicited_node_mcast;
2334 			ip6h->ip6_dst.s6_addr32[3] |=
2335 			    ns->nd_ns_target.s6_addr32[3];
2336 		}
2337 	} else {
2338 		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2339 
2340 		ASSERT(!(flag & NDP_PROBE));
2341 		opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2342 		ip6h->ip6_src = *sender;
2343 		na->nd_na_target = *sender;
2344 		if (flag & NDP_ISROUTER)
2345 			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2346 		if (flag & NDP_SOLICITED)
2347 			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2348 		if (flag & NDP_ORIDE)
2349 			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2350 	}
2351 
2352 	hw_addr = NULL;
2353 	if (!(flag & NDP_PROBE)) {
2354 		mutex_enter(&hwaddr_ill->ill_lock);
2355 		hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
2356 		    hwaddr_ill->ill_phys_addr;
2357 		if (hw_addr != NULL) {
2358 			/* Fill in link layer address and option len */
2359 			opt->nd_opt_len = (uint8_t)plen;
2360 			bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
2361 		}
2362 		mutex_exit(&hwaddr_ill->ill_lock);
2363 	}
2364 	if (hw_addr == NULL) {
2365 		/* If there's no link layer address option, then strip it. */
2366 		len -= plen * 8;
2367 		mp->b_wptr = mp->b_rptr + len;
2368 		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
2369 	}
2370 
2371 	icmp6->icmp6_type = (uint8_t)operation;
2372 	icmp6->icmp6_code = 0;
2373 	/*
2374 	 * Prepare for checksum by putting icmp length in the icmp
2375 	 * checksum field. The checksum is calculated in ip_wput_v6.
2376 	 */
2377 	icmp6->icmp6_cksum = ip6h->ip6_plen;
2378 
2379 	if (src_ipif != NULL)
2380 		ipif_refrele(src_ipif);
2381 	if (canput(ill->ill_wq)) {
2382 		put(ill->ill_wq, mp);
2383 		return (B_FALSE);
2384 	}
2385 	freemsg(mp);
2386 	return (B_TRUE);
2387 }
2388 
2389 /*
2390  * Make a link layer address (does not include the SAP) from an nce.
2391  * To form the link layer address, use the last four bytes of ipv6
2392  * address passed in and the fixed offset stored in nce.
2393  */
2394 static void
2395 nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
2396 {
2397 	uchar_t *mask, *to;
2398 	ill_t	*ill = nce->nce_ill;
2399 	int 	len;
2400 
2401 	if (ill->ill_net_type == IRE_IF_NORESOLVER)
2402 		return;
2403 	ASSERT(nce->nce_res_mp != NULL);
2404 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
2405 	ASSERT(nce->nce_flags & NCE_F_MAPPING);
2406 	ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
2407 	ASSERT(addr != NULL);
2408 	bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
2409 	    addrpos, ill->ill_nd_lla_len);
2410 	len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
2411 	    IPV6_ADDR_LEN);
2412 	mask = (uchar_t *)&nce->nce_extract_mask;
2413 	mask += (IPV6_ADDR_LEN - len);
2414 	addr += (IPV6_ADDR_LEN - len);
2415 	to = addrpos + nce->nce_ll_extract_start;
2416 	while (len-- > 0)
2417 		*to++ |= *mask++ & *addr++;
2418 }
2419 
2420 /*
2421  * Pass a cache report back out via NDD.
2422  */
2423 /* ARGSUSED */
2424 int
2425 ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
2426 {
2427 	(void) mi_mpprintf(mp, "ifname      hardware addr    flags"
2428 			"     proto addr/mask");
2429 	ndp_walk(NULL, (pfi_t)nce_report1, (uchar_t *)mp);
2430 	return (0);
2431 }
2432 
2433 /*
2434  * Add a single line to the NDP Cache Entry Report.
2435  */
2436 static void
2437 nce_report1(nce_t *nce, uchar_t *mp_arg)
2438 {
2439 	ill_t		*ill = nce->nce_ill;
2440 	char		local_buf[INET6_ADDRSTRLEN];
2441 	uchar_t		flags_buf[10];
2442 	uint32_t	flags = nce->nce_flags;
2443 	mblk_t		*mp = (mblk_t *)mp_arg;
2444 	uchar_t		*h;
2445 	uchar_t		*m = flags_buf;
2446 	in6_addr_t	v6addr;
2447 
2448 	/*
2449 	 * Lock the nce to protect nce_res_mp from being changed
2450 	 * if an external resolver address resolution completes
2451 	 * while nce_res_mp is being accessed here.
2452 	 *
2453 	 * Deal with all address formats, not just Ethernet-specific
2454 	 * In addition, make sure that the mblk has enough space
2455 	 * before writing to it. If is doesn't, allocate a new one.
2456 	 */
2457 	if (nce->nce_ipversion == IPV4_VERSION)
2458 		/* Don't include v4 nce_ts in NDP cache entry report */
2459 		return;
2460 
2461 	ASSERT(ill != NULL);
2462 	v6addr = nce->nce_mask;
2463 	if (flags & NCE_F_PERMANENT)
2464 		*m++ = 'P';
2465 	if (flags & NCE_F_ISROUTER)
2466 		*m++ = 'R';
2467 	if (flags & NCE_F_MAPPING)
2468 		*m++ = 'M';
2469 	*m = '\0';
2470 
2471 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
2472 		size_t		addrlen;
2473 		char		*addr_buf;
2474 		dl_unitdata_req_t	*dl;
2475 
2476 		mutex_enter(&nce->nce_lock);
2477 		h = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2478 		dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
2479 		if (ill->ill_flags & ILLF_XRESOLV)
2480 			addrlen = (3 * (dl->dl_dest_addr_length));
2481 		else
2482 			addrlen = (3 * (ill->ill_nd_lla_len));
2483 		if (addrlen <= 0) {
2484 			mutex_exit(&nce->nce_lock);
2485 			(void) mi_mpprintf(mp,
2486 			    "%8s %9s %5s %s/%d",
2487 			    ill->ill_name,
2488 			    "None",
2489 			    (uchar_t *)&flags_buf,
2490 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2491 				(char *)local_buf, sizeof (local_buf)),
2492 				ip_mask_to_plen_v6(&v6addr));
2493 		} else {
2494 			/*
2495 			 * Convert the hardware/lla address to ascii
2496 			 */
2497 			addr_buf = kmem_zalloc(addrlen, KM_NOSLEEP);
2498 			if (addr_buf == NULL) {
2499 				mutex_exit(&nce->nce_lock);
2500 				return;
2501 			}
2502 			(void) mac_colon_addr((uint8_t *)h,
2503 			    (ill->ill_flags & ILLF_XRESOLV) ?
2504 			    dl->dl_dest_addr_length : ill->ill_nd_lla_len,
2505 			    addr_buf, addrlen);
2506 			mutex_exit(&nce->nce_lock);
2507 			(void) mi_mpprintf(mp, "%8s %17s %5s %s/%d",
2508 			    ill->ill_name, addr_buf, (uchar_t *)&flags_buf,
2509 			    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2510 				(char *)local_buf, sizeof (local_buf)),
2511 				ip_mask_to_plen_v6(&v6addr));
2512 			kmem_free(addr_buf, addrlen);
2513 		}
2514 	} else {
2515 		(void) mi_mpprintf(mp,
2516 		    "%8s %9s %5s %s/%d",
2517 		    ill->ill_name,
2518 		    "None",
2519 		    (uchar_t *)&flags_buf,
2520 		    inet_ntop(AF_INET6, (char *)&nce->nce_addr,
2521 			(char *)local_buf, sizeof (local_buf)),
2522 			ip_mask_to_plen_v6(&v6addr));
2523 	}
2524 }
2525 
2526 mblk_t *
2527 nce_udreq_alloc(ill_t *ill)
2528 {
2529 	mblk_t	*template_mp = NULL;
2530 	dl_unitdata_req_t *dlur;
2531 	int	sap_length;
2532 
2533 	ASSERT(ill->ill_isv6);
2534 
2535 	sap_length = ill->ill_sap_length;
2536 	template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
2537 	    ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
2538 	if (template_mp == NULL)
2539 		return (NULL);
2540 
2541 	dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
2542 	dlur->dl_priority.dl_min = 0;
2543 	dlur->dl_priority.dl_max = 0;
2544 	dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
2545 	dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
2546 
2547 	/* Copy in the SAP value. */
2548 	NCE_LL_SAP_COPY(ill, template_mp);
2549 
2550 	return (template_mp);
2551 }
2552 
2553 /*
2554  * NDP retransmit timer.
2555  * This timer goes off when:
2556  * a. It is time to retransmit NS for resolver.
2557  * b. It is time to send reachability probes.
2558  */
2559 void
2560 ndp_timer(void *arg)
2561 {
2562 	nce_t		*nce = arg;
2563 	ill_t		*ill = nce->nce_ill;
2564 	uint32_t	ms;
2565 	char		addrbuf[INET6_ADDRSTRLEN];
2566 	mblk_t		*mp;
2567 	boolean_t	dropped = B_FALSE;
2568 
2569 	/*
2570 	 * The timer has to be cancelled by ndp_delete before doing the final
2571 	 * refrele. So the NCE is guaranteed to exist when the timer runs
2572 	 * until it clears the timeout_id. Before clearing the timeout_id
2573 	 * bump up the refcnt so that we can continue to use the nce
2574 	 */
2575 	ASSERT(nce != NULL);
2576 
2577 	/*
2578 	 * Grab the ill_g_lock now itself to avoid lock order problems.
2579 	 * nce_solicit needs ill_g_lock to be able to traverse ills
2580 	 */
2581 	rw_enter(&ill_g_lock, RW_READER);
2582 	mutex_enter(&nce->nce_lock);
2583 	NCE_REFHOLD_LOCKED(nce);
2584 	nce->nce_timeout_id = 0;
2585 
2586 	/*
2587 	 * Check the reachability state first.
2588 	 */
2589 	switch (nce->nce_state) {
2590 	case ND_DELAY:
2591 		rw_exit(&ill_g_lock);
2592 		nce->nce_state = ND_PROBE;
2593 		mutex_exit(&nce->nce_lock);
2594 		(void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE,
2595 		    &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST);
2596 		if (ip_debug > 3) {
2597 			/* ip2dbg */
2598 			pr_addr_dbg("ndp_timer: state for %s changed "
2599 			    "to PROBE\n", AF_INET6, &nce->nce_addr);
2600 		}
2601 		NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2602 		NCE_REFRELE(nce);
2603 		return;
2604 	case ND_PROBE:
2605 		/* must be retransmit timer */
2606 		rw_exit(&ill_g_lock);
2607 		nce->nce_pcnt--;
2608 		ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
2609 		    nce->nce_pcnt >= -1);
2610 		if (nce->nce_pcnt > 0) {
2611 			/*
2612 			 * As per RFC2461, the nce gets deleted after
2613 			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2614 			 * Note that the first unicast solicitation is sent
2615 			 * during the DELAY state.
2616 			 */
2617 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2618 			    nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
2619 			    addrbuf, sizeof (addrbuf))));
2620 			mutex_exit(&nce->nce_lock);
2621 			dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL,
2622 			    B_FALSE, &ipv6_all_zeros, &nce->nce_addr,
2623 			    (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
2624 			    NDP_UNICAST);
2625 			if (dropped) {
2626 				mutex_enter(&nce->nce_lock);
2627 				nce->nce_pcnt++;
2628 				mutex_exit(&nce->nce_lock);
2629 			}
2630 			NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
2631 		} else if (nce->nce_pcnt < 0) {
2632 			/* No hope, delete the nce */
2633 			nce->nce_state = ND_UNREACHABLE;
2634 			mutex_exit(&nce->nce_lock);
2635 			if (ip_debug > 2) {
2636 				/* ip1dbg */
2637 				pr_addr_dbg("ndp_timer: Delete IRE for"
2638 				    " dst %s\n", AF_INET6, &nce->nce_addr);
2639 			}
2640 			ndp_delete(nce);
2641 		} else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
2642 			/* Wait RetransTimer, before deleting the entry */
2643 			ip2dbg(("ndp_timer: pcount=%x dst %s\n",
2644 			    nce->nce_pcnt, inet_ntop(AF_INET6,
2645 			    &nce->nce_addr, addrbuf, sizeof (addrbuf))));
2646 			mutex_exit(&nce->nce_lock);
2647 			/* Wait one interval before killing */
2648 			NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
2649 		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2650 			ipif_t *ipif;
2651 
2652 			/*
2653 			 * We're done probing, and we can now declare this
2654 			 * address to be usable.  Let IP know that it's ok to
2655 			 * use.
2656 			 */
2657 			nce->nce_state = ND_REACHABLE;
2658 			mutex_exit(&nce->nce_lock);
2659 			ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill,
2660 			    ALL_ZONES, NULL, NULL, NULL, NULL);
2661 			if (ipif != NULL) {
2662 				if (ipif->ipif_was_dup) {
2663 					char ibuf[LIFNAMSIZ + 10];
2664 					char sbuf[INET6_ADDRSTRLEN];
2665 
2666 					ipif->ipif_was_dup = B_FALSE;
2667 					(void) strlcpy(ibuf, ill->ill_name,
2668 					    sizeof (ibuf));
2669 					(void) inet_ntop(AF_INET6,
2670 					    &ipif->ipif_v6lcl_addr,
2671 					    sbuf, sizeof (sbuf));
2672 					if (ipif->ipif_id != 0) {
2673 						(void) snprintf(ibuf +
2674 						    ill->ill_name_length - 1,
2675 						    sizeof (ibuf) -
2676 						    ill->ill_name_length + 1,
2677 						    ":%d", ipif->ipif_id);
2678 					}
2679 					cmn_err(CE_NOTE, "recovered address "
2680 					    "%s on %s", sbuf, ibuf);
2681 				}
2682 				if ((ipif->ipif_flags & IPIF_UP) &&
2683 				    !ipif->ipif_addr_ready) {
2684 					ip_rts_ifmsg(ipif);
2685 					ip_rts_newaddrmsg(RTM_ADD, 0, ipif);
2686 					sctp_update_ipif(ipif, SCTP_IPIF_UP);
2687 				}
2688 				ipif->ipif_addr_ready = 1;
2689 				ipif_refrele(ipif);
2690 			}
2691 			/* Begin defending our new address */
2692 			nce->nce_unsolicit_count = 0;
2693 			dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill,
2694 			    B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast,
2695 			    NDP_ORIDE);
2696 			if (dropped) {
2697 				nce->nce_unsolicit_count = 1;
2698 				NDP_RESTART_TIMER(nce,
2699 				    ip_ndp_unsolicit_interval);
2700 			} else if (ip_ndp_defense_interval != 0) {
2701 				NDP_RESTART_TIMER(nce, ip_ndp_defense_interval);
2702 			}
2703 		} else {
2704 			/*
2705 			 * This is an address we're probing to be our own, but
2706 			 * the ill is down.  Wait until it comes back before
2707 			 * doing anything, but switch to reachable state so
2708 			 * that the restart will work.
2709 			 */
2710 			nce->nce_state = ND_REACHABLE;
2711 			mutex_exit(&nce->nce_lock);
2712 		}
2713 		NCE_REFRELE(nce);
2714 		return;
2715 	case ND_INCOMPLETE:
2716 		/*
2717 		 * Must be resolvers retransmit timer.
2718 		 */
2719 		for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) {
2720 			ip6i_t	*ip6i;
2721 			ip6_t	*ip6h;
2722 			mblk_t *data_mp;
2723 
2724 			/*
2725 			 * Walk the list of packets queued, and see if there
2726 			 * are any multipathing probe packets. Such packets
2727 			 * are always queued at the head. Since this is a
2728 			 * retransmit timer firing, mark such packets as
2729 			 * delayed in ND resolution. This info will be used
2730 			 * in ip_wput_v6(). Multipathing probe packets will
2731 			 * always have an ip6i_t. Once we hit a packet without
2732 			 * it, we can break out of this loop.
2733 			 */
2734 			if (mp->b_datap->db_type == M_CTL)
2735 				data_mp = mp->b_cont;
2736 			else
2737 				data_mp = mp;
2738 
2739 			ip6h = (ip6_t *)data_mp->b_rptr;
2740 			if (ip6h->ip6_nxt != IPPROTO_RAW)
2741 				break;
2742 
2743 			/*
2744 			 * This message should have been pulled up already in
2745 			 * ip_wput_v6. We can't do pullups here because the
2746 			 * b_next/b_prev is non-NULL.
2747 			 */
2748 			ip6i = (ip6i_t *)ip6h;
2749 			ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2750 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2751 
2752 			/* Mark this packet as delayed due to ND resolution */
2753 			if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
2754 				ip6i->ip6i_flags |= IP6I_ND_DELAYED;
2755 		}
2756 		if (nce->nce_qd_mp != NULL) {
2757 			ms = nce_solicit(nce, NULL);
2758 			rw_exit(&ill_g_lock);
2759 			if (ms == 0) {
2760 				if (nce->nce_state != ND_REACHABLE) {
2761 					mutex_exit(&nce->nce_lock);
2762 					nce_resolv_failed(nce);
2763 					ndp_delete(nce);
2764 				} else {
2765 					mutex_exit(&nce->nce_lock);
2766 				}
2767 			} else {
2768 				mutex_exit(&nce->nce_lock);
2769 				NDP_RESTART_TIMER(nce, (clock_t)ms);
2770 			}
2771 			NCE_REFRELE(nce);
2772 			return;
2773 		}
2774 		mutex_exit(&nce->nce_lock);
2775 		rw_exit(&ill_g_lock);
2776 		NCE_REFRELE(nce);
2777 		break;
2778 	case ND_REACHABLE :
2779 		rw_exit(&ill_g_lock);
2780 		if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
2781 		    nce->nce_unsolicit_count != 0) ||
2782 		    ((nce->nce_flags & NCE_F_PERMANENT) &&
2783 		    ip_ndp_defense_interval != 0)) {
2784 			if (nce->nce_unsolicit_count > 0)
2785 				nce->nce_unsolicit_count--;
2786 			mutex_exit(&nce->nce_lock);
2787 			dropped = nce_xmit(ill,
2788 			    ND_NEIGHBOR_ADVERT,
2789 			    ill,	/* ill to be used for hw addr */
2790 			    B_FALSE,	/* use ill_phys_addr */
2791 			    &nce->nce_addr,
2792 			    &ipv6_all_hosts_mcast,
2793 			    NDP_ORIDE);
2794 			if (dropped) {
2795 				mutex_enter(&nce->nce_lock);
2796 				nce->nce_unsolicit_count++;
2797 				mutex_exit(&nce->nce_lock);
2798 			}
2799 			if (nce->nce_unsolicit_count != 0) {
2800 				NDP_RESTART_TIMER(nce,
2801 				    ip_ndp_unsolicit_interval);
2802 			} else {
2803 				NDP_RESTART_TIMER(nce,
2804 				    ip_ndp_defense_interval);
2805 			}
2806 		} else {
2807 			mutex_exit(&nce->nce_lock);
2808 		}
2809 		NCE_REFRELE(nce);
2810 		break;
2811 	default:
2812 		rw_exit(&ill_g_lock);
2813 		mutex_exit(&nce->nce_lock);
2814 		NCE_REFRELE(nce);
2815 		break;
2816 	}
2817 }
2818 
2819 /*
2820  * Set a link layer address from the ll_addr passed in.
2821  * Copy SAP from ill.
2822  */
2823 static void
2824 nce_set_ll(nce_t *nce, uchar_t *ll_addr)
2825 {
2826 	ill_t	*ill = nce->nce_ill;
2827 	uchar_t	*woffset;
2828 
2829 	ASSERT(ll_addr != NULL);
2830 	/* Always called before fast_path_probe */
2831 	ASSERT(nce->nce_fp_mp == NULL);
2832 	if (ill->ill_sap_length != 0) {
2833 		/*
2834 		 * Copy the SAP type specified in the
2835 		 * request into the xmit template.
2836 		 */
2837 		NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
2838 	}
2839 	if (ill->ill_phys_addr_length > 0) {
2840 		/*
2841 		 * The bcopy() below used to be called for the physical address
2842 		 * length rather than the link layer address length. For
2843 		 * ethernet and many other media, the phys_addr and lla are
2844 		 * identical.
2845 		 * However, with xresolv interfaces being introduced, the
2846 		 * phys_addr and lla are no longer the same, and the physical
2847 		 * address may not have any useful meaning, so we use the lla
2848 		 * for IPv6 address resolution and destination addressing.
2849 		 *
2850 		 * For PPP or other interfaces with a zero length
2851 		 * physical address, don't do anything here.
2852 		 * The bcopy() with a zero phys_addr length was previously
2853 		 * a no-op for interfaces with a zero-length physical address.
2854 		 * Using the lla for them would change the way they operate.
2855 		 * Doing nothing in such cases preserves expected behavior.
2856 		 */
2857 		woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2858 		bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
2859 	}
2860 }
2861 
2862 static boolean_t
2863 nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
2864 {
2865 	ill_t	*ill = nce->nce_ill;
2866 	uchar_t	*ll_offset;
2867 
2868 	ASSERT(nce->nce_res_mp != NULL);
2869 	if (ll_addr == NULL)
2870 		return (B_FALSE);
2871 	ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
2872 	if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
2873 		return (B_TRUE);
2874 	return (B_FALSE);
2875 }
2876 
2877 /*
2878  * Updates the link layer address or the reachability state of
2879  * a cache entry.  Reset probe counter if needed.
2880  */
2881 static void
2882 nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
2883 {
2884 	ill_t	*ill = nce->nce_ill;
2885 	boolean_t need_stop_timer = B_FALSE;
2886 	boolean_t need_fastpath_update = B_FALSE;
2887 
2888 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2889 	ASSERT(nce->nce_ipversion == IPV6_VERSION);
2890 	/*
2891 	 * If this interface does not do NUD, there is no point
2892 	 * in allowing an update to the cache entry.  Although
2893 	 * we will respond to NS.
2894 	 * The only time we accept an update for a resolver when
2895 	 * NUD is turned off is when it has just been created.
2896 	 * Non-Resolvers will always be created as REACHABLE.
2897 	 */
2898 	if (new_state != ND_UNCHANGED) {
2899 		if ((nce->nce_flags & NCE_F_NONUD) &&
2900 		    (nce->nce_state != ND_INCOMPLETE))
2901 			return;
2902 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2903 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2904 		need_stop_timer = B_TRUE;
2905 		if (new_state == ND_REACHABLE)
2906 			nce->nce_last = TICK_TO_MSEC(lbolt64);
2907 		else {
2908 			/* We force NUD in this case */
2909 			nce->nce_last = 0;
2910 		}
2911 		nce->nce_state = new_state;
2912 		nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
2913 	}
2914 	/*
2915 	 * In case of fast path we need to free the the fastpath
2916 	 * M_DATA and do another probe.  Otherwise we can just
2917 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2918 	 * whatever packets that happens to be transmitting at the time.
2919 	 */
2920 	if (new_ll_addr != NULL) {
2921 		ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
2922 		    ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
2923 		bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
2924 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
2925 		if (nce->nce_fp_mp != NULL) {
2926 			freemsg(nce->nce_fp_mp);
2927 			nce->nce_fp_mp = NULL;
2928 		}
2929 		need_fastpath_update = B_TRUE;
2930 	}
2931 	mutex_exit(&nce->nce_lock);
2932 	if (need_stop_timer) {
2933 		(void) untimeout(nce->nce_timeout_id);
2934 		nce->nce_timeout_id = 0;
2935 	}
2936 	if (need_fastpath_update)
2937 		nce_fastpath(nce);
2938 	mutex_enter(&nce->nce_lock);
2939 }
2940 
2941 void
2942 nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
2943 {
2944 	uint_t	count = 0;
2945 	mblk_t  **mpp;
2946 
2947 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2948 
2949 	for (mpp = &nce->nce_qd_mp; *mpp != NULL;
2950 	    mpp = &(*mpp)->b_next) {
2951 		if (++count >
2952 		    nce->nce_ill->ill_max_buf) {
2953 			mblk_t *tmp = nce->nce_qd_mp->b_next;
2954 
2955 			nce->nce_qd_mp->b_next = NULL;
2956 			nce->nce_qd_mp->b_prev = NULL;
2957 			freemsg(nce->nce_qd_mp);
2958 			nce->nce_qd_mp = tmp;
2959 		}
2960 	}
2961 	/* put this on the list */
2962 	if (head_insert) {
2963 		mp->b_next = nce->nce_qd_mp;
2964 		nce->nce_qd_mp = mp;
2965 	} else {
2966 		*mpp = mp;
2967 	}
2968 }
2969 
2970 static void
2971 nce_queue_mp(nce_t *nce, mblk_t *mp)
2972 {
2973 	boolean_t head_insert = B_FALSE;
2974 	ip6_t	*ip6h;
2975 	ip6i_t	*ip6i;
2976 	mblk_t *data_mp;
2977 
2978 	ASSERT(MUTEX_HELD(&nce->nce_lock));
2979 
2980 	if (mp->b_datap->db_type == M_CTL)
2981 		data_mp = mp->b_cont;
2982 	else
2983 		data_mp = mp;
2984 	ip6h = (ip6_t *)data_mp->b_rptr;
2985 	if (ip6h->ip6_nxt == IPPROTO_RAW) {
2986 		/*
2987 		 * This message should have been pulled up already in
2988 		 * ip_wput_v6. We can't do pullups here because the message
2989 		 * could be from the nce_qd_mp which could have b_next/b_prev
2990 		 * non-NULL.
2991 		 */
2992 		ip6i = (ip6i_t *)ip6h;
2993 		ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >=
2994 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
2995 		/*
2996 		 * Multipathing probe packets have IP6I_DROP_IFDELAYED set.
2997 		 * This has 2 aspects mentioned below.
2998 		 * 1. Perform head insertion in the nce_qd_mp for these packets.
2999 		 * This ensures that next retransmit of ND solicitation
3000 		 * will use the interface specified by the probe packet,
3001 		 * for both NS and NA. This corresponds to the src address
3002 		 * in the IPv6 packet. If we insert at tail, we will be
3003 		 * depending on the packet at the head for successful
3004 		 * ND resolution. This is not reliable, because the interface
3005 		 * on which the NA arrives could be different from the interface
3006 		 * on which the NS was sent, and if the receiving interface is
3007 		 * failed, it will appear that the sending interface is also
3008 		 * failed, causing in.mpathd to misdiagnose this as link
3009 		 * failure.
3010 		 * 2. Drop the original packet, if the ND resolution did not
3011 		 * succeed in the first attempt. However we will create the
3012 		 * nce and the ire, as soon as the ND resolution succeeds.
3013 		 * We don't gain anything by queueing multiple probe packets
3014 		 * and sending them back-to-back once resolution succeeds.
3015 		 * It is sufficient to send just 1 packet after ND resolution
3016 		 * succeeds. Since mpathd is sending down probe packets at a
3017 		 * constant rate, we don't need to send the queued packet. We
3018 		 * need to queue it only for NDP resolution. The benefit of
3019 		 * dropping the probe packets that were delayed in ND
3020 		 * resolution, is that in.mpathd will not see inflated
3021 		 * RTT. If the ND resolution does not succeed within
3022 		 * in.mpathd's failure detection time, mpathd may detect
3023 		 * a failure, and it does not matter whether the packet
3024 		 * was queued or dropped.
3025 		 */
3026 		if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED)
3027 			head_insert = B_TRUE;
3028 	}
3029 
3030 	nce_queue_mp_common(nce, mp, head_insert);
3031 }
3032 
3033 /*
3034  * Called when address resolution failed due to a timeout.
3035  * Send an ICMP unreachable in response to all queued packets.
3036  */
3037 void
3038 nce_resolv_failed(nce_t *nce)
3039 {
3040 	mblk_t	*mp, *nxt_mp, *first_mp;
3041 	char	buf[INET6_ADDRSTRLEN];
3042 	ip6_t *ip6h;
3043 	zoneid_t zoneid = GLOBAL_ZONEID;
3044 
3045 	ip1dbg(("nce_resolv_failed: dst %s\n",
3046 	    inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
3047 	mutex_enter(&nce->nce_lock);
3048 	mp = nce->nce_qd_mp;
3049 	nce->nce_qd_mp = NULL;
3050 	mutex_exit(&nce->nce_lock);
3051 	while (mp != NULL) {
3052 		nxt_mp = mp->b_next;
3053 		mp->b_next = NULL;
3054 		mp->b_prev = NULL;
3055 
3056 		first_mp = mp;
3057 		if (mp->b_datap->db_type == M_CTL) {
3058 			ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
3059 			ASSERT(io->ipsec_out_type == IPSEC_OUT);
3060 			zoneid = io->ipsec_out_zoneid;
3061 			ASSERT(zoneid != ALL_ZONES);
3062 			mp = mp->b_cont;
3063 		}
3064 
3065 		ip6h = (ip6_t *)mp->b_rptr;
3066 		if (ip6h->ip6_nxt == IPPROTO_RAW) {
3067 			ip6i_t *ip6i;
3068 			/*
3069 			 * This message should have been pulled up already
3070 			 * in ip_wput_v6. ip_hdr_complete_v6 assumes that
3071 			 * the header is pulled up.
3072 			 */
3073 			ip6i = (ip6i_t *)ip6h;
3074 			ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
3075 			    sizeof (ip6i_t) + IPV6_HDR_LEN);
3076 			mp->b_rptr += sizeof (ip6i_t);
3077 		}
3078 		/*
3079 		 * Ignore failure since icmp_unreachable_v6 will silently
3080 		 * drop packets with an unspecified source address.
3081 		 */
3082 		(void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid);
3083 		icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
3084 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE);
3085 		mp = nxt_mp;
3086 	}
3087 }
3088 
3089 /*
3090  * Called by SIOCSNDP* ioctl to add/change an nce entry
3091  * and the corresponding attributes.
3092  * Disallow states other than ND_REACHABLE or ND_STALE.
3093  */
3094 int
3095 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
3096 {
3097 	sin6_t		*sin6;
3098 	in6_addr_t	*addr;
3099 	nce_t		*nce;
3100 	int		err;
3101 	uint16_t	new_flags = 0;
3102 	uint16_t	old_flags = 0;
3103 	int		inflags = lnr->lnr_flags;
3104 
3105 	ASSERT(ill->ill_isv6);
3106 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
3107 	    (lnr->lnr_state_create != ND_STALE))
3108 		return (EINVAL);
3109 
3110 	sin6 = (sin6_t *)&lnr->lnr_addr;
3111 	addr = &sin6->sin6_addr;
3112 
3113 	mutex_enter(&ndp6.ndp_g_lock);
3114 	/* We know it can not be mapping so just look in the hash table */
3115 	nce = *((nce_t **)NCE_HASH_PTR_V6(*addr));
3116 	nce = nce_lookup_addr(ill, addr, nce);
3117 	if (nce != NULL)
3118 		new_flags = nce->nce_flags;
3119 
3120 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
3121 	case NDF_ISROUTER_ON:
3122 		new_flags |= NCE_F_ISROUTER;
3123 		break;
3124 	case NDF_ISROUTER_OFF:
3125 		new_flags &= ~NCE_F_ISROUTER;
3126 		break;
3127 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
3128 		mutex_exit(&ndp6.ndp_g_lock);
3129 		if (nce != NULL)
3130 			NCE_REFRELE(nce);
3131 		return (EINVAL);
3132 	}
3133 
3134 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
3135 	case NDF_ANYCAST_ON:
3136 		new_flags |= NCE_F_ANYCAST;
3137 		break;
3138 	case NDF_ANYCAST_OFF:
3139 		new_flags &= ~NCE_F_ANYCAST;
3140 		break;
3141 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
3142 		mutex_exit(&ndp6.ndp_g_lock);
3143 		if (nce != NULL)
3144 			NCE_REFRELE(nce);
3145 		return (EINVAL);
3146 	}
3147 
3148 	switch (inflags & (NDF_PROXY_ON|NDF_PROXY_OFF)) {
3149 	case NDF_PROXY_ON:
3150 		new_flags |= NCE_F_PROXY;
3151 		break;
3152 	case NDF_PROXY_OFF:
3153 		new_flags &= ~NCE_F_PROXY;
3154 		break;
3155 	case (NDF_PROXY_OFF|NDF_PROXY_ON):
3156 		mutex_exit(&ndp6.ndp_g_lock);
3157 		if (nce != NULL)
3158 			NCE_REFRELE(nce);
3159 		return (EINVAL);
3160 	}
3161 
3162 	if (nce == NULL) {
3163 		err = ndp_add(ill,
3164 		    (uchar_t *)lnr->lnr_hdw_addr,
3165 		    addr,
3166 		    &ipv6_all_ones,
3167 		    &ipv6_all_zeros,
3168 		    0,
3169 		    new_flags,
3170 		    lnr->lnr_state_create,
3171 		    &nce,
3172 		    NULL,
3173 		    NULL);
3174 		if (err != 0) {
3175 			mutex_exit(&ndp6.ndp_g_lock);
3176 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3177 			return (err);
3178 		}
3179 	}
3180 	old_flags = nce->nce_flags;
3181 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3182 		/*
3183 		 * Router turned to host, delete all ires.
3184 		 * XXX Just delete the entry, but we need to add too.
3185 		 */
3186 		nce->nce_flags &= ~NCE_F_ISROUTER;
3187 		mutex_exit(&ndp6.ndp_g_lock);
3188 		ndp_delete(nce);
3189 		NCE_REFRELE(nce);
3190 		return (0);
3191 	}
3192 	mutex_exit(&ndp6.ndp_g_lock);
3193 
3194 	mutex_enter(&nce->nce_lock);
3195 	nce->nce_flags = new_flags;
3196 	mutex_exit(&nce->nce_lock);
3197 	/*
3198 	 * Note that we ignore the state at this point, which
3199 	 * should be either STALE or REACHABLE.  Instead we let
3200 	 * the link layer address passed in to determine the state
3201 	 * much like incoming packets.
3202 	 */
3203 	ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3204 	NCE_REFRELE(nce);
3205 	return (0);
3206 }
3207 
3208 /*
3209  * If the device driver supports it, we make nce_fp_mp to have
3210  * an M_DATA prepend.  Otherwise nce_fp_mp will be null.
3211  * The caller insures there is hold on nce for this function.
3212  * Note that since ill_fastpath_probe() copies the mblk there is
3213  * no need for the hold beyond this function.
3214  */
3215 static void
3216 nce_fastpath(nce_t *nce)
3217 {
3218 	ill_t	*ill = nce->nce_ill;
3219 	int res;
3220 
3221 	ASSERT(ill != NULL);
3222 	if (nce->nce_fp_mp != NULL) {
3223 		/* Already contains fastpath info */
3224 		return;
3225 	}
3226 	if (nce->nce_res_mp != NULL) {
3227 		nce_fastpath_list_add(nce);
3228 		res = ill_fastpath_probe(ill, nce->nce_res_mp);
3229 		/*
3230 		 * EAGAIN is an indication of a transient error
3231 		 * i.e. allocation failure etc. leave the nce in the list it
3232 		 * will be updated when another probe happens for another ire
3233 		 * if not it will be taken out of the list when the ire is
3234 		 * deleted.
3235 		 */
3236 
3237 		if (res != 0 && res != EAGAIN)
3238 			nce_fastpath_list_delete(nce);
3239 	}
3240 }
3241 
3242 /*
3243  * Drain the list of nce's waiting for fastpath response.
3244  */
3245 void
3246 nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void  *),
3247     void *arg)
3248 {
3249 
3250 	nce_t *next_nce;
3251 	nce_t *current_nce;
3252 	nce_t *first_nce;
3253 	nce_t *prev_nce = NULL;
3254 
3255 	ASSERT(ill != NULL && ill->ill_isv6);
3256 
3257 	mutex_enter(&ill->ill_lock);
3258 	first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
3259 	while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
3260 		next_nce = current_nce->nce_fastpath;
3261 		/*
3262 		 * Take it off the list if we're flushing, or if the callback
3263 		 * routine tells us to do so.  Otherwise, leave the nce in the
3264 		 * fastpath list to handle any pending response from the lower
3265 		 * layer.  We can't drain the list when the callback routine
3266 		 * comparison failed, because the response is asynchronous in
3267 		 * nature, and may not arrive in the same order as the list
3268 		 * insertion.
3269 		 */
3270 		if (func == NULL || func(current_nce, arg)) {
3271 			current_nce->nce_fastpath = NULL;
3272 			if (current_nce == first_nce)
3273 				ill->ill_fastpath_list = first_nce = next_nce;
3274 			else
3275 				prev_nce->nce_fastpath = next_nce;
3276 		} else {
3277 			/* previous element that is still in the list */
3278 			prev_nce = current_nce;
3279 		}
3280 		current_nce = next_nce;
3281 	}
3282 	mutex_exit(&ill->ill_lock);
3283 }
3284 
3285 /*
3286  * Add nce to the nce fastpath list.
3287  */
3288 void
3289 nce_fastpath_list_add(nce_t *nce)
3290 {
3291 	ill_t *ill;
3292 
3293 	ill = nce->nce_ill;
3294 	ASSERT(ill != NULL && ill->ill_isv6);
3295 
3296 	mutex_enter(&ill->ill_lock);
3297 	mutex_enter(&nce->nce_lock);
3298 
3299 	/*
3300 	 * if nce has not been deleted and
3301 	 * is not already in the list add it.
3302 	 */
3303 	if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
3304 	    (nce->nce_fastpath == NULL)) {
3305 		nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
3306 		ill->ill_fastpath_list = nce;
3307 	}
3308 
3309 	mutex_exit(&nce->nce_lock);
3310 	mutex_exit(&ill->ill_lock);
3311 }
3312 
3313 /*
3314  * remove nce from the nce fastpath list.
3315  */
3316 void
3317 nce_fastpath_list_delete(nce_t *nce)
3318 {
3319 	nce_t *nce_ptr;
3320 
3321 	ill_t *ill;
3322 
3323 	ill = nce->nce_ill;
3324 	ASSERT(ill != NULL);
3325 	if (!ill->ill_isv6)  {
3326 		/*
3327 		 * v4 nce_t's do not have nce_fastpath set.
3328 		 */
3329 		return;
3330 	}
3331 
3332 	mutex_enter(&ill->ill_lock);
3333 	if (nce->nce_fastpath == NULL)
3334 		goto done;
3335 
3336 	ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
3337 
3338 	if (ill->ill_fastpath_list == nce) {
3339 		ill->ill_fastpath_list = nce->nce_fastpath;
3340 	} else {
3341 		nce_ptr = ill->ill_fastpath_list;
3342 		while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
3343 			if (nce_ptr->nce_fastpath == nce) {
3344 				nce_ptr->nce_fastpath = nce->nce_fastpath;
3345 				break;
3346 			}
3347 			nce_ptr = nce_ptr->nce_fastpath;
3348 		}
3349 	}
3350 
3351 	nce->nce_fastpath = NULL;
3352 done:
3353 	mutex_exit(&ill->ill_lock);
3354 }
3355 
3356 /*
3357  * Update all NCE's that are not in fastpath mode and
3358  * have an nce_fp_mp that matches mp. mp->b_cont contains
3359  * the fastpath header.
3360  *
3361  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3362  */
3363 boolean_t
3364 ndp_fastpath_update(nce_t *nce, void *arg)
3365 {
3366 	mblk_t 	*mp, *fp_mp;
3367 	uchar_t	*mp_rptr, *ud_mp_rptr;
3368 	mblk_t	*ud_mp = nce->nce_res_mp;
3369 	ptrdiff_t	cmplen;
3370 
3371 	if (nce->nce_flags & NCE_F_MAPPING)
3372 		return (B_TRUE);
3373 	if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
3374 		return (B_TRUE);
3375 
3376 	ip2dbg(("ndp_fastpath_update: trying\n"));
3377 	mp = (mblk_t *)arg;
3378 	mp_rptr = mp->b_rptr;
3379 	cmplen = mp->b_wptr - mp_rptr;
3380 	ASSERT(cmplen >= 0);
3381 	ud_mp_rptr = ud_mp->b_rptr;
3382 	/*
3383 	 * The nce is locked here to prevent any other threads
3384 	 * from accessing and changing nce_res_mp when the IPv6 address
3385 	 * becomes resolved to an lla while we're in the middle
3386 	 * of looking at and comparing the hardware address (lla).
3387 	 * It is also locked to prevent multiple threads in nce_fastpath_update
3388 	 * from examining nce_res_mp atthe same time.
3389 	 */
3390 	mutex_enter(&nce->nce_lock);
3391 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3392 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
3393 		mutex_exit(&nce->nce_lock);
3394 		/*
3395 		 * Don't take the ire off the fastpath list yet,
3396 		 * since the response may come later.
3397 		 */
3398 		return (B_FALSE);
3399 	}
3400 	/* Matched - install mp as the fastpath mp */
3401 	ip1dbg(("ndp_fastpath_update: match\n"));
3402 	fp_mp = dupb(mp->b_cont);
3403 	if (fp_mp != NULL) {
3404 		nce->nce_fp_mp = fp_mp;
3405 	}
3406 	mutex_exit(&nce->nce_lock);
3407 	return (B_TRUE);
3408 }
3409 
3410 /*
3411  * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
3412  * driver.  Note that it assumes IP is exclusive...
3413  */
3414 /* ARGSUSED */
3415 void
3416 ndp_fastpath_flush(nce_t *nce, char *arg)
3417 {
3418 	if (nce->nce_flags & NCE_F_MAPPING)
3419 		return;
3420 	/* No fastpath info? */
3421 	if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
3422 		return;
3423 
3424 	/* Just delete the NCE... */
3425 	ndp_delete(nce);
3426 }
3427 
3428 /*
3429  * Return a pointer to a given option in the packet.
3430  * Assumes that option part of the packet have already been validated.
3431  */
3432 nd_opt_hdr_t *
3433 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3434 {
3435 	while (optlen > 0) {
3436 		if (opt->nd_opt_type == opt_type)
3437 			return (opt);
3438 		optlen -= 8 * opt->nd_opt_len;
3439 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3440 	}
3441 	return (NULL);
3442 }
3443 
3444 /*
3445  * Verify all option lengths present are > 0, also check to see
3446  * if the option lengths and packet length are consistent.
3447  */
3448 boolean_t
3449 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3450 {
3451 	ASSERT(opt != NULL);
3452 	while (optlen > 0) {
3453 		if (opt->nd_opt_len == 0)
3454 			return (B_FALSE);
3455 		optlen -= 8 * opt->nd_opt_len;
3456 		if (optlen < 0)
3457 			return (B_FALSE);
3458 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3459 	}
3460 	return (B_TRUE);
3461 }
3462 
3463 /*
3464  * ndp_walk function.
3465  * Free a fraction of the NCE cache entries.
3466  * A fraction of zero means to not free any in that category.
3467  */
3468 void
3469 ndp_cache_reclaim(nce_t *nce, char *arg)
3470 {
3471 	nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
3472 	uint_t	rand;
3473 
3474 	if (nce->nce_flags & NCE_F_PERMANENT)
3475 		return;
3476 
3477 	rand = (uint_t)lbolt +
3478 	    NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
3479 	if (ncr->ncr_host != 0 &&
3480 	    (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
3481 		ndp_delete(nce);
3482 		return;
3483 	}
3484 }
3485 
3486 /*
3487  * ndp_walk function.
3488  * Count the number of NCEs that can be deleted.
3489  * These would be hosts but not routers.
3490  */
3491 void
3492 ndp_cache_count(nce_t *nce, char *arg)
3493 {
3494 	ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
3495 
3496 	if (nce->nce_flags & NCE_F_PERMANENT)
3497 		return;
3498 
3499 	ncc->ncc_total++;
3500 	if (!(nce->nce_flags & NCE_F_ISROUTER))
3501 		ncc->ncc_host++;
3502 }
3503 
3504 #ifdef NCE_DEBUG
3505 th_trace_t *
3506 th_trace_nce_lookup(nce_t *nce)
3507 {
3508 	int bucket_id;
3509 	th_trace_t *th_trace;
3510 
3511 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3512 
3513 	bucket_id = IP_TR_HASH(curthread);
3514 	ASSERT(bucket_id < IP_TR_HASH_MAX);
3515 
3516 	for (th_trace = nce->nce_trace[bucket_id]; th_trace != NULL;
3517 	    th_trace = th_trace->th_next) {
3518 		if (th_trace->th_id == curthread)
3519 			return (th_trace);
3520 	}
3521 	return (NULL);
3522 }
3523 
3524 void
3525 nce_trace_ref(nce_t *nce)
3526 {
3527 	int bucket_id;
3528 	th_trace_t *th_trace;
3529 
3530 	/*
3531 	 * Attempt to locate the trace buffer for the curthread.
3532 	 * If it does not exist, then allocate a new trace buffer
3533 	 * and link it in list of trace bufs for this ipif, at the head
3534 	 */
3535 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3536 
3537 	if (nce->nce_trace_disable == B_TRUE)
3538 		return;
3539 
3540 	th_trace = th_trace_nce_lookup(nce);
3541 	if (th_trace == NULL) {
3542 		bucket_id = IP_TR_HASH(curthread);
3543 		th_trace = (th_trace_t *)kmem_zalloc(sizeof (th_trace_t),
3544 		    KM_NOSLEEP);
3545 		if (th_trace == NULL) {
3546 			nce->nce_trace_disable = B_TRUE;
3547 			nce_trace_inactive(nce);
3548 			return;
3549 		}
3550 		th_trace->th_id = curthread;
3551 		th_trace->th_next = nce->nce_trace[bucket_id];
3552 		th_trace->th_prev = &nce->nce_trace[bucket_id];
3553 		if (th_trace->th_next != NULL)
3554 			th_trace->th_next->th_prev = &th_trace->th_next;
3555 		nce->nce_trace[bucket_id] = th_trace;
3556 	}
3557 	ASSERT(th_trace->th_refcnt < TR_BUF_MAX - 1);
3558 	th_trace->th_refcnt++;
3559 	th_trace_rrecord(th_trace);
3560 }
3561 
3562 void
3563 nce_untrace_ref(nce_t *nce)
3564 {
3565 	th_trace_t *th_trace;
3566 
3567 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3568 
3569 	if (nce->nce_trace_disable == B_TRUE)
3570 		return;
3571 
3572 	th_trace = th_trace_nce_lookup(nce);
3573 	ASSERT(th_trace != NULL && th_trace->th_refcnt > 0);
3574 
3575 	th_trace_rrecord(th_trace);
3576 	th_trace->th_refcnt--;
3577 }
3578 
3579 void
3580 nce_trace_inactive(nce_t *nce)
3581 {
3582 	th_trace_t *th_trace;
3583 	int i;
3584 
3585 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3586 
3587 	for (i = 0; i < IP_TR_HASH_MAX; i++) {
3588 		while (nce->nce_trace[i] != NULL) {
3589 			th_trace = nce->nce_trace[i];
3590 
3591 			/* unlink th_trace and free it */
3592 			nce->nce_trace[i] = th_trace->th_next;
3593 			if (th_trace->th_next != NULL)
3594 				th_trace->th_next->th_prev =
3595 				    &nce->nce_trace[i];
3596 
3597 			th_trace->th_next = NULL;
3598 			th_trace->th_prev = NULL;
3599 			kmem_free(th_trace, sizeof (th_trace_t));
3600 		}
3601 	}
3602 
3603 }
3604 
3605 /* ARGSUSED */
3606 int
3607 nce_thread_exit(nce_t *nce, caddr_t arg)
3608 {
3609 	th_trace_t	*th_trace;
3610 
3611 	mutex_enter(&nce->nce_lock);
3612 	th_trace = th_trace_nce_lookup(nce);
3613 
3614 	if (th_trace == NULL) {
3615 		mutex_exit(&nce->nce_lock);
3616 		return (0);
3617 	}
3618 
3619 	ASSERT(th_trace->th_refcnt == 0);
3620 
3621 	/* unlink th_trace and free it */
3622 	*th_trace->th_prev = th_trace->th_next;
3623 	if (th_trace->th_next != NULL)
3624 		th_trace->th_next->th_prev = th_trace->th_prev;
3625 	th_trace->th_next = NULL;
3626 	th_trace->th_prev = NULL;
3627 	kmem_free(th_trace, sizeof (th_trace_t));
3628 	mutex_exit(&nce->nce_lock);
3629 	return (0);
3630 }
3631 #endif
3632 
3633 /*
3634  * Called when address resolution fails due to a timeout.
3635  * Send an ICMP unreachable in response to all queued packets.
3636  */
3637 void
3638 arp_resolv_failed(nce_t *nce)
3639 {
3640 	mblk_t	*mp, *nxt_mp, *first_mp;
3641 	char	buf[INET6_ADDRSTRLEN];
3642 	zoneid_t zoneid = GLOBAL_ZONEID;
3643 	struct in_addr ipv4addr;
3644 
3645 	IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
3646 	ip3dbg(("arp_resolv_failed: dst %s\n",
3647 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3648 	mutex_enter(&nce->nce_lock);
3649 	mp = nce->nce_qd_mp;
3650 	nce->nce_qd_mp = NULL;
3651 	mutex_exit(&nce->nce_lock);
3652 
3653 	while (mp != NULL) {
3654 		nxt_mp = mp->b_next;
3655 		mp->b_next = NULL;
3656 		mp->b_prev = NULL;
3657 
3658 		first_mp = mp;
3659 		/*
3660 		 * Send icmp unreachable messages
3661 		 * to the hosts.
3662 		 */
3663 		(void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid);
3664 		ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
3665 		icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
3666 		    ICMP_HOST_UNREACHABLE);
3667 		mp = nxt_mp;
3668 	}
3669 }
3670 
3671 static int
3672 ndp_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3673     const in_addr_t *mask, const in_addr_t *extract_mask,
3674     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3675     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3676 {
3677 	int	err = 0;
3678 	nce_t	*nce;
3679 	in6_addr_t addr6;
3680 
3681 	mutex_enter(&ndp4.ndp_g_lock);
3682 	nce = *((nce_t **)NCE_HASH_PTR_V4(*addr));
3683 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3684 	nce = nce_lookup_addr(ill, &addr6, nce);
3685 	if (nce == NULL) {
3686 		err = ndp_add_v4(ill,
3687 		    hw_addr,
3688 		    addr,
3689 		    mask,
3690 		    extract_mask,
3691 		    hw_extract_start,
3692 		    flags,
3693 		    state,
3694 		    newnce,
3695 		    fp_mp,
3696 		    res_mp);
3697 	} else {
3698 		*newnce = nce;
3699 		err = EEXIST;
3700 	}
3701 	mutex_exit(&ndp4.ndp_g_lock);
3702 	return (err);
3703 }
3704 
3705 /*
3706  * NDP Cache Entry creation routine for IPv4.
3707  * Mapped entries are handled in arp.
3708  * This routine must always be called with ndp4.ndp_g_lock held.
3709  * Prior to return, nce_refcnt is incremented.
3710  */
3711 static int
3712 ndp_add_v4(ill_t *ill, uchar_t *hw_addr, const in_addr_t *addr,
3713     const in_addr_t *mask, const in_addr_t *extract_mask,
3714     uint32_t hw_extract_start, uint16_t flags, uint16_t state,
3715     nce_t **newnce, mblk_t *fp_mp, mblk_t *res_mp)
3716 {
3717 	static	nce_t		nce_nil;
3718 	nce_t		*nce;
3719 	mblk_t		*mp;
3720 	mblk_t		*template;
3721 	nce_t		**ncep;
3722 
3723 	ASSERT(MUTEX_HELD(&ndp4.ndp_g_lock));
3724 	ASSERT(ill != NULL);
3725 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
3726 		return (EINVAL);
3727 	}
3728 	ASSERT((flags & NCE_F_MAPPING) == 0);
3729 	ASSERT(extract_mask == NULL);
3730 	/*
3731 	 * Allocate the mblk to hold the nce.
3732 	 */
3733 	mp = allocb(sizeof (nce_t), BPRI_MED);
3734 	if (mp == NULL)
3735 		return (ENOMEM);
3736 
3737 	nce = (nce_t *)mp->b_rptr;
3738 	mp->b_wptr = (uchar_t *)&nce[1];
3739 	*nce = nce_nil;
3740 
3741 	/*
3742 	 * This one holds link layer address; if res_mp has been provided
3743 	 * by the caller, accept it without any further checks. Otherwise,
3744 	 * for V4, we fill it up with ill_resolver_mp here, then in
3745 	 * in ire_arpresolve(), we fill it up with the ARP query
3746 	 * once its formulated.
3747 	 */
3748 	if (res_mp != NULL) {
3749 		template = res_mp;
3750 	} else  {
3751 		template = copyb(ill->ill_resolver_mp);
3752 	}
3753 	if (template == NULL) {
3754 		freeb(mp);
3755 		return (ENOMEM);
3756 	}
3757 	nce->nce_ill = ill;
3758 	nce->nce_ipversion = IPV4_VERSION;
3759 	nce->nce_flags = flags;
3760 	nce->nce_state = state;
3761 	nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
3762 	nce->nce_rcnt = ill->ill_xmit_count;
3763 	IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
3764 	if (*mask == IP_HOST_MASK) {
3765 		nce->nce_mask = ipv6_all_ones;
3766 	} else  {
3767 		IN6_IPADDR_TO_V4MAPPED(*mask, &nce->nce_mask);
3768 	}
3769 	nce->nce_extract_mask = ipv6_all_zeros;
3770 	nce->nce_ll_extract_start = hw_extract_start;
3771 	nce->nce_fp_mp = (fp_mp? fp_mp : NULL);
3772 	nce->nce_res_mp = template;
3773 	if (state == ND_REACHABLE)
3774 		nce->nce_last = TICK_TO_MSEC(lbolt64);
3775 	else
3776 		nce->nce_last = 0;
3777 	nce->nce_qd_mp = NULL;
3778 	nce->nce_mp = mp;
3779 	if (hw_addr != NULL)
3780 		nce_set_ll(nce, hw_addr);
3781 	/* This one is for nce getting created */
3782 	nce->nce_refcnt = 1;
3783 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
3784 	ncep = ((nce_t **)NCE_HASH_PTR_V4(*addr));
3785 
3786 #ifdef NCE_DEBUG
3787 	bzero(nce->nce_trace, sizeof (th_trace_t *) * IP_TR_HASH_MAX);
3788 #endif
3789 	/*
3790 	 * Atomically ensure that the ill is not CONDEMNED, before
3791 	 * adding the NCE.
3792 	 */
3793 	mutex_enter(&ill->ill_lock);
3794 	if (ill->ill_state_flags & ILL_CONDEMNED) {
3795 		mutex_exit(&ill->ill_lock);
3796 		freeb(mp);
3797 		if (res_mp == NULL) {
3798 			/*
3799 			 * template was locally allocated. need to free it.
3800 			 */
3801 			freeb(template);
3802 		}
3803 		return (EINVAL);
3804 	}
3805 	if ((nce->nce_next = *ncep) != NULL)
3806 		nce->nce_next->nce_ptpn = &nce->nce_next;
3807 	*ncep = nce;
3808 	nce->nce_ptpn = ncep;
3809 	*newnce = nce;
3810 	/* This one is for nce being used by an active thread */
3811 	NCE_REFHOLD(*newnce);
3812 
3813 	/* Bump up the number of nce's referencing this ill */
3814 	ill->ill_nce_cnt++;
3815 	mutex_exit(&ill->ill_lock);
3816 	return (0);
3817 }
3818 
3819 void
3820 ndp_flush_qd_mp(nce_t *nce)
3821 {
3822 	mblk_t *qd_mp, *qd_next;
3823 
3824 	ASSERT(MUTEX_HELD(&nce->nce_lock));
3825 	qd_mp = nce->nce_qd_mp;
3826 	nce->nce_qd_mp = NULL;
3827 	while (qd_mp != NULL) {
3828 		qd_next = qd_mp->b_next;
3829 		qd_mp->b_next = NULL;
3830 		qd_mp->b_prev = NULL;
3831 		freemsg(qd_mp);
3832 		qd_mp = qd_next;
3833 	}
3834 }
3835 
3836 nce_t *
3837 nce_reinit(nce_t *nce)
3838 {
3839 	nce_t *newnce = NULL;
3840 	in_addr_t nce_addr, nce_mask;
3841 
3842 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3843 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_mask, nce_mask);
3844 	/*
3845 	 * delete the old one. this will get rid of any ire's pointing
3846 	 * at this nce.
3847 	 */
3848 	ndp_delete(nce);
3849 	/*
3850 	 * create a new nce with the same addr and mask.
3851 	 */
3852 	mutex_enter(&ndp4.ndp_g_lock);
3853 	(void) ndp_add_v4(nce->nce_ill, NULL, &nce_addr, &nce_mask, NULL, 0, 0,
3854 	    ND_INITIAL, &newnce, NULL, NULL);
3855 	mutex_exit(&ndp4.ndp_g_lock);
3856 	/*
3857 	 * refrele the old nce.
3858 	 */
3859 	NCE_REFRELE(nce);
3860 	return (newnce);
3861 }
3862 
3863 /*
3864  * ndp_walk routine to delete all entries that have a given destination or
3865  * gateway address and cached link layer (MAC) address.  This is used when ARP
3866  * informs us that a network-to-link-layer mapping may have changed.
3867  */
3868 void
3869 nce_delete_hw_changed(nce_t *nce, void *arg)
3870 {
3871 	nce_hw_map_t *hwm = arg;
3872 	mblk_t *mp;
3873 	dl_unitdata_req_t *dlu;
3874 	uchar_t *macaddr;
3875 	ill_t *ill;
3876 	int saplen;
3877 	ipaddr_t nce_addr;
3878 
3879 	if (nce->nce_state != ND_REACHABLE)
3880 		return;
3881 
3882 	IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
3883 	if (nce_addr != hwm->hwm_addr)
3884 		return;
3885 
3886 	mutex_enter(&nce->nce_lock);
3887 	if ((mp = nce->nce_res_mp) == NULL) {
3888 		mutex_exit(&nce->nce_lock);
3889 		return;
3890 	}
3891 	dlu = (dl_unitdata_req_t *)mp->b_rptr;
3892 	macaddr = (uchar_t *)(dlu + 1);
3893 	ill = nce->nce_ill;
3894 	if ((saplen = ill->ill_sap_length) > 0)
3895 		macaddr += saplen;
3896 	else
3897 		saplen = -saplen;
3898 
3899 	/*
3900 	 * If the hardware address is unchanged, then leave this one alone.
3901 	 * Note that saplen == abs(saplen) now.
3902 	 */
3903 	if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
3904 	    bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
3905 		mutex_exit(&nce->nce_lock);
3906 		return;
3907 	}
3908 	mutex_exit(&nce->nce_lock);
3909 
3910 	DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
3911 	ndp_delete(nce);
3912 }
3913 
3914 /*
3915  * This function verifies whether a given IPv4 address is potentially known to
3916  * the NCE subsystem.  If so, then ARP must not delete the corresponding ace_t,
3917  * so that it can continue to look for hardware changes on that address.
3918  */
3919 boolean_t
3920 ndp_lookup_ipaddr(in_addr_t addr)
3921 {
3922 	nce_t		*nce;
3923 	struct in_addr	nceaddr;
3924 
3925 	if (addr == INADDR_ANY)
3926 		return (B_FALSE);
3927 
3928 	mutex_enter(&ndp4.ndp_g_lock);
3929 	nce = *(nce_t **)NCE_HASH_PTR_V4(addr);
3930 	for (; nce != NULL; nce = nce->nce_next) {
3931 		/* Note that only v4 mapped entries are in the table. */
3932 		IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
3933 		if (addr == nceaddr.s_addr &&
3934 		    IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
3935 			/* Single flag check; no lock needed */
3936 			if (!(nce->nce_flags & NCE_F_CONDEMNED))
3937 				break;
3938 		}
3939 	}
3940 	mutex_exit(&ndp4.ndp_g_lock);
3941 	return (nce != NULL);
3942 }
3943