1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/types.h>
26 #include <sys/stream.h>
27 #include <sys/stropts.h>
28 #include <sys/strsun.h>
29 #include <sys/sysmacros.h>
30 #include <sys/errno.h>
31 #include <sys/dlpi.h>
32 #include <sys/socket.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/vtrace.h>
38 #include <sys/kmem.h>
39 #include <sys/zone.h>
40 #include <sys/ethernet.h>
41 #include <sys/sdt.h>
42 #include <sys/mac.h>
43
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/sctp_ip.h>
65 #include <inet/ip_arp.h>
66 #include <inet/ip2mac_impl.h>
67
/*
 * Select the per-stack NDP unsolicit interval (isv6 non-zero) or the ARP
 * publish interval (isv6 zero).
 *
 * NOTE: the expansion references a variable named `ipst'; one must be in
 * scope wherever this macro is used.  The argument is parenthesized so that
 * any expression (including another conditional) can be passed safely.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/*
 * Select the per-stack NDP defend interval (isv6 non-zero) or the ARP
 * defend interval (isv6 zero).  Same `ipst' requirement as above.
 */
#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)
75
/*
 * Probe interval derived from the link, not from a tunable: 150 when the
 * ill has ill_note_link set, 1500 otherwise.
 */
#define	ILL_PROBE_INTERVAL(ill)	(!(ill)->ill_note_link ? 1500 : 150)
78
/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * True when the first two bytes at `ptr' are 169 and 254.  The argument is
 * parenthesized before the cast so that pointer expressions such as `p + 1'
 * are evaluated in the caller's pointer type first and only then viewed as
 * bytes.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
	((uchar_t *)(ptr))[1] == 254)
85
/*
 * The set of ncec_flags bits that callers are allowed to hand to the
 * ncec*add* functions.
 *
 * Two of them deserve a note:
 *   NCE_F_AUTHORITY - incoming adverts for the mapping are ignored (DAD is
 *                     still performed for it).
 *   NCE_F_PUBLISH   - we answer requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK	( \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC | NCE_F_MYADDR | \
	NCE_F_ANYCAST | NCE_F_ISROUTER | NCE_F_NONUD | NCE_F_UNSOL_ADV | \
	NCE_F_BCAST | NCE_F_MCAST)
98
99 /*
100 * Lock ordering:
101 *
102 * ndp_g_lock -> ill_lock -> ncec_lock
103 *
104 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
105 * ncec_next. ncec_lock protects the contents of the NCE (particularly
106 * ncec_refcnt).
107 */
108
109 static void nce_cleanup_list(ncec_t *ncec);
110 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
111 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
112 ncec_t *);
113 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
114 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
115 uint16_t ncec_flags, nce_t **newnce);
116 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
117 uint16_t ncec_flags, nce_t **newnce);
118 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
119 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
120 const in6_addr_t *target, int flag);
121 static void ncec_refhold_locked(ncec_t *);
122 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
123 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
124 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
125 uint16_t, uint16_t, nce_t **);
126 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
127 static nce_t *nce_add(ill_t *, ncec_t *);
128 static void nce_inactive(nce_t *);
129 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
130 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
131 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
132 uint16_t, uint16_t, nce_t **);
133 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
134 uint16_t, uint16_t, nce_t **);
135 static int nce_add_v6_postprocess(nce_t *);
136 static int nce_add_v4_postprocess(nce_t *);
137 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
138 static clock_t nce_fuzz_interval(clock_t, boolean_t);
139 static void nce_resolv_ipmp_ok(ncec_t *);
140 static void nce_walk_common(ill_t *, pfi_t, void *);
141 static void nce_start_timer(ncec_t *, uint_t);
142 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
143 static void nce_fastpath_trigger(nce_t *);
144 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
145
146 #ifdef DEBUG
147 static void ncec_trace_cleanup(const ncec_t *);
148 #endif
149
/*
 * Address of the bucket in the IPv4 neighbor cache hash table
 * (ipst->ips_ndp4->nce_hash_tbl) that `addr' hashes to.  The bucket index
 * is IRE_ADDR_HASH(addr, NCE_TABLE_SIZE).  Callers in this file pass the
 * address by value (e.g. `*addr') and dereference the result to obtain the
 * head of the chain.
 */
#define	NCE_HASH_PTR_V4(ipst, addr) \
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * IPv6 counterpart: bucket address within ipst->ips_ndp6->nce_hash_tbl,
 * indexed by NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE).
 */
#define	NCE_HASH_PTR_V6(ipst, addr) \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
	NCE_TABLE_SIZE)]))
156
157 extern kmem_cache_t *ncec_cache;
158 extern kmem_cache_t *nce_cache;
159
160 /*
161 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
162 * If src_ill is not null, the ncec_addr is bound to src_ill. The
163 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
164 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
165 * IPMP cast_ill (in the IPMP case).
166 *
167 * Note that the probe interval is based on the src_ill for IPv6, and
168 * the ncec_xmit_interval for IPv4.
169 */
170 static void
nce_dad(ncec_t * ncec,ill_t * src_ill,boolean_t send_probe)171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
172 {
173 boolean_t dropped;
174 uint32_t probe_interval;
175
176 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
177 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
178 if (ncec->ncec_ipversion == IPV6_VERSION) {
179 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
180 ncec->ncec_lladdr, ncec->ncec_lladdr_length,
181 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
182 probe_interval = ILL_PROBE_INTERVAL(src_ill);
183 } else {
184 /* IPv4 DAD delay the initial probe. */
185 if (send_probe)
186 dropped = arp_probe(ncec);
187 else
188 dropped = B_TRUE;
189 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
190 !send_probe);
191 }
192 if (!dropped) {
193 mutex_enter(&ncec->ncec_lock);
194 ncec->ncec_pcnt--;
195 mutex_exit(&ncec->ncec_lock);
196 }
197 nce_restart_timer(ncec, probe_interval);
198 }
199
200 /*
201 * Compute default flags to use for an advertisement of this ncec's address.
202 */
203 static int
nce_advert_flags(const ncec_t * ncec)204 nce_advert_flags(const ncec_t *ncec)
205 {
206 int flag = 0;
207
208 if (ncec->ncec_flags & NCE_F_ISROUTER)
209 flag |= NDP_ISROUTER;
210 if (!(ncec->ncec_flags & NCE_F_ANYCAST))
211 flag |= NDP_ORIDE;
212
213 return (flag);
214 }
215
216 /*
217 * NDP Cache Entry creation routine.
218 * This routine must always be called with ndp6->ndp_g_lock held.
219 */
220 int
nce_add_v6(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
222 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
223 {
224 int err;
225 nce_t *nce;
226
227 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
228 ASSERT(ill != NULL && ill->ill_isv6);
229
230 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
231 &nce);
232 if (err != 0)
233 return (err);
234 ASSERT(newnce != NULL);
235 *newnce = nce;
236 return (err);
237 }
238
239 /*
240 * Post-processing routine to be executed after nce_add_v6(). This function
241 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
242 * and must be called without any locks held.
243 */
244 int
nce_add_v6_postprocess(nce_t * nce)245 nce_add_v6_postprocess(nce_t *nce)
246 {
247 ncec_t *ncec = nce->nce_common;
248 boolean_t dropped = B_FALSE;
249 uchar_t *hw_addr = ncec->ncec_lladdr;
250 uint_t hw_addr_len = ncec->ncec_lladdr_length;
251 ill_t *ill = ncec->ncec_ill;
252 int err = 0;
253 uint16_t flags = ncec->ncec_flags;
254 ip_stack_t *ipst = ill->ill_ipst;
255 boolean_t trigger_fastpath = B_TRUE;
256
257 /*
258 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
259 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
260 * We call nce_fastpath from nce_update if the link layer address of
261 * the peer changes from nce_update
262 */
263 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
264 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
265 trigger_fastpath = B_FALSE;
266
267 if (trigger_fastpath)
268 nce_fastpath_trigger(nce);
269 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
270 ill_t *hwaddr_ill;
271 /*
272 * Unicast entry that needs DAD.
273 */
274 if (IS_IPMP(ill)) {
275 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
276 hw_addr, hw_addr_len);
277 } else {
278 hwaddr_ill = ill;
279 }
280 nce_dad(ncec, hwaddr_ill, B_TRUE);
281 err = EINPROGRESS;
282 } else if (flags & NCE_F_UNSOL_ADV) {
283 /*
284 * We account for the transmit below by assigning one
285 * less than the ndd variable. Subsequent decrements
286 * are done in nce_timer.
287 */
288 mutex_enter(&ncec->ncec_lock);
289 ncec->ncec_unsolicit_count =
290 ipst->ips_ip_ndp_unsolicit_count - 1;
291 mutex_exit(&ncec->ncec_lock);
292 dropped = ndp_xmit(ill,
293 ND_NEIGHBOR_ADVERT,
294 hw_addr,
295 hw_addr_len,
296 &ncec->ncec_addr, /* Source and target of the adv */
297 &ipv6_all_hosts_mcast, /* Destination of the packet */
298 nce_advert_flags(ncec));
299 mutex_enter(&ncec->ncec_lock);
300 if (dropped)
301 ncec->ncec_unsolicit_count++;
302 else
303 ncec->ncec_last_time_defended = ddi_get_lbolt();
304 if (ncec->ncec_unsolicit_count != 0) {
305 nce_start_timer(ncec,
306 ipst->ips_ip_ndp_unsolicit_interval);
307 }
308 mutex_exit(&ncec->ncec_lock);
309 }
310 return (err);
311 }
312
/*
 * Atomically lookup and add (if needed) Neighbor Cache information for
 * an IPv6 address.  The lookup and the add are both done under
 * ips_ndp6->ndp_g_lock; post-processing is done after it is dropped.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_F_MYADDR) addresses
 * is always added pointing at the ipmp_ill.  Thus, when the ill passed
 * to this function is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t.  The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * In that case it is the under_ill's nce_t that is handed back.
 * Local addresses are always created on the ill passed in.
 *
 * Multicast requests (NCE_F_MCAST) are delegated entirely to
 * nce_set_multicast_v6().
 *
 * Return values (non-multicast path):
 *   0            a new entry was created
 *   EEXIST       an entry already existed; it is still handed back
 *   EINPROGRESS  a new entry was created and DAD was started (value comes
 *                from nce_add_v6_postprocess()); it is still handed back
 *   ENXIO        the IPMP ill could not be held, or the under_ill is no
 *                longer in the same group as the IPMP ill
 *   EINVAL       nce_fastpath_create() failed for the under_ill
 *   other        whatever nce_add_v6() returned
 *
 * When an nce is handed back it is stored in *newnce; if newnce is NULL the
 * reference is released here instead.  On the ENXIO/EINVAL paths taken
 * after the entry was found/created, *newnce is not written.
 */
int
nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	nce_t		*nce, *upper_nce = NULL;
	ill_t		*in_ill = ill;	/* remember the caller's ill */
	boolean_t	need_ill_refrele = B_FALSE;

	if (flags & NCE_F_MCAST) {
		/*
		 * hw_addr will be figured out in nce_set_multicast_v6;
		 * caller has to select the cast_ill
		 */
		ASSERT(hw_addr == NULL);
		ASSERT(!IS_IPMP(ill));
		err = nce_set_multicast_v6(ill, addr, flags, newnce);
		return (err);
	}
	ASSERT(ill->ill_isv6);
	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
		/*
		 * Non-local address on an under_ill: switch to the IPMP ill
		 * for the lookup/add.  The hold taken here is dropped at
		 * bail.
		 */
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill == NULL)
			return (ENXIO);
		need_ill_refrele = B_TRUE;
	}

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr);
	if (nce == NULL) {
		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
		    &nce);
	} else {
		err = EEXIST;
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	/* Only a freshly created entry is post-processed. */
	if (err == 0)
		err = nce_add_v6_postprocess(nce);
	if (in_ill != ill && nce != NULL) {
		nce_t *under_nce = NULL;

		/*
		 * in_ill was the under_ill. Try to create the under_nce.
		 * Hold the ill_g_lock to prevent changes to group membership
		 * until we are done.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
			    ill_t *, ill);
			rw_exit(&ipst->ips_ill_g_lock);
			err = ENXIO;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
		if (under_nce == NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			err = EINVAL;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		/* The IPMP ill's nce is released at bail. */
		upper_nce = nce;
		nce = under_nce; /* will be returned to caller */
		if (NCE_ISREACHABLE(nce->nce_common))
			nce_fastpath_trigger(under_nce);
	}
	/* nce_refrele is deferred until the lock is dropped */
	if (nce != NULL) {
		if (newnce != NULL)
			*newnce = nce;
		else
			nce_refrele(nce);
	}
bail:
	if (upper_nce != NULL)
		nce_refrele(upper_nce);
	if (need_ill_refrele)
		ill_refrele(ill);
	return (err);
}
410
411 /*
412 * Remove all the CONDEMNED nces from the appropriate hash table.
413 * We create a private list of NCEs, these may have ires pointing
414 * to them, so the list will be passed through to clean up dependent
415 * ires and only then we can do ncec_refrele() which can make NCE inactive.
416 */
417 static void
nce_remove(ndp_g_t * ndp,ncec_t * ncec,ncec_t ** free_nce_list)418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
419 {
420 ncec_t *ncec1;
421 ncec_t **ptpn;
422
423 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
424 ASSERT(ndp->ndp_g_walker == 0);
425 for (; ncec; ncec = ncec1) {
426 ncec1 = ncec->ncec_next;
427 mutex_enter(&ncec->ncec_lock);
428 if (NCE_ISCONDEMNED(ncec)) {
429 ptpn = ncec->ncec_ptpn;
430 ncec1 = ncec->ncec_next;
431 if (ncec1 != NULL)
432 ncec1->ncec_ptpn = ptpn;
433 *ptpn = ncec1;
434 ncec->ncec_ptpn = NULL;
435 ncec->ncec_next = NULL;
436 ncec->ncec_next = *free_nce_list;
437 *free_nce_list = ncec;
438 }
439 mutex_exit(&ncec->ncec_lock);
440 }
441 }
442
/*
 * Delete an ncec.  The caller must hold a reference on it.
 *
 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
 *    will return this NCE. Also no new timeouts will
 *    be started (See nce_restart_timer).
 * 2. Remove it from the fastpath list, complete any waiting callbacks and
 *    cancel any currently running timeout.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs and drop the reference
 *    that the list held.
 *
 * If the entry is already CONDEMNED on entry, another thread is deleting it
 * and this call is a no-op.
 */
void
ncec_delete(ncec_t *ncec)
{
	ncec_t	**ptpn;
	ncec_t	*ncec1;
	int	ipversion = ncec->ncec_ipversion;
	ndp_g_t *ndp;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	/* Pick the per-stack cache (v4 or v6) this entry lives in. */
	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&ncec->ncec_lock);
	if (NCE_ISCONDEMNED(ncec)) {
		/* Some other thread is doing the delete */
		mutex_exit(&ncec->ncec_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	ncec->ncec_flags |= NCE_F_CONDEMNED;
	mutex_exit(&ncec->ncec_lock);

	/* Count how many condemned nces for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_nce_condemned);
	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

	/* Complete any waiting callbacks */
	ncec_cb_dispatch(ncec);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (ncec->ncec_timeout_id != 0) {
		(void) untimeout(ncec->ncec_timeout_id);
		ncec->ncec_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (ncec->ncec_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this ncec from
		 * the list after we marked the ncec CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Now remove the ncec from the list. nce_restart_timer won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = ncec->ncec_ptpn;
	ncec1 = ncec->ncec_next;
	if (ncec1 != NULL)
		ncec1->ncec_ptpn = ptpn;
	*ptpn = ncec1;
	ncec->ncec_ptpn = NULL;
	ncec->ncec_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/* Removed from ncec_ptpn/ncec_next list */
	ncec_refrele_notr(ncec);
}
533
534 void
ncec_inactive(ncec_t * ncec)535 ncec_inactive(ncec_t *ncec)
536 {
537 mblk_t **mpp;
538 ill_t *ill = ncec->ncec_ill;
539 ip_stack_t *ipst = ncec->ncec_ipst;
540
541 ASSERT(ncec->ncec_refcnt == 0);
542 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
543
544 /* Count how many condemned nces for kmem_cache callback */
545 if (NCE_ISCONDEMNED(ncec))
546 atomic_add_32(&ipst->ips_num_nce_condemned, -1);
547
548 /* Free all allocated messages */
549 mpp = &ncec->ncec_qd_mp;
550 while (*mpp != NULL) {
551 mblk_t *mp;
552
553 mp = *mpp;
554 *mpp = mp->b_next;
555
556 inet_freemsg(mp);
557 }
558 /*
559 * must have been cleaned up in ncec_delete
560 */
561 ASSERT(list_is_empty(&ncec->ncec_cb));
562 list_destroy(&ncec->ncec_cb);
563 /*
564 * free the ncec_lladdr if one was allocated in nce_add_common()
565 */
566 if (ncec->ncec_lladdr_length > 0)
567 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
568
569 #ifdef DEBUG
570 ncec_trace_cleanup(ncec);
571 #endif
572
573 mutex_enter(&ill->ill_lock);
574 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
575 (char *), "ncec", (void *), ncec);
576 ill->ill_ncec_cnt--;
577 ncec->ncec_ill = NULL;
578 /*
579 * If the number of ncec's associated with this ill have dropped
580 * to zero, check whether we need to restart any operation that
581 * is waiting for this to happen.
582 */
583 if (ILL_DOWN_OK(ill)) {
584 /* ipif_ill_refrele_tail drops the ill_lock */
585 ipif_ill_refrele_tail(ill);
586 } else {
587 mutex_exit(&ill->ill_lock);
588 }
589
590 mutex_destroy(&ncec->ncec_lock);
591 kmem_cache_free(ncec_cache, ncec);
592 }
593
594 /*
595 * ncec_walk routine. Delete the ncec if it is associated with the ill
596 * that is going away. Always called as a writer.
597 */
598 void
ncec_delete_per_ill(ncec_t * ncec,uchar_t * arg)599 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
600 {
601 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
602 ncec_delete(ncec);
603 }
604 }
605
606 /*
607 * Neighbor Cache cleanup logic for a list of ncec_t entries.
608 */
609 static void
nce_cleanup_list(ncec_t * ncec)610 nce_cleanup_list(ncec_t *ncec)
611 {
612 ncec_t *ncec_next;
613
614 ASSERT(ncec != NULL);
615 while (ncec != NULL) {
616 ncec_next = ncec->ncec_next;
617 ncec->ncec_next = NULL;
618
619 /*
620 * It is possible for the last ndp walker (this thread)
621 * to come here after ncec_delete has marked the ncec CONDEMNED
622 * and before it has removed the ncec from the fastpath list
623 * or called untimeout. So we need to do it here. It is safe
624 * for both ncec_delete and this thread to do it twice or
625 * even simultaneously since each of the threads has a
626 * reference on the ncec.
627 */
628 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
629 /*
630 * Cancel any running timer. Timeout can't be restarted
631 * since CONDEMNED is set. The ncec_lock can't be
632 * held across untimeout though passing invalid timeout
633 * id is fine.
634 */
635 if (ncec->ncec_timeout_id != 0) {
636 (void) untimeout(ncec->ncec_timeout_id);
637 ncec->ncec_timeout_id = 0;
638 }
639 /* Removed from ncec_ptpn/ncec_next list */
640 ncec_refrele_notr(ncec);
641 ncec = ncec_next;
642 }
643 }
644
645 /*
646 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
647 */
648 boolean_t
nce_restart_dad(ncec_t * ncec)649 nce_restart_dad(ncec_t *ncec)
650 {
651 boolean_t started;
652 ill_t *ill, *hwaddr_ill;
653
654 if (ncec == NULL)
655 return (B_FALSE);
656 ill = ncec->ncec_ill;
657 mutex_enter(&ncec->ncec_lock);
658 if (ncec->ncec_state == ND_PROBE) {
659 mutex_exit(&ncec->ncec_lock);
660 started = B_TRUE;
661 } else if (ncec->ncec_state == ND_REACHABLE) {
662 ASSERT(ncec->ncec_lladdr != NULL);
663 ncec->ncec_state = ND_PROBE;
664 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
665 /*
666 * Slight cheat here: we don't use the initial probe delay
667 * for IPv4 in this obscure case.
668 */
669 mutex_exit(&ncec->ncec_lock);
670 if (IS_IPMP(ill)) {
671 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
672 ncec->ncec_lladdr, ncec->ncec_lladdr_length);
673 } else {
674 hwaddr_ill = ill;
675 }
676 nce_dad(ncec, hwaddr_ill, B_TRUE);
677 started = B_TRUE;
678 } else {
679 mutex_exit(&ncec->ncec_lock);
680 started = B_FALSE;
681 }
682 return (started);
683 }
684
685 /*
686 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
687 * If one is found, the refcnt on the ncec will be incremented.
688 */
689 ncec_t *
ncec_lookup_illgrp_v6(ill_t * ill,const in6_addr_t * addr)690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
691 {
692 ncec_t *ncec;
693 ip_stack_t *ipst = ill->ill_ipst;
694
695 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
696 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
697
698 /* Get head of v6 hash table */
699 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
700 ncec = ncec_lookup_illgrp(ill, addr, ncec);
701 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
702 rw_exit(&ipst->ips_ill_g_lock);
703 return (ncec);
704 }
705 /*
706 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
707 * If one is found, the refcnt on the ncec will be incremented.
708 */
709 ncec_t *
ncec_lookup_illgrp_v4(ill_t * ill,const in_addr_t * addr)710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
711 {
712 ncec_t *ncec = NULL;
713 in6_addr_t addr6;
714 ip_stack_t *ipst = ill->ill_ipst;
715
716 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
717 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
718
719 /* Get head of v4 hash table */
720 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
721 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
722 ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
723 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
724 rw_exit(&ipst->ips_ill_g_lock);
725 return (ncec);
726 }
727
728 /*
729 * Cache entry lookup. Try to find an ncec matching the parameters passed.
730 * If an ncec is found, increment the hold count on that ncec.
731 * The caller passes in the start of the appropriate hash table, and must
732 * be holding the appropriate global lock (ndp_g_lock). In addition, since
733 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
734 * must be held as reader.
735 *
736 * This function always matches across the ipmp group.
737 */
738 ncec_t *
ncec_lookup_illgrp(ill_t * ill,const in6_addr_t * addr,ncec_t * ncec)739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
740 {
741 ndp_g_t *ndp;
742 ip_stack_t *ipst = ill->ill_ipst;
743
744 if (ill->ill_isv6)
745 ndp = ipst->ips_ndp6;
746 else
747 ndp = ipst->ips_ndp4;
748
749 ASSERT(ill != NULL);
750 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
751 if (IN6_IS_ADDR_UNSPECIFIED(addr))
752 return (NULL);
753 for (; ncec != NULL; ncec = ncec->ncec_next) {
754 if (ncec->ncec_ill == ill ||
755 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
756 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
757 mutex_enter(&ncec->ncec_lock);
758 if (!NCE_ISCONDEMNED(ncec)) {
759 ncec_refhold_locked(ncec);
760 mutex_exit(&ncec->ncec_lock);
761 break;
762 }
763 mutex_exit(&ncec->ncec_lock);
764 }
765 }
766 }
767 return (ncec);
768 }
769
770 /*
771 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
772 * entries for ill only, i.e., when ill is part of an ipmp group,
773 * nce_lookup_v4 will never try to match across the group.
774 */
775 nce_t *
nce_lookup_v4(ill_t * ill,const in_addr_t * addr)776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
777 {
778 nce_t *nce;
779 in6_addr_t addr6;
780 ip_stack_t *ipst = ill->ill_ipst;
781
782 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
783 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
784 nce = nce_lookup_addr(ill, &addr6);
785 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
786 return (nce);
787 }
788
789 /*
790 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
791 * entries for ill only, i.e., when ill is part of an ipmp group,
792 * nce_lookup_v6 will never try to match across the group.
793 */
794 nce_t *
nce_lookup_v6(ill_t * ill,const in6_addr_t * addr6)795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
796 {
797 nce_t *nce;
798 ip_stack_t *ipst = ill->ill_ipst;
799
800 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
801 nce = nce_lookup_addr(ill, addr6);
802 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
803 return (nce);
804 }
805
806 static nce_t *
nce_lookup_addr(ill_t * ill,const in6_addr_t * addr)807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
808 {
809 nce_t *nce;
810
811 ASSERT(ill != NULL);
812 #ifdef DEBUG
813 if (ill->ill_isv6)
814 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
815 else
816 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
817 #endif
818 mutex_enter(&ill->ill_lock);
819 nce = nce_lookup(ill, addr);
820 mutex_exit(&ill->ill_lock);
821 return (nce);
822 }
823
824
825 /*
826 * Router turned to host. We need to make sure that cached copies of the ncec
827 * are not used for forwarding packets if they were derived from the default
828 * route, and that the default route itself is removed, as required by
829 * section 7.2.5 of RFC 2461.
830 *
831 * Note that the ncec itself probably has valid link-layer information for the
832 * nexthop, so that there is no reason to delete the ncec, as long as the
833 * ISROUTER flag is turned off.
834 */
835 static void
ncec_router_to_host(ncec_t * ncec)836 ncec_router_to_host(ncec_t *ncec)
837 {
838 ire_t *ire;
839 ip_stack_t *ipst = ncec->ncec_ipst;
840
841 mutex_enter(&ncec->ncec_lock);
842 ncec->ncec_flags &= ~NCE_F_ISROUTER;
843 mutex_exit(&ncec->ncec_lock);
844
845 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
846 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
847 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
848 if (ire != NULL) {
849 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
850 ire_delete(ire);
851 ire_refrele(ire);
852 }
853 }
854
855 /*
856 * Process passed in parameters either from an incoming packet or via
857 * user ioctl.
858 */
859 void
nce_process(ncec_t * ncec,uchar_t * hw_addr,uint32_t flag,boolean_t is_adv)860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
861 {
862 ill_t *ill = ncec->ncec_ill;
863 uint32_t hw_addr_len = ill->ill_phys_addr_length;
864 boolean_t ll_updated = B_FALSE;
865 boolean_t ll_changed;
866 nce_t *nce;
867
868 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
869 /*
870 * No updates of link layer address or the neighbor state is
871 * allowed, when the cache is in NONUD state. This still
872 * allows for responding to reachability solicitation.
873 */
874 mutex_enter(&ncec->ncec_lock);
875 if (ncec->ncec_state == ND_INCOMPLETE) {
876 if (hw_addr == NULL) {
877 mutex_exit(&ncec->ncec_lock);
878 return;
879 }
880 nce_set_ll(ncec, hw_addr);
881 /*
882 * Update ncec state and send the queued packets
883 * back to ip this time ire will be added.
884 */
885 if (flag & ND_NA_FLAG_SOLICITED) {
886 nce_update(ncec, ND_REACHABLE, NULL);
887 } else {
888 nce_update(ncec, ND_STALE, NULL);
889 }
890 mutex_exit(&ncec->ncec_lock);
891 nce = nce_fastpath(ncec, B_TRUE, NULL);
892 nce_resolv_ok(ncec);
893 if (nce != NULL)
894 nce_refrele(nce);
895 return;
896 }
897 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
898 if (!is_adv) {
899 /* If this is a SOLICITATION request only */
900 if (ll_changed)
901 nce_update(ncec, ND_STALE, hw_addr);
902 mutex_exit(&ncec->ncec_lock);
903 ncec_cb_dispatch(ncec);
904 return;
905 }
906 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
907 /* If in any other state than REACHABLE, ignore */
908 if (ncec->ncec_state == ND_REACHABLE) {
909 nce_update(ncec, ND_STALE, NULL);
910 }
911 mutex_exit(&ncec->ncec_lock);
912 ncec_cb_dispatch(ncec);
913 return;
914 } else {
915 if (ll_changed) {
916 nce_update(ncec, ND_UNCHANGED, hw_addr);
917 ll_updated = B_TRUE;
918 }
919 if (flag & ND_NA_FLAG_SOLICITED) {
920 nce_update(ncec, ND_REACHABLE, NULL);
921 } else {
922 if (ll_updated) {
923 nce_update(ncec, ND_STALE, NULL);
924 }
925 }
926 mutex_exit(&ncec->ncec_lock);
927 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
928 NCE_F_ISROUTER)) {
929 ncec_router_to_host(ncec);
930 } else {
931 ncec_cb_dispatch(ncec);
932 }
933 }
934 }
935
936 /*
937 * Pass arg1 to the pfi supplied, along with each ncec in existence.
938 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
939 * walking the hash list.
940 */
941 void
ncec_walk_common(ndp_g_t * ndp,ill_t * ill,pfi_t pfi,void * arg1,boolean_t trace)942 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
943 boolean_t trace)
944 {
945 ncec_t *ncec;
946 ncec_t *ncec1;
947 ncec_t **ncep;
948 ncec_t *free_nce_list = NULL;
949
950 mutex_enter(&ndp->ndp_g_lock);
951 /* Prevent ncec_delete from unlink and free of NCE */
952 ndp->ndp_g_walker++;
953 mutex_exit(&ndp->ndp_g_lock);
954 for (ncep = ndp->nce_hash_tbl;
955 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
956 for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
957 ncec1 = ncec->ncec_next;
958 if (ill == NULL || ncec->ncec_ill == ill) {
959 if (trace) {
960 ncec_refhold(ncec);
961 (*pfi)(ncec, arg1);
962 ncec_refrele(ncec);
963 } else {
964 ncec_refhold_notr(ncec);
965 (*pfi)(ncec, arg1);
966 ncec_refrele_notr(ncec);
967 }
968 }
969 }
970 }
971 mutex_enter(&ndp->ndp_g_lock);
972 ndp->ndp_g_walker--;
973 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
974 /* Time to delete condemned entries */
975 for (ncep = ndp->nce_hash_tbl;
976 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
977 ncec = *ncep;
978 if (ncec != NULL) {
979 nce_remove(ndp, ncec, &free_nce_list);
980 }
981 }
982 ndp->ndp_g_walker_cleanup = B_FALSE;
983 }
984
985 mutex_exit(&ndp->ndp_g_lock);
986
987 if (free_nce_list != NULL) {
988 nce_cleanup_list(free_nce_list);
989 }
990 }
991
992 /*
993 * Walk everything.
994 * Note that ill can be NULL hence can't derive the ipst from it.
995 */
996 void
ncec_walk(ill_t * ill,pfi_t pfi,void * arg1,ip_stack_t * ipst)997 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
998 {
999 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1000 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1001 }
1002
1003 /*
1004 * For each interface an entry is added for the unspecified multicast group.
1005 * Here that mapping is used to form the multicast cache entry for a particular
1006 * multicast destination.
1007 */
1008 static int
nce_set_multicast_v6(ill_t * ill,const in6_addr_t * dst,uint16_t flags,nce_t ** newnce)1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1010 uint16_t flags, nce_t **newnce)
1011 {
1012 uchar_t *hw_addr;
1013 int err = 0;
1014 ip_stack_t *ipst = ill->ill_ipst;
1015 nce_t *nce;
1016
1017 ASSERT(ill != NULL);
1018 ASSERT(ill->ill_isv6);
1019 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1020
1021 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1022 nce = nce_lookup_addr(ill, dst);
1023 if (nce != NULL) {
1024 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1025 goto done;
1026 }
1027 if (ill->ill_net_type == IRE_IF_RESOLVER) {
1028 /*
1029 * For IRE_IF_RESOLVER a hardware mapping can be
1030 * generated.
1031 */
1032 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1033 if (hw_addr == NULL) {
1034 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1035 return (ENOMEM);
1036 }
1037 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1038 } else {
1039 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1040 hw_addr = NULL;
1041 }
1042 ASSERT((flags & NCE_F_MCAST) != 0);
1043 ASSERT((flags & NCE_F_NONUD) != 0);
1044 /* nce_state will be computed by nce_add_common() */
1045 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1046 ND_UNCHANGED, &nce);
1047 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1048 if (err == 0)
1049 err = nce_add_v6_postprocess(nce);
1050 if (hw_addr != NULL)
1051 kmem_free(hw_addr, ill->ill_nd_lla_len);
1052 if (err != 0) {
1053 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1054 return (err);
1055 }
1056 done:
1057 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1058 if (newnce != NULL)
1059 *newnce = nce;
1060 else
1061 nce_refrele(nce);
1062 return (0);
1063 }
1064
1065 /*
1066 * Return the link layer address, and any flags of a ncec.
1067 */
1068 int
ndp_query(ill_t * ill,struct lif_nd_req * lnr)1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1070 {
1071 ncec_t *ncec;
1072 in6_addr_t *addr;
1073 sin6_t *sin6;
1074
1075 ASSERT(ill != NULL && ill->ill_isv6);
1076 sin6 = (sin6_t *)&lnr->lnr_addr;
1077 addr = &sin6->sin6_addr;
1078
1079 /*
1080 * NOTE: if the ill is an IPMP interface, then match against the whole
1081 * illgrp. This e.g. allows in.ndpd to retrieve the link layer
1082 * addresses for the data addresses on an IPMP interface even though
1083 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1084 */
1085 ncec = ncec_lookup_illgrp_v6(ill, addr);
1086 if (ncec == NULL)
1087 return (ESRCH);
1088 /* If no link layer address is available yet, return ESRCH */
1089 if (!NCE_ISREACHABLE(ncec)) {
1090 ncec_refrele(ncec);
1091 return (ESRCH);
1092 }
1093 lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1094 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1095 lnr->lnr_hdw_len);
1096 if (ncec->ncec_flags & NCE_F_ISROUTER)
1097 lnr->lnr_flags = NDF_ISROUTER_ON;
1098 if (ncec->ncec_flags & NCE_F_ANYCAST)
1099 lnr->lnr_flags |= NDF_ANYCAST_ON;
1100 if (ncec->ncec_flags & NCE_F_STATIC)
1101 lnr->lnr_flags |= NDF_STATIC;
1102 ncec_refrele(ncec);
1103 return (0);
1104 }
1105
1106 /*
1107 * Finish setting up the Enable/Disable multicast for the driver.
1108 */
1109 mblk_t *
ndp_mcastreq(ill_t * ill,const in6_addr_t * v6group,uint32_t hw_addr_len,uint32_t hw_addr_offset,mblk_t * mp)1110 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1111 uint32_t hw_addr_offset, mblk_t *mp)
1112 {
1113 uchar_t *hw_addr;
1114 ipaddr_t v4group;
1115 uchar_t *addr;
1116
1117 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1118 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1119 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1120
1121 ASSERT(CLASSD(v4group));
1122 ASSERT(!(ill->ill_isv6));
1123
1124 addr = (uchar_t *)&v4group;
1125 } else {
1126 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1127 ASSERT(ill->ill_isv6);
1128
1129 addr = (uchar_t *)v6group;
1130 }
1131 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1132 if (hw_addr == NULL) {
1133 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1134 freemsg(mp);
1135 return (NULL);
1136 }
1137
1138 ip_mcast_mapping(ill, addr, hw_addr);
1139 return (mp);
1140 }
1141
1142 void
ip_ndp_resolve(ncec_t * ncec)1143 ip_ndp_resolve(ncec_t *ncec)
1144 {
1145 in_addr_t sender4 = INADDR_ANY;
1146 in6_addr_t sender6 = ipv6_all_zeros;
1147 ill_t *src_ill;
1148 uint32_t ms;
1149
1150 src_ill = nce_resolve_src(ncec, &sender6);
1151 if (src_ill == NULL) {
1152 /* Make sure we try again later */
1153 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1154 nce_restart_timer(ncec, (clock_t)ms);
1155 return;
1156 }
1157 if (ncec->ncec_ipversion == IPV4_VERSION)
1158 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1159 mutex_enter(&ncec->ncec_lock);
1160 if (ncec->ncec_ipversion == IPV6_VERSION)
1161 ms = ndp_solicit(ncec, sender6, src_ill);
1162 else
1163 ms = arp_request(ncec, sender4, src_ill);
1164 mutex_exit(&ncec->ncec_lock);
1165 if (ms == 0) {
1166 if (ncec->ncec_state != ND_REACHABLE) {
1167 if (ncec->ncec_ipversion == IPV6_VERSION)
1168 ndp_resolv_failed(ncec);
1169 else
1170 arp_resolv_failed(ncec);
1171 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1172 nce_make_unreachable(ncec);
1173 ncec_delete(ncec);
1174 }
1175 } else {
1176 nce_restart_timer(ncec, (clock_t)ms);
1177 }
1178 done:
1179 ill_refrele(src_ill);
1180 }
1181
1182 /*
1183 * Send an IPv6 neighbor solicitation.
1184 * Returns number of milliseconds after which we should either rexmit or abort.
1185 * Return of zero means we should abort.
1186 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1187 * The optional source address is used as a hint to ndp_solicit for
1188 * which source to use in the packet.
1189 *
1190 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1191 * the packet.
1192 */
1193 uint32_t
ndp_solicit(ncec_t * ncec,in6_addr_t src,ill_t * ill)1194 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1195 {
1196 in6_addr_t dst;
1197 boolean_t dropped = B_FALSE;
1198
1199 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1200 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1201
1202 if (ncec->ncec_rcnt == 0)
1203 return (0);
1204
1205 dst = ncec->ncec_addr;
1206 ncec->ncec_rcnt--;
1207 mutex_exit(&ncec->ncec_lock);
1208 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1209 ill->ill_phys_addr_length, &src, &dst, 0);
1210 mutex_enter(&ncec->ncec_lock);
1211 if (dropped)
1212 ncec->ncec_rcnt++;
1213 return (ncec->ncec_ill->ill_reachable_retrans_time);
1214 }
1215
1216 /*
1217 * Attempt to recover an address on an interface that's been marked as a
1218 * duplicate. Because NCEs are destroyed when the interface goes down, there's
1219 * no easy way to just probe the address and have the right thing happen if
1220 * it's no longer in use. Instead, we just bring it up normally and allow the
1221 * regular interface start-up logic to probe for a remaining duplicate and take
1222 * us back down if necessary.
1223 * Neither DHCP nor temporary addresses arrive here; they're excluded by
1224 * ip_ndp_excl.
1225 */
1226 /* ARGSUSED */
1227 void
ip_addr_recover(ipsq_t * ipsq,queue_t * rq,mblk_t * mp,void * dummy_arg)1228 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1229 {
1230 ill_t *ill = rq->q_ptr;
1231 ipif_t *ipif;
1232 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1233 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1234 boolean_t addr_equal;
1235
1236 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1237 /*
1238 * We do not support recovery of proxy ARP'd interfaces,
1239 * because the system lacks a complete proxy ARP mechanism.
1240 */
1241 if (ill->ill_isv6) {
1242 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1243 addr6);
1244 } else {
1245 addr_equal = (ipif->ipif_lcl_addr == *addr4);
1246 }
1247
1248 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1249 continue;
1250
1251 /*
1252 * If we have already recovered or if the interface is going
1253 * away, then ignore.
1254 */
1255 mutex_enter(&ill->ill_lock);
1256 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1257 (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1258 mutex_exit(&ill->ill_lock);
1259 continue;
1260 }
1261
1262 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1263 ill->ill_ipif_dup_count--;
1264 mutex_exit(&ill->ill_lock);
1265 ipif->ipif_was_dup = B_TRUE;
1266
1267 if (ill->ill_isv6) {
1268 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1269 (void) ipif_up_done_v6(ipif);
1270 } else {
1271 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1272 EINPROGRESS);
1273 (void) ipif_up_done(ipif);
1274 }
1275 }
1276 freeb(mp);
1277 }
1278
1279 /*
1280 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1281 * As long as someone else holds the address, the interface will stay down.
1282 * When that conflict goes away, the interface is brought back up. This is
1283 * done so that accidental shutdowns of addresses aren't made permanent. Your
1284 * server will recover from a failure.
1285 *
1286 * For DHCP and temporary addresses, recovery is not done in the kernel.
1287 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1288 *
1289 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1290 */
1291 void
ipif_dup_recovery(void * arg)1292 ipif_dup_recovery(void *arg)
1293 {
1294 ipif_t *ipif = arg;
1295
1296 ipif->ipif_recovery_id = 0;
1297 if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1298 return;
1299
1300 /*
1301 * No lock, because this is just an optimization.
1302 */
1303 if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1304 return;
1305
1306 /* If the link is down, we'll retry this later */
1307 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1308 return;
1309
1310 ipif_do_recovery(ipif);
1311 }
1312
1313 /*
1314 * Perform interface recovery by forcing the duplicate interfaces up and
1315 * allowing the system to determine which ones should stay up.
1316 *
1317 * Called both by recovery timer expiry and link-up notification.
1318 */
1319 void
ipif_do_recovery(ipif_t * ipif)1320 ipif_do_recovery(ipif_t *ipif)
1321 {
1322 ill_t *ill = ipif->ipif_ill;
1323 mblk_t *mp;
1324 ip_stack_t *ipst = ill->ill_ipst;
1325 size_t mp_size;
1326
1327 if (ipif->ipif_isv6)
1328 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1329 else
1330 mp_size = sizeof (ipif->ipif_lcl_addr);
1331 mp = allocb(mp_size, BPRI_MED);
1332 if (mp == NULL) {
1333 mutex_enter(&ill->ill_lock);
1334 if (ipst->ips_ip_dup_recovery > 0 &&
1335 ipif->ipif_recovery_id == 0 &&
1336 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1337 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1338 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1339 }
1340 mutex_exit(&ill->ill_lock);
1341 } else {
1342 /*
1343 * A recovery timer may still be running if we got here from
1344 * ill_restart_dad(); cancel that timer.
1345 */
1346 if (ipif->ipif_recovery_id != 0)
1347 (void) untimeout(ipif->ipif_recovery_id);
1348 ipif->ipif_recovery_id = 0;
1349
1350 if (ipif->ipif_isv6) {
1351 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1352 sizeof (ipif->ipif_v6lcl_addr));
1353 } else {
1354 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1355 sizeof (ipif->ipif_lcl_addr));
1356 }
1357 ill_refhold(ill);
1358 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1359 B_FALSE);
1360 }
1361 }
1362
1363 /*
1364 * Find the MAC and IP addresses in an NA/NS message.
1365 */
1366 static void
ip_ndp_find_addresses(mblk_t * mp,ip_recv_attr_t * ira,ill_t * ill,in6_addr_t * targp,uchar_t ** haddr,uint_t * haddrlenp)1367 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1368 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1369 {
1370 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1371 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1372 uchar_t *addr;
1373 int alen;
1374
1375 /* icmp_inbound_v6 ensures this */
1376 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1377
1378 addr = ira->ira_l2src;
1379 alen = ill->ill_phys_addr_length;
1380 if (alen > 0) {
1381 *haddr = addr;
1382 *haddrlenp = alen;
1383 } else {
1384 *haddr = NULL;
1385 *haddrlenp = 0;
1386 }
1387
1388 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1389 *targp = ns->nd_ns_target;
1390 }
1391
1392 /*
1393 * This is for exclusive changes due to NDP duplicate address detection
1394 * failure.
1395 */
1396 /* ARGSUSED */
1397 static void
ip_ndp_excl(ipsq_t * ipsq,queue_t * rq,mblk_t * mp,void * dummy_arg)1398 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1399 {
1400 ill_t *ill = rq->q_ptr;
1401 ipif_t *ipif;
1402 uchar_t *haddr;
1403 uint_t haddrlen;
1404 ip_stack_t *ipst = ill->ill_ipst;
1405 in6_addr_t targ;
1406 ip_recv_attr_t iras;
1407 mblk_t *attrmp;
1408
1409 attrmp = mp;
1410 mp = mp->b_cont;
1411 attrmp->b_cont = NULL;
1412 if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1413 /* The ill or ip_stack_t disappeared on us */
1414 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1415 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1416 freemsg(mp);
1417 ira_cleanup(&iras, B_TRUE);
1418 return;
1419 }
1420
1421 ASSERT(ill == iras.ira_rill);
1422
1423 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1424 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1425 /*
1426 * Ignore conflicts generated by misbehaving switches that
1427 * just reflect our own messages back to us. For IPMP, we may
1428 * see reflections across any ill in the illgrp.
1429 *
1430 * RFC2462 and revisions tried to detect both the case
1431 * when a statically configured IPv6 address is a duplicate,
1432 * and the case when the L2 address itself is a duplicate. The
1433 * later is important because, with stateles address autoconf,
1434 * if the L2 address is a duplicate, the resulting IPv6
1435 * address(es) would also be duplicates. We rely on DAD of the
1436 * IPv6 address itself to detect the latter case.
1437 */
1438 /* For an under ill_grp can change under lock */
1439 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1440 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1441 IS_UNDER_IPMP(ill) &&
1442 ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1443 haddrlen) != NULL) {
1444 rw_exit(&ipst->ips_ill_g_lock);
1445 goto ignore_conflict;
1446 }
1447 rw_exit(&ipst->ips_ill_g_lock);
1448 }
1449
1450 /*
1451 * Look up the appropriate ipif.
1452 */
1453 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1454 if (ipif == NULL)
1455 goto ignore_conflict;
1456
1457 /* Reload the ill to match the ipif */
1458 ill = ipif->ipif_ill;
1459
1460 /* If it's already duplicate or ineligible, then don't do anything. */
1461 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1462 ipif_refrele(ipif);
1463 goto ignore_conflict;
1464 }
1465
1466 /*
1467 * If this is a failure during duplicate recovery, then don't
1468 * complain. It may take a long time to recover.
1469 */
1470 if (!ipif->ipif_was_dup) {
1471 char ibuf[LIFNAMSIZ];
1472 char hbuf[MAC_STR_LEN];
1473 char sbuf[INET6_ADDRSTRLEN];
1474
1475 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1476 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1477 " disabled", ibuf,
1478 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1479 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1480 }
1481 mutex_enter(&ill->ill_lock);
1482 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1483 ipif->ipif_flags |= IPIF_DUPLICATE;
1484 ill->ill_ipif_dup_count++;
1485 mutex_exit(&ill->ill_lock);
1486 (void) ipif_down(ipif, NULL, NULL);
1487 (void) ipif_down_tail(ipif);
1488 mutex_enter(&ill->ill_lock);
1489 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1490 ill->ill_net_type == IRE_IF_RESOLVER &&
1491 !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1492 ipst->ips_ip_dup_recovery > 0) {
1493 ASSERT(ipif->ipif_recovery_id == 0);
1494 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1495 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1496 }
1497 mutex_exit(&ill->ill_lock);
1498 ipif_refrele(ipif);
1499
1500 ignore_conflict:
1501 freemsg(mp);
1502 ira_cleanup(&iras, B_TRUE);
1503 }
1504
1505 /*
1506 * Handle failure by tearing down the ipifs with the specified address. Note
1507 * that tearing down the ipif also means deleting the ncec through ipif_down, so
1508 * it's not possible to do recovery by just restarting the ncec timer. Instead,
1509 * we start a timer on the ipif.
1510 * Caller has to free mp;
1511 */
1512 static void
ndp_failure(mblk_t * mp,ip_recv_attr_t * ira)1513 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1514 {
1515 const uchar_t *haddr;
1516 ill_t *ill = ira->ira_rill;
1517
1518 /*
1519 * Ignore conflicts generated by misbehaving switches that just
1520 * reflect our own messages back to us.
1521 */
1522
1523 /* icmp_inbound_v6 ensures this */
1524 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1525 haddr = ira->ira_l2src;
1526 if (haddr != NULL &&
1527 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1528 return;
1529 }
1530
1531 if ((mp = copymsg(mp)) != NULL) {
1532 mblk_t *attrmp;
1533
1534 attrmp = ip_recv_attr_to_mblk(ira);
1535 if (attrmp == NULL) {
1536 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1537 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1538 freemsg(mp);
1539 } else {
1540 ASSERT(attrmp->b_cont == NULL);
1541 attrmp->b_cont = mp;
1542 mp = attrmp;
1543 ill_refhold(ill);
1544 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1545 B_FALSE);
1546 }
1547 }
1548 }
1549
1550 /*
1551 * Handle a discovered conflict: some other system is advertising that it owns
1552 * one of our IP addresses. We need to defend ourselves, or just shut down the
1553 * interface.
1554 *
1555 * Handles both IPv4 and IPv6
1556 */
1557 boolean_t
ip_nce_conflict(mblk_t * mp,ip_recv_attr_t * ira,ncec_t * ncec)1558 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1559 {
1560 ipif_t *ipif;
1561 clock_t now;
1562 uint_t maxdefense;
1563 uint_t defs;
1564 ill_t *ill = ira->ira_ill;
1565 ip_stack_t *ipst = ill->ill_ipst;
1566 uint32_t elapsed;
1567 boolean_t isv6 = ill->ill_isv6;
1568 ipaddr_t ncec_addr;
1569
1570 if (isv6) {
1571 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1572 ipst);
1573 } else {
1574 if (arp_no_defense) {
1575 /*
1576 * Yes, there is a conflict, but no, we do not
1577 * defend ourself.
1578 */
1579 return (B_TRUE);
1580 }
1581 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1582 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1583 ipst);
1584 }
1585 if (ipif == NULL)
1586 return (B_FALSE);
1587
1588 /*
1589 * First, figure out if this address is disposable.
1590 */
1591 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1592 maxdefense = ipst->ips_ip_max_temp_defend;
1593 else
1594 maxdefense = ipst->ips_ip_max_defend;
1595
1596 /*
1597 * Now figure out how many times we've defended ourselves. Ignore
1598 * defenses that happened long in the past.
1599 */
1600 now = ddi_get_lbolt();
1601 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1602 mutex_enter(&ncec->ncec_lock);
1603 if ((defs = ncec->ncec_defense_count) > 0 &&
1604 elapsed > ipst->ips_ip_defend_interval) {
1605 /*
1606 * ip_defend_interval has elapsed.
1607 * reset the defense count.
1608 */
1609 ncec->ncec_defense_count = defs = 0;
1610 }
1611 ncec->ncec_defense_count++;
1612 ncec->ncec_last_time_defended = now;
1613 mutex_exit(&ncec->ncec_lock);
1614 ipif_refrele(ipif);
1615
1616 /*
1617 * If we've defended ourselves too many times already, then give up and
1618 * tear down the interface(s) using this address.
1619 * Otherwise, caller has to defend by sending out an announce.
1620 */
1621 if (defs >= maxdefense) {
1622 if (isv6)
1623 ndp_failure(mp, ira);
1624 else
1625 arp_failure(mp, ira);
1626 } else {
1627 return (B_TRUE); /* caller must defend this address */
1628 }
1629 return (B_FALSE);
1630 }
1631
1632 /*
1633 * Handle reception of Neighbor Solicitation messages.
1634 */
1635 static void
ndp_input_solicit(mblk_t * mp,ip_recv_attr_t * ira)1636 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1637 {
1638 ill_t *ill = ira->ira_ill, *under_ill;
1639 nd_neighbor_solicit_t *ns;
1640 uint32_t hlen = ill->ill_phys_addr_length;
1641 uchar_t *haddr = NULL;
1642 icmp6_t *icmp_nd;
1643 ip6_t *ip6h;
1644 ncec_t *our_ncec = NULL;
1645 in6_addr_t target;
1646 in6_addr_t src;
1647 int len;
1648 int flag = 0;
1649 nd_opt_hdr_t *opt = NULL;
1650 boolean_t bad_solicit = B_FALSE;
1651 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1652 boolean_t need_ill_refrele = B_FALSE;
1653
1654 ip6h = (ip6_t *)mp->b_rptr;
1655 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1656 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1657 src = ip6h->ip6_src;
1658 ns = (nd_neighbor_solicit_t *)icmp_nd;
1659 target = ns->nd_ns_target;
1660 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1661 IN6_IS_ADDR_LOOPBACK(&target)) {
1662 if (ip_debug > 2) {
1663 /* ip1dbg */
1664 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1665 AF_INET6, &target);
1666 }
1667 bad_solicit = B_TRUE;
1668 goto done;
1669 }
1670 if (len > sizeof (nd_neighbor_solicit_t)) {
1671 /* Options present */
1672 opt = (nd_opt_hdr_t *)&ns[1];
1673 len -= sizeof (nd_neighbor_solicit_t);
1674 if (!ndp_verify_optlen(opt, len)) {
1675 ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1676 bad_solicit = B_TRUE;
1677 goto done;
1678 }
1679 }
1680 if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1681 /* Check to see if this is a valid DAD solicitation */
1682 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1683 if (ip_debug > 2) {
1684 /* ip1dbg */
1685 pr_addr_dbg("ndp_input_solicit: IPv6 "
1686 "Destination is not solicited node "
1687 "multicast %s\n", AF_INET6,
1688 &ip6h->ip6_dst);
1689 }
1690 bad_solicit = B_TRUE;
1691 goto done;
1692 }
1693 }
1694
1695 /*
1696 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1697 * received this packet if it's multicast) is not the ill tied to
1698 * e.g. the IPMP ill's data link-local. So we match across the illgrp
1699 * to ensure we find the associated NCE.
1700 */
1701 our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1702 /*
1703 * If this is a valid Solicitation for an address we are publishing,
1704 * then a PUBLISH entry should exist in the cache
1705 */
1706 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1707 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1708 "ifname=%s ", ill->ill_name));
1709 if (ip_debug > 2) {
1710 /* ip1dbg */
1711 pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1712 }
1713 if (our_ncec == NULL)
1714 bad_solicit = B_TRUE;
1715 goto done;
1716 }
1717
1718 /* At this point we should have a verified NS per spec */
1719 if (opt != NULL) {
1720 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1721 if (opt != NULL) {
1722 haddr = (uchar_t *)&opt[1];
1723 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1724 hlen == 0) {
1725 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1726 bad_solicit = B_TRUE;
1727 goto done;
1728 }
1729 }
1730 }
1731
1732 /* If sending directly to peer, set the unicast flag */
1733 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1734 flag |= NDP_UNICAST;
1735
1736 /*
1737 * Create/update the entry for the soliciting node on the ipmp_ill.
1738 * or respond to outstanding queries, don't if
1739 * the source is unspecified address.
1740 */
1741 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1742 int err;
1743 nce_t *nnce;
1744
1745 ASSERT(ill->ill_isv6);
1746 /*
1747 * Regular solicitations *must* include the Source Link-Layer
1748 * Address option. Ignore messages that do not.
1749 */
1750 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1751 ip1dbg(("ndp_input_solicit: source link-layer address "
1752 "option missing with a specified source.\n"));
1753 bad_solicit = B_TRUE;
1754 goto done;
1755 }
1756
1757 /*
1758 * This is a regular solicitation. If we're still in the
1759 * process of verifying the address, then don't respond at all
1760 * and don't keep track of the sender.
1761 */
1762 if (our_ncec->ncec_state == ND_PROBE)
1763 goto done;
1764
1765 /*
1766 * If the solicitation doesn't have sender hardware address
1767 * (legal for unicast solicitation), then process without
1768 * installing the return NCE. Either we already know it, or
1769 * we'll be forced to look it up when (and if) we reply to the
1770 * packet.
1771 */
1772 if (haddr == NULL)
1773 goto no_source;
1774
1775 under_ill = ill;
1776 if (IS_UNDER_IPMP(under_ill)) {
1777 ill = ipmp_ill_hold_ipmp_ill(under_ill);
1778 if (ill == NULL)
1779 ill = under_ill;
1780 else
1781 need_ill_refrele = B_TRUE;
1782 }
1783 err = nce_lookup_then_add_v6(ill,
1784 haddr, hlen,
1785 &src, /* Soliciting nodes address */
1786 0,
1787 ND_STALE,
1788 &nnce);
1789
1790 if (need_ill_refrele) {
1791 ill_refrele(ill);
1792 ill = under_ill;
1793 need_ill_refrele = B_FALSE;
1794 }
1795 switch (err) {
1796 case 0:
1797 /* done with this entry */
1798 nce_refrele(nnce);
1799 break;
1800 case EEXIST:
1801 /*
1802 * B_FALSE indicates this is not an an advertisement.
1803 */
1804 nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1805 nce_refrele(nnce);
1806 break;
1807 default:
1808 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1809 err));
1810 goto done;
1811 }
1812 no_source:
1813 flag |= NDP_SOLICITED;
1814 } else {
1815 /*
1816 * No source link layer address option should be present in a
1817 * valid DAD request.
1818 */
1819 if (haddr != NULL) {
1820 ip1dbg(("ndp_input_solicit: source link-layer address "
1821 "option present with an unspecified source.\n"));
1822 bad_solicit = B_TRUE;
1823 goto done;
1824 }
1825 if (our_ncec->ncec_state == ND_PROBE) {
1826 /*
1827 * Internally looped-back probes will have
1828 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1829 * transmissions.
1830 */
1831 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1832 /*
1833 * If someone else is probing our address, then
1834 * we've crossed wires. Declare failure.
1835 */
1836 ndp_failure(mp, ira);
1837 }
1838 goto done;
1839 }
1840 /*
1841 * This is a DAD probe. Multicast the advertisement to the
1842 * all-nodes address.
1843 */
1844 src = ipv6_all_hosts_mcast;
1845 }
1846 flag |= nce_advert_flags(our_ncec);
1847 (void) ndp_xmit(ill,
1848 ND_NEIGHBOR_ADVERT,
1849 our_ncec->ncec_lladdr,
1850 our_ncec->ncec_lladdr_length,
1851 &target, /* Source and target of the advertisement pkt */
1852 &src, /* IP Destination (source of original pkt) */
1853 flag);
1854 done:
1855 if (bad_solicit)
1856 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1857 if (our_ncec != NULL)
1858 ncec_refrele(our_ncec);
1859 }
1860
1861 /*
1862 * Handle reception of Neighbor Solicitation messages
1863 */
1864 void
ndp_input_advert(mblk_t * mp,ip_recv_attr_t * ira)1865 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1866 {
1867 ill_t *ill = ira->ira_ill;
1868 nd_neighbor_advert_t *na;
1869 uint32_t hlen = ill->ill_phys_addr_length;
1870 uchar_t *haddr = NULL;
1871 icmp6_t *icmp_nd;
1872 ip6_t *ip6h;
1873 ncec_t *dst_ncec = NULL;
1874 in6_addr_t target;
1875 nd_opt_hdr_t *opt = NULL;
1876 int len;
1877 ip_stack_t *ipst = ill->ill_ipst;
1878 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1879
1880 ip6h = (ip6_t *)mp->b_rptr;
1881 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1882 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1883 na = (nd_neighbor_advert_t *)icmp_nd;
1884
1885 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1886 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1887 ip1dbg(("ndp_input_advert: Target is multicast but the "
1888 "solicited flag is not zero\n"));
1889 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1890 return;
1891 }
1892 target = na->nd_na_target;
1893 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1894 IN6_IS_ADDR_LOOPBACK(&target)) {
1895 if (ip_debug > 2) {
1896 /* ip1dbg */
1897 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1898 AF_INET6, &target);
1899 }
1900 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1901 return;
1902 }
1903 if (len > sizeof (nd_neighbor_advert_t)) {
1904 opt = (nd_opt_hdr_t *)&na[1];
1905 if (!ndp_verify_optlen(opt,
1906 len - sizeof (nd_neighbor_advert_t))) {
1907 ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1908 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1909 return;
1910 }
1911 /* At this point we have a verified NA per spec */
1912 len -= sizeof (nd_neighbor_advert_t);
1913 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1914 if (opt != NULL) {
1915 haddr = (uchar_t *)&opt[1];
1916 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1917 hlen == 0) {
1918 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1919 BUMP_MIB(mib,
1920 ipv6IfIcmpInBadNeighborAdvertisements);
1921 return;
1922 }
1923 }
1924 }
1925
1926 /*
1927 * NOTE: we match across the illgrp since we need to do DAD for all of
1928 * our local addresses, and those are spread across all the active
1929 * ills in the group.
1930 */
1931 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1932 return;
1933
1934 if (NCE_PUBLISH(dst_ncec)) {
1935 /*
1936 * Someone just advertised an addresses that we publish. First,
1937 * check it it was us -- if so, we can safely ignore it.
1938 * We don't get the haddr from the ira_l2src because, in the
1939 * case that the packet originated from us, on an IPMP group,
1940 * the ira_l2src may would be the link-layer address of the
1941 * cast_ill used to send the packet, which may not be the same
1942 * as the dst_ncec->ncec_lladdr of the address.
1943 */
1944 if (haddr != NULL) {
1945 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1946 goto out;
1947
1948 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1949 goto out; /* from us -- no conflict */
1950
1951 /*
1952 * If we're in an IPMP group, check if this is an echo
1953 * from another ill in the group. Use the double-
1954 * checked locking pattern to avoid grabbing
1955 * ill_g_lock in the non-IPMP case.
1956 */
1957 if (IS_UNDER_IPMP(ill)) {
1958 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1959 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1960 ill->ill_grp, haddr, hlen) != NULL) {
1961 rw_exit(&ipst->ips_ill_g_lock);
1962 goto out;
1963 }
1964 rw_exit(&ipst->ips_ill_g_lock);
1965 }
1966 }
1967
1968 /*
1969 * This appears to be a real conflict. If we're trying to
1970 * configure this NCE (ND_PROBE), then shut it down.
1971 * Otherwise, handle the discovered conflict.
1972 */
1973 if (dst_ncec->ncec_state == ND_PROBE) {
1974 ndp_failure(mp, ira);
1975 } else {
1976 if (ip_nce_conflict(mp, ira, dst_ncec)) {
1977 char hbuf[MAC_STR_LEN];
1978 char sbuf[INET6_ADDRSTRLEN];
1979
1980 cmn_err(CE_WARN,
1981 "node '%s' is using %s on %s",
1982 inet_ntop(AF_INET6, &target, sbuf,
1983 sizeof (sbuf)),
1984 haddr == NULL ? "<none>" :
1985 mac_colon_addr(haddr, hlen, hbuf,
1986 sizeof (hbuf)), ill->ill_name);
1987 /*
1988 * RFC 4862, Section 5.4.4 does not mandate
1989 * any specific behavior when an NA matches
1990 * a non-tentative address assigned to the
1991 * receiver. We make the choice of defending
1992 * our address, based on the assumption that
1993 * the sender has not detected the Duplicate.
1994 *
1995 * ncec_last_time_defended has been adjusted
1996 * in ip_nce_conflict()
1997 */
1998 (void) ndp_announce(dst_ncec);
1999 }
2000 }
2001 } else {
2002 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2003 dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2004
2005 /* B_TRUE indicates this an advertisement */
2006 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2007 }
2008 out:
2009 ncec_refrele(dst_ncec);
2010 }
2011
2012 /*
2013 * Process NDP neighbor solicitation/advertisement messages.
2014 * The checksum has already checked o.k before reaching here.
2015 * Information about the datalink header is contained in ira_l2src, but
2016 * that should be ignored for loopback packets.
2017 */
2018 void
ndp_input(mblk_t * mp,ip_recv_attr_t * ira)2019 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2020 {
2021 ill_t *ill = ira->ira_rill;
2022 icmp6_t *icmp_nd;
2023 ip6_t *ip6h;
2024 int len;
2025 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
2026 ill_t *orig_ill = NULL;
2027
2028 /*
2029 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2030 * and make it be the IPMP upper so avoid being confused by a packet
2031 * addressed to a unicast address on a different ill.
2032 */
2033 if (IS_UNDER_IPMP(ill)) {
2034 orig_ill = ill;
2035 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2036 if (ill == NULL) {
2037 ill = orig_ill;
2038 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2039 ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2040 mp, ill);
2041 freemsg(mp);
2042 return;
2043 }
2044 ASSERT(ill != orig_ill);
2045 orig_ill = ira->ira_ill;
2046 ira->ira_ill = ill;
2047 mib = ill->ill_icmp6_mib;
2048 }
2049 if (!pullupmsg(mp, -1)) {
2050 ip1dbg(("ndp_input: pullupmsg failed\n"));
2051 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2052 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2053 goto done;
2054 }
2055 ip6h = (ip6_t *)mp->b_rptr;
2056 if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2057 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2058 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2059 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2060 goto done;
2061 }
2062 /*
2063 * NDP does not accept any extension headers between the
2064 * IP header and the ICMP header since e.g. a routing
2065 * header could be dangerous.
2066 * This assumes that any AH or ESP headers are removed
2067 * by ip prior to passing the packet to ndp_input.
2068 */
2069 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2070 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2071 ip6h->ip6_nxt));
2072 ip_drop_input("Wrong next header", mp, ill);
2073 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2074 goto done;
2075 }
2076 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2077 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2078 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2079 if (icmp_nd->icmp6_code != 0) {
2080 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2081 ip_drop_input("code non-zero", mp, ill);
2082 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2083 goto done;
2084 }
2085 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2086 /*
2087 * Make sure packet length is large enough for either
2088 * a NS or a NA icmp packet.
2089 */
2090 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2091 ip1dbg(("ndp_input: packet too short\n"));
2092 ip_drop_input("packet too short", mp, ill);
2093 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2094 goto done;
2095 }
2096 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2097 ndp_input_solicit(mp, ira);
2098 } else {
2099 ndp_input_advert(mp, ira);
2100 }
2101 done:
2102 freemsg(mp);
2103 if (orig_ill != NULL) {
2104 ill_refrele(ill);
2105 ira->ira_ill = orig_ill;
2106 }
2107 }
2108
2109 /*
2110 * ndp_xmit is called to form and transmit a ND solicitation or
2111 * advertisement ICMP packet.
2112 *
2113 * If the source address is unspecified and this isn't a probe (used for
2114 * duplicate address detection), an appropriate source address and link layer
2115 * address will be chosen here. The link layer address option is included if
2116 * the source is specified (i.e., all non-probe packets), and omitted (per the
2117 * specification) otherwise.
2118 *
2119 * It returns B_FALSE only if it does a successful put() to the
2120 * corresponding ill's ill_wq otherwise returns B_TRUE.
2121 */
2122 static boolean_t
ndp_xmit(ill_t * ill,uint32_t operation,uint8_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * sender,const in6_addr_t * target,int flag)2123 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2124 const in6_addr_t *sender, const in6_addr_t *target, int flag)
2125 {
2126 uint32_t len;
2127 icmp6_t *icmp6;
2128 mblk_t *mp;
2129 ip6_t *ip6h;
2130 nd_opt_hdr_t *opt;
2131 uint_t plen;
2132 zoneid_t zoneid = GLOBAL_ZONEID;
2133 ill_t *hwaddr_ill = ill;
2134 ip_xmit_attr_t ixas;
2135 ip_stack_t *ipst = ill->ill_ipst;
2136 boolean_t need_refrele = B_FALSE;
2137 boolean_t probe = B_FALSE;
2138
2139 if (IS_UNDER_IPMP(ill)) {
2140 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2141 /*
2142 * We send non-probe packets on the upper IPMP interface.
2143 * ip_output_simple() will use cast_ill for sending any
2144 * multicast packets. Note that we can't follow the same
2145 * logic for probe packets because all interfaces in the ipmp
2146 * group may have failed, so that we really want to only try
2147 * to send the ND packet on the ill corresponding to the src
2148 * address.
2149 */
2150 if (!probe) {
2151 ill = ipmp_ill_hold_ipmp_ill(ill);
2152 if (ill != NULL)
2153 need_refrele = B_TRUE;
2154 else
2155 ill = hwaddr_ill;
2156 }
2157 }
2158
2159 /*
2160 * If we have a unspecified source(sender) address, select a
2161 * proper source address for the solicitation here itself so
2162 * that we can initialize the h/w address correctly.
2163 *
2164 * If the sender is specified then we use this address in order
2165 * to lookup the zoneid before calling ip_output_v6(). This is to
2166 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2167 * by IP (we cannot guarantee that the global zone has an interface
2168 * route to the destination).
2169 *
2170 * Note that the NA never comes here with the unspecified source
2171 * address.
2172 */
2173
2174 /*
2175 * Probes will have unspec src at this point.
2176 */
2177 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2178 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2179 /*
2180 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2181 * ALL_ZONES if it cannot find a matching ipif for the address
2182 * we are trying to use. In this case we err on the side of
2183 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2184 */
2185 if (zoneid == ALL_ZONES)
2186 zoneid = GLOBAL_ZONEID;
2187 }
2188
2189 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2190 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2191 mp = allocb(len, BPRI_LO);
2192 if (mp == NULL) {
2193 if (need_refrele)
2194 ill_refrele(ill);
2195 return (B_TRUE);
2196 }
2197
2198 bzero((char *)mp->b_rptr, len);
2199 mp->b_wptr = mp->b_rptr + len;
2200
2201 bzero(&ixas, sizeof (ixas));
2202 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2203
2204 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2205 ixas.ixa_ipst = ipst;
2206 ixas.ixa_cred = kcred;
2207 ixas.ixa_cpid = NOPID;
2208 ixas.ixa_tsl = NULL;
2209 ixas.ixa_zoneid = zoneid;
2210
2211 ip6h = (ip6_t *)mp->b_rptr;
2212 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2213 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2214 ip6h->ip6_nxt = IPPROTO_ICMPV6;
2215 ip6h->ip6_hops = IPV6_MAX_HOPS;
2216 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2217 ip6h->ip6_dst = *target;
2218 icmp6 = (icmp6_t *)&ip6h[1];
2219
2220 if (hw_addr_len != 0) {
2221 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2222 sizeof (nd_neighbor_advert_t));
2223 } else {
2224 opt = NULL;
2225 }
2226 if (operation == ND_NEIGHBOR_SOLICIT) {
2227 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2228
2229 if (opt != NULL && !(flag & NDP_PROBE)) {
2230 /*
2231 * Note that we don't send out SLLA for ND probes
2232 * per RFC 4862, even though we do send out the src
2233 * haddr for IPv4 DAD probes, even though both IPv4
2234 * and IPv6 go out with the unspecified/INADDR_ANY
2235 * src IP addr.
2236 */
2237 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2238 }
2239 ip6h->ip6_src = *sender;
2240 ns->nd_ns_target = *target;
2241 if (!(flag & NDP_UNICAST)) {
2242 /* Form multicast address of the target */
2243 ip6h->ip6_dst = ipv6_solicited_node_mcast;
2244 ip6h->ip6_dst.s6_addr32[3] |=
2245 ns->nd_ns_target.s6_addr32[3];
2246 }
2247 } else {
2248 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2249
2250 ASSERT(!(flag & NDP_PROBE));
2251 if (opt != NULL)
2252 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2253 ip6h->ip6_src = *sender;
2254 na->nd_na_target = *sender;
2255 if (flag & NDP_ISROUTER)
2256 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2257 if (flag & NDP_SOLICITED)
2258 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2259 if (flag & NDP_ORIDE)
2260 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2261 }
2262
2263 if (!(flag & NDP_PROBE)) {
2264 if (hw_addr != NULL && opt != NULL) {
2265 /* Fill in link layer address and option len */
2266 opt->nd_opt_len = (uint8_t)plen;
2267 bcopy(hw_addr, &opt[1], hw_addr_len);
2268 }
2269 }
2270 if (opt != NULL && opt->nd_opt_type == 0) {
2271 /* If there's no link layer address option, then strip it. */
2272 len -= plen * 8;
2273 mp->b_wptr = mp->b_rptr + len;
2274 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2275 }
2276
2277 icmp6->icmp6_type = (uint8_t)operation;
2278 icmp6->icmp6_code = 0;
2279 /*
2280 * Prepare for checksum by putting icmp length in the icmp
2281 * checksum field. The checksum is calculated in ip_output.c.
2282 */
2283 icmp6->icmp6_cksum = ip6h->ip6_plen;
2284
2285 (void) ip_output_simple(mp, &ixas);
2286 ixa_cleanup(&ixas);
2287 if (need_refrele)
2288 ill_refrele(ill);
2289 return (B_FALSE);
2290 }
2291
2292 /*
2293 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2294 * The datapath uses this as an indication that there
2295 * is a problem (as opposed to a NCE that was just
2296 * reclaimed due to lack of memory.
2297 * Note that static ARP entries never become unreachable.
2298 */
2299 void
nce_make_unreachable(ncec_t * ncec)2300 nce_make_unreachable(ncec_t *ncec)
2301 {
2302 mutex_enter(&ncec->ncec_lock);
2303 ncec->ncec_state = ND_UNREACHABLE;
2304 mutex_exit(&ncec->ncec_lock);
2305 }
2306
2307 /*
2308 * NCE retransmit timer. Common to IPv4 and IPv6.
2309 * This timer goes off when:
2310 * a. It is time to retransmit a resolution for resolver.
2311 * b. It is time to send reachability probes.
2312 */
2313 void
nce_timer(void * arg)2314 nce_timer(void *arg)
2315 {
2316 ncec_t *ncec = arg;
2317 ill_t *ill = ncec->ncec_ill, *src_ill;
2318 char addrbuf[INET6_ADDRSTRLEN];
2319 boolean_t dropped = B_FALSE;
2320 ip_stack_t *ipst = ncec->ncec_ipst;
2321 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2322 in_addr_t sender4 = INADDR_ANY;
2323 in6_addr_t sender6 = ipv6_all_zeros;
2324
2325 /*
2326 * The timer has to be cancelled by ncec_delete before doing the final
2327 * refrele. So the NCE is guaranteed to exist when the timer runs
2328 * until it clears the timeout_id. Before clearing the timeout_id
2329 * bump up the refcnt so that we can continue to use the ncec
2330 */
2331 ASSERT(ncec != NULL);
2332 mutex_enter(&ncec->ncec_lock);
2333 ncec_refhold_locked(ncec);
2334 ncec->ncec_timeout_id = 0;
2335 mutex_exit(&ncec->ncec_lock);
2336
2337 src_ill = nce_resolve_src(ncec, &sender6);
2338 /* if we could not find a sender address, return */
2339 if (src_ill == NULL) {
2340 if (!isv6) {
2341 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2342 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2343 &sender4, addrbuf, sizeof (addrbuf))));
2344 } else {
2345 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2346 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2347 }
2348 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2349 ncec_refrele(ncec);
2350 return;
2351 }
2352 if (!isv6)
2353 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2354
2355 mutex_enter(&ncec->ncec_lock);
2356 /*
2357 * Check the reachability state.
2358 */
2359 switch (ncec->ncec_state) {
2360 case ND_DELAY:
2361 ASSERT(ncec->ncec_lladdr != NULL);
2362 ncec->ncec_state = ND_PROBE;
2363 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2364 if (isv6) {
2365 mutex_exit(&ncec->ncec_lock);
2366 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2367 src_ill->ill_phys_addr,
2368 src_ill->ill_phys_addr_length,
2369 &sender6, &ncec->ncec_addr,
2370 NDP_UNICAST);
2371 } else {
2372 dropped = (arp_request(ncec, sender4, src_ill) == 0);
2373 mutex_exit(&ncec->ncec_lock);
2374 }
2375 if (!dropped) {
2376 mutex_enter(&ncec->ncec_lock);
2377 ncec->ncec_pcnt--;
2378 mutex_exit(&ncec->ncec_lock);
2379 }
2380 if (ip_debug > 3) {
2381 /* ip2dbg */
2382 pr_addr_dbg("nce_timer: state for %s changed "
2383 "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2384 }
2385 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2386 break;
2387 case ND_PROBE:
2388 /* must be retransmit timer */
2389 ASSERT(ncec->ncec_pcnt >= -1);
2390 if (ncec->ncec_pcnt > 0) {
2391 /*
2392 * As per RFC2461, the ncec gets deleted after
2393 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2394 * Note that the first unicast solicitation is sent
2395 * during the DELAY state.
2396 */
2397 ip2dbg(("nce_timer: pcount=%x dst %s\n",
2398 ncec->ncec_pcnt,
2399 inet_ntop((isv6? AF_INET6 : AF_INET),
2400 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2401 if (NCE_PUBLISH(ncec)) {
2402 mutex_exit(&ncec->ncec_lock);
2403 /*
2404 * send out a probe; note that src_ill
2405 * is ignored by nce_dad() for all
2406 * DAD message types other than IPv6
2407 * unicast probes
2408 */
2409 nce_dad(ncec, src_ill, B_TRUE);
2410 } else {
2411 ASSERT(src_ill != NULL);
2412 if (isv6) {
2413 mutex_exit(&ncec->ncec_lock);
2414 dropped = ndp_xmit(src_ill,
2415 ND_NEIGHBOR_SOLICIT,
2416 src_ill->ill_phys_addr,
2417 src_ill->ill_phys_addr_length,
2418 &sender6, &ncec->ncec_addr,
2419 NDP_UNICAST);
2420 } else {
2421 /*
2422 * since the nce is REACHABLE,
2423 * the ARP request will be sent out
2424 * as a link-layer unicast.
2425 */
2426 dropped = (arp_request(ncec, sender4,
2427 src_ill) == 0);
2428 mutex_exit(&ncec->ncec_lock);
2429 }
2430 if (!dropped) {
2431 mutex_enter(&ncec->ncec_lock);
2432 ncec->ncec_pcnt--;
2433 mutex_exit(&ncec->ncec_lock);
2434 }
2435 nce_restart_timer(ncec,
2436 ill->ill_reachable_retrans_time);
2437 }
2438 } else if (ncec->ncec_pcnt < 0) {
2439 /* No hope, delete the ncec */
2440 /* Tell datapath it went bad */
2441 ncec->ncec_state = ND_UNREACHABLE;
2442 mutex_exit(&ncec->ncec_lock);
2443 if (ip_debug > 2) {
2444 /* ip1dbg */
2445 pr_addr_dbg("nce_timer: Delete NCE for"
2446 " dst %s\n", (isv6? AF_INET6: AF_INET),
2447 &ncec->ncec_addr);
2448 }
2449 /* if static ARP can't delete. */
2450 if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2451 ncec_delete(ncec);
2452
2453 } else if (!NCE_PUBLISH(ncec)) {
2454 /*
2455 * Probe count is 0 for a dynamic entry (one that we
2456 * ourselves are not publishing). We should never get
2457 * here if NONUD was requested, hence the ASSERT below.
2458 */
2459 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2460 ip2dbg(("nce_timer: pcount=%x dst %s\n",
2461 ncec->ncec_pcnt, inet_ntop(AF_INET6,
2462 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2463 ncec->ncec_pcnt--;
2464 mutex_exit(&ncec->ncec_lock);
2465 /* Wait one interval before killing */
2466 nce_restart_timer(ncec,
2467 ill->ill_reachable_retrans_time);
2468 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2469 ipif_t *ipif;
2470 ipaddr_t ncec_addr;
2471
2472 /*
2473 * We're done probing, and we can now declare this
2474 * address to be usable. Let IP know that it's ok to
2475 * use.
2476 */
2477 ncec->ncec_state = ND_REACHABLE;
2478 ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2479 mutex_exit(&ncec->ncec_lock);
2480 if (isv6) {
2481 ipif = ipif_lookup_addr_exact_v6(
2482 &ncec->ncec_addr, ill, ipst);
2483 } else {
2484 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2485 ncec_addr);
2486 ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2487 ipst);
2488 }
2489 if (ipif != NULL) {
2490 if (ipif->ipif_was_dup) {
2491 char ibuf[LIFNAMSIZ];
2492 char sbuf[INET6_ADDRSTRLEN];
2493
2494 ipif->ipif_was_dup = B_FALSE;
2495 (void) inet_ntop(AF_INET6,
2496 &ipif->ipif_v6lcl_addr,
2497 sbuf, sizeof (sbuf));
2498 ipif_get_name(ipif, ibuf,
2499 sizeof (ibuf));
2500 cmn_err(CE_NOTE, "recovered address "
2501 "%s on %s", sbuf, ibuf);
2502 }
2503 if ((ipif->ipif_flags & IPIF_UP) &&
2504 !ipif->ipif_addr_ready)
2505 ipif_up_notify(ipif);
2506 ipif->ipif_addr_ready = 1;
2507 ipif_refrele(ipif);
2508 }
2509 if (!isv6 && arp_no_defense)
2510 break;
2511 /* Begin defending our new address */
2512 if (ncec->ncec_unsolicit_count > 0) {
2513 ncec->ncec_unsolicit_count--;
2514 if (isv6) {
2515 dropped = ndp_announce(ncec);
2516 } else {
2517 dropped = arp_announce(ncec);
2518 }
2519
2520 if (dropped)
2521 ncec->ncec_unsolicit_count++;
2522 else
2523 ncec->ncec_last_time_defended =
2524 ddi_get_lbolt();
2525 }
2526 if (ncec->ncec_unsolicit_count > 0) {
2527 nce_restart_timer(ncec,
2528 ANNOUNCE_INTERVAL(isv6));
2529 } else if (DEFENSE_INTERVAL(isv6) != 0) {
2530 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2531 }
2532 } else {
2533 /*
2534 * This is an address we're probing to be our own, but
2535 * the ill is down. Wait until it comes back before
2536 * doing anything, but switch to reachable state so
2537 * that the restart will work.
2538 */
2539 ncec->ncec_state = ND_REACHABLE;
2540 mutex_exit(&ncec->ncec_lock);
2541 }
2542 break;
2543 case ND_INCOMPLETE: {
2544 mblk_t *mp, *nextmp;
2545 mblk_t **prevmpp;
2546
2547 /*
2548 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2549 * for any IPMP probe packets, and toss them. IPMP probe
2550 * packets will always be at the head of ncec_qd_mp, so that
2551 * we can stop at the first queued ND packet that is
2552 * not a probe packet.
2553 */
2554 prevmpp = &ncec->ncec_qd_mp;
2555 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2556 nextmp = mp->b_next;
2557
2558 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2559 inet_freemsg(mp);
2560 ncec->ncec_nprobes--;
2561 *prevmpp = nextmp;
2562 } else {
2563 prevmpp = &mp->b_next;
2564 }
2565 }
2566
2567 /*
2568 * Must be resolver's retransmit timer.
2569 */
2570 mutex_exit(&ncec->ncec_lock);
2571 ip_ndp_resolve(ncec);
2572 break;
2573 }
2574 case ND_REACHABLE:
2575 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2576 ncec->ncec_unsolicit_count != 0) ||
2577 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2578 if (ncec->ncec_unsolicit_count > 0) {
2579 ncec->ncec_unsolicit_count--;
2580 mutex_exit(&ncec->ncec_lock);
2581 /*
2582 * When we get to zero announcements left,
2583 * switch to address defense
2584 */
2585 } else {
2586 boolean_t rate_limit;
2587
2588 mutex_exit(&ncec->ncec_lock);
2589 rate_limit = ill_defend_rate_limit(ill, ncec);
2590 if (rate_limit) {
2591 nce_restart_timer(ncec,
2592 DEFENSE_INTERVAL(isv6));
2593 break;
2594 }
2595 }
2596 if (isv6) {
2597 dropped = ndp_announce(ncec);
2598 } else {
2599 dropped = arp_announce(ncec);
2600 }
2601 mutex_enter(&ncec->ncec_lock);
2602 if (dropped) {
2603 ncec->ncec_unsolicit_count++;
2604 } else {
2605 ncec->ncec_last_time_defended =
2606 ddi_get_lbolt();
2607 }
2608 mutex_exit(&ncec->ncec_lock);
2609 if (ncec->ncec_unsolicit_count != 0) {
2610 nce_restart_timer(ncec,
2611 ANNOUNCE_INTERVAL(isv6));
2612 } else {
2613 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2614 }
2615 } else {
2616 mutex_exit(&ncec->ncec_lock);
2617 }
2618 break;
2619 default:
2620 mutex_exit(&ncec->ncec_lock);
2621 break;
2622 }
2623 done:
2624 ncec_refrele(ncec);
2625 ill_refrele(src_ill);
2626 }
2627
2628 /*
2629 * Set a link layer address from the ll_addr passed in.
2630 * Copy SAP from ill.
2631 */
2632 static void
nce_set_ll(ncec_t * ncec,uchar_t * ll_addr)2633 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2634 {
2635 ill_t *ill = ncec->ncec_ill;
2636
2637 ASSERT(ll_addr != NULL);
2638 if (ill->ill_phys_addr_length > 0) {
2639 /*
2640 * The bcopy() below used to be called for the physical address
2641 * length rather than the link layer address length. For
2642 * ethernet and many other media, the phys_addr and lla are
2643 * identical.
2644 *
2645 * The phys_addr and lla may not be the same for devices that
2646 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2647 * no known instances of these.
2648 *
2649 * For PPP or other interfaces with a zero length
2650 * physical address, don't do anything here.
2651 * The bcopy() with a zero phys_addr length was previously
2652 * a no-op for interfaces with a zero-length physical address.
2653 * Using the lla for them would change the way they operate.
2654 * Doing nothing in such cases preserves expected behavior.
2655 */
2656 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2657 }
2658 }
2659
2660 boolean_t
nce_cmp_ll_addr(const ncec_t * ncec,const uchar_t * ll_addr,uint32_t ll_addr_len)2661 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2662 uint32_t ll_addr_len)
2663 {
2664 ASSERT(ncec->ncec_lladdr != NULL);
2665 if (ll_addr == NULL)
2666 return (B_FALSE);
2667 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2668 return (B_TRUE);
2669 return (B_FALSE);
2670 }
2671
2672 /*
2673 * Updates the link layer address or the reachability state of
2674 * a cache entry. Reset probe counter if needed.
2675 */
2676 void
nce_update(ncec_t * ncec,uint16_t new_state,uchar_t * new_ll_addr)2677 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2678 {
2679 ill_t *ill = ncec->ncec_ill;
2680 boolean_t need_stop_timer = B_FALSE;
2681 boolean_t need_fastpath_update = B_FALSE;
2682 nce_t *nce = NULL;
2683 timeout_id_t tid;
2684
2685 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2686 /*
2687 * If this interface does not do NUD, there is no point
2688 * in allowing an update to the cache entry. Although
2689 * we will respond to NS.
2690 * The only time we accept an update for a resolver when
2691 * NUD is turned off is when it has just been created.
2692 * Non-Resolvers will always be created as REACHABLE.
2693 */
2694 if (new_state != ND_UNCHANGED) {
2695 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2696 (ncec->ncec_state != ND_INCOMPLETE))
2697 return;
2698 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2699 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2700 need_stop_timer = B_TRUE;
2701 if (new_state == ND_REACHABLE)
2702 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2703 else {
2704 /* We force NUD in this case */
2705 ncec->ncec_last = 0;
2706 }
2707 ncec->ncec_state = new_state;
2708 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2709 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2710 new_state == ND_INCOMPLETE);
2711 }
2712 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2713 tid = ncec->ncec_timeout_id;
2714 ncec->ncec_timeout_id = 0;
2715 }
2716 /*
2717 * Re-trigger fastpath probe and
2718 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2719 * whatever packets that happens to be transmitting at the time.
2720 */
2721 if (new_ll_addr != NULL) {
2722 bcopy(new_ll_addr, ncec->ncec_lladdr,
2723 ill->ill_phys_addr_length);
2724 need_fastpath_update = B_TRUE;
2725 }
2726 mutex_exit(&ncec->ncec_lock);
2727 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2728 if (tid != 0)
2729 (void) untimeout(tid);
2730 }
2731 if (need_fastpath_update) {
2732 /*
2733 * Delete any existing existing dlur_mp and fp_mp information.
2734 * For IPMP interfaces, all underlying ill's must be checked
2735 * and purged.
2736 */
2737 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2738 /*
2739 * add the new dlur_mp and fp_mp
2740 */
2741 nce = nce_fastpath(ncec, B_TRUE, NULL);
2742 if (nce != NULL)
2743 nce_refrele(nce);
2744 }
2745 mutex_enter(&ncec->ncec_lock);
2746 }
2747
2748 static void
nce_queue_mp_common(ncec_t * ncec,mblk_t * mp,boolean_t head_insert)2749 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2750 {
2751 uint_t count = 0;
2752 mblk_t **mpp, *tmp;
2753
2754 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2755
2756 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2757 if (++count > ncec->ncec_ill->ill_max_buf) {
2758 tmp = ncec->ncec_qd_mp->b_next;
2759 ncec->ncec_qd_mp->b_next = NULL;
2760 /*
2761 * if we never create data addrs on the under_ill
2762 * does this matter?
2763 */
2764 BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2765 ipIfStatsOutDiscards);
2766 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2767 ncec->ncec_ill);
2768 freemsg(ncec->ncec_qd_mp);
2769 ncec->ncec_qd_mp = tmp;
2770 }
2771 }
2772
2773 if (head_insert) {
2774 ncec->ncec_nprobes++;
2775 mp->b_next = ncec->ncec_qd_mp;
2776 ncec->ncec_qd_mp = mp;
2777 } else {
2778 *mpp = mp;
2779 }
2780 }
2781
2782 /*
2783 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
2784 * queued at the head or tail of the queue based on the input argument
2785 * 'head_insert'. The caller should specify this argument as B_TRUE if this
2786 * packet is an IPMP probe packet, in which case the following happens:
2787 *
2788 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal
2789 * (non-ipmp_probe) load-speading case where the source address of the ND
2790 * packet is not tied to ncec_ill. If the ill bound to the source address
2791 * cannot receive, the response to the ND packet will not be received.
2792 * However, if ND packets for ncec_ill's probes are queued behind that ND
2793 * packet, those probes will also fail to be sent, and thus in.mpathd will
2794 * erroneously conclude that ncec_ill has also failed.
2795 *
2796 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
2797 * the first attempt. This ensures that ND problems do not manifest as
2798 * probe RTT spikes.
2799 *
2800 * We achieve this by inserting ipmp_probe() packets at the head of the
2801 * nce_queue.
2802 *
2803 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
2804 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
2805 */
2806 void
nce_queue_mp(ncec_t * ncec,mblk_t * mp,boolean_t head_insert)2807 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2808 {
2809 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2810 nce_queue_mp_common(ncec, mp, head_insert);
2811 }
2812
2813 /*
2814 * Called when address resolution failed due to a timeout.
2815 * Send an ICMP unreachable in response to all queued packets.
2816 */
2817 void
ndp_resolv_failed(ncec_t * ncec)2818 ndp_resolv_failed(ncec_t *ncec)
2819 {
2820 mblk_t *mp, *nxt_mp;
2821 char buf[INET6_ADDRSTRLEN];
2822 ill_t *ill = ncec->ncec_ill;
2823 ip_recv_attr_t iras;
2824
2825 bzero(&iras, sizeof (iras));
2826 iras.ira_flags = 0;
2827 /*
2828 * we are setting the ira_rill to the ipmp_ill (instead of
2829 * the actual ill on which the packet was received), but this
2830 * is ok because we don't actually need the real ira_rill.
2831 * to send the icmp unreachable to the sender.
2832 */
2833 iras.ira_ill = iras.ira_rill = ill;
2834 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2835 iras.ira_rifindex = iras.ira_ruifindex;
2836
2837 ip1dbg(("ndp_resolv_failed: dst %s\n",
2838 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2839 mutex_enter(&ncec->ncec_lock);
2840 mp = ncec->ncec_qd_mp;
2841 ncec->ncec_qd_mp = NULL;
2842 ncec->ncec_nprobes = 0;
2843 mutex_exit(&ncec->ncec_lock);
2844 while (mp != NULL) {
2845 nxt_mp = mp->b_next;
2846 mp->b_next = NULL;
2847
2848 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2849 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2850 mp, ill);
2851 icmp_unreachable_v6(mp,
2852 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2853 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2854 mp = nxt_mp;
2855 }
2856 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2857 }
2858
2859 /*
2860 * Handle the completion of NDP and ARP resolution.
2861 */
2862 void
nce_resolv_ok(ncec_t * ncec)2863 nce_resolv_ok(ncec_t *ncec)
2864 {
2865 mblk_t *mp;
2866 uint_t pkt_len;
2867 iaflags_t ixaflags = IXAF_NO_TRACE;
2868 nce_t *nce;
2869 ill_t *ill = ncec->ncec_ill;
2870 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2871 ip_stack_t *ipst = ill->ill_ipst;
2872
2873 if (IS_IPMP(ncec->ncec_ill)) {
2874 nce_resolv_ipmp_ok(ncec);
2875 return;
2876 }
2877 /* non IPMP case */
2878
2879 mutex_enter(&ncec->ncec_lock);
2880 ASSERT(ncec->ncec_nprobes == 0);
2881 mp = ncec->ncec_qd_mp;
2882 ncec->ncec_qd_mp = NULL;
2883 mutex_exit(&ncec->ncec_lock);
2884
2885 while (mp != NULL) {
2886 mblk_t *nxt_mp;
2887
2888 if (ill->ill_isv6) {
2889 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2890
2891 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2892 } else {
2893 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2894
2895 ixaflags |= IXAF_IS_IPV4;
2896 pkt_len = ntohs(ipha->ipha_length);
2897 }
2898 nxt_mp = mp->b_next;
2899 mp->b_next = NULL;
2900 /*
2901 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2902 * longer available, but it's ok to drop this flag because TCP
2903 * has its own flow-control in effect, so TCP packets
2904 * are not likely to get here when flow-control is in effect.
2905 */
2906 mutex_enter(&ill->ill_lock);
2907 nce = nce_lookup(ill, &ncec->ncec_addr);
2908 mutex_exit(&ill->ill_lock);
2909
2910 if (nce == NULL) {
2911 if (isv6) {
2912 BUMP_MIB(&ipst->ips_ip6_mib,
2913 ipIfStatsOutDiscards);
2914 } else {
2915 BUMP_MIB(&ipst->ips_ip_mib,
2916 ipIfStatsOutDiscards);
2917 }
2918 ip_drop_output("ipIfStatsOutDiscards - no nce",
2919 mp, NULL);
2920 freemsg(mp);
2921 } else {
2922 /*
2923 * We don't know the zoneid, but
2924 * ip_xmit does not care since IXAF_NO_TRACE
2925 * is set. (We traced the packet the first
2926 * time through ip_xmit.)
2927 */
2928 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2929 ALL_ZONES, 0, NULL);
2930 nce_refrele(nce);
2931 }
2932 mp = nxt_mp;
2933 }
2934
2935 ncec_cb_dispatch(ncec); /* complete callbacks */
2936 }
2937
2938 /*
2939 * Called by SIOCSNDP* ioctl to add/change an ncec entry
2940 * and the corresponding attributes.
2941 * Disallow states other than ND_REACHABLE or ND_STALE.
2942 */
2943 int
ndp_sioc_update(ill_t * ill,lif_nd_req_t * lnr)2944 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2945 {
2946 sin6_t *sin6;
2947 in6_addr_t *addr;
2948 ncec_t *ncec;
2949 nce_t *nce;
2950 int err = 0;
2951 uint16_t new_flags = 0;
2952 uint16_t old_flags = 0;
2953 int inflags = lnr->lnr_flags;
2954 ip_stack_t *ipst = ill->ill_ipst;
2955 boolean_t do_postprocess = B_FALSE;
2956
2957 ASSERT(ill->ill_isv6);
2958 if ((lnr->lnr_state_create != ND_REACHABLE) &&
2959 (lnr->lnr_state_create != ND_STALE))
2960 return (EINVAL);
2961
2962 sin6 = (sin6_t *)&lnr->lnr_addr;
2963 addr = &sin6->sin6_addr;
2964
2965 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2966 ASSERT(!IS_UNDER_IPMP(ill));
2967 nce = nce_lookup_addr(ill, addr);
2968 if (nce != NULL)
2969 new_flags = nce->nce_common->ncec_flags;
2970
2971 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2972 case NDF_ISROUTER_ON:
2973 new_flags |= NCE_F_ISROUTER;
2974 break;
2975 case NDF_ISROUTER_OFF:
2976 new_flags &= ~NCE_F_ISROUTER;
2977 break;
2978 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2979 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2980 if (nce != NULL)
2981 nce_refrele(nce);
2982 return (EINVAL);
2983 }
2984 if (inflags & NDF_STATIC)
2985 new_flags |= NCE_F_STATIC;
2986
2987 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2988 case NDF_ANYCAST_ON:
2989 new_flags |= NCE_F_ANYCAST;
2990 break;
2991 case NDF_ANYCAST_OFF:
2992 new_flags &= ~NCE_F_ANYCAST;
2993 break;
2994 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2995 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2996 if (nce != NULL)
2997 nce_refrele(nce);
2998 return (EINVAL);
2999 }
3000
3001 if (nce == NULL) {
3002 err = nce_add_v6(ill,
3003 (uchar_t *)lnr->lnr_hdw_addr,
3004 ill->ill_phys_addr_length,
3005 addr,
3006 new_flags,
3007 lnr->lnr_state_create,
3008 &nce);
3009 if (err != 0) {
3010 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3011 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3012 return (err);
3013 } else {
3014 do_postprocess = B_TRUE;
3015 }
3016 }
3017 ncec = nce->nce_common;
3018 old_flags = ncec->ncec_flags;
3019 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3020 ncec_router_to_host(ncec);
3021 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3022 if (do_postprocess)
3023 err = nce_add_v6_postprocess(nce);
3024 nce_refrele(nce);
3025 return (0);
3026 }
3027 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3028
3029 if (do_postprocess)
3030 err = nce_add_v6_postprocess(nce);
3031 /*
3032 * err cannot be anything other than 0 because we don't support
3033 * proxy arp of static addresses.
3034 */
3035 ASSERT(err == 0);
3036
3037 mutex_enter(&ncec->ncec_lock);
3038 ncec->ncec_flags = new_flags;
3039 mutex_exit(&ncec->ncec_lock);
3040 /*
3041 * Note that we ignore the state at this point, which
3042 * should be either STALE or REACHABLE. Instead we let
3043 * the link layer address passed in to determine the state
3044 * much like incoming packets.
3045 */
3046 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3047 nce_refrele(nce);
3048 return (0);
3049 }
3050
3051 /*
3052 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3053 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3054 * be held to ensure that they are in the same group.
3055 */
3056 static nce_t *
nce_fastpath_create(ill_t * ill,ncec_t * ncec)3057 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3058 {
3059
3060 nce_t *nce;
3061
3062 nce = nce_ill_lookup_then_add(ill, ncec);
3063
3064 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3065 return (nce);
3066
3067 /*
3068 * hold the ncec_lock to synchronize with nce_update() so that,
3069 * at the end of this function, the contents of nce_dlur_mp are
3070 * consistent with ncec->ncec_lladdr, even though some intermediate
3071 * packet may have been sent out with a mangled address, which would
3072 * only be a transient condition.
3073 */
3074 mutex_enter(&ncec->ncec_lock);
3075 if (ncec->ncec_lladdr != NULL) {
3076 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3077 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3078 } else {
3079 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3080 ill->ill_sap_length);
3081 }
3082 mutex_exit(&ncec->ncec_lock);
3083 return (nce);
3084 }
3085
3086 /*
3087 * we make nce_fp_mp to have an M_DATA prepend.
3088 * The caller ensures there is hold on ncec for this function.
3089 * Note that since ill_fastpath_probe() copies the mblk there is
3090 * no need to hold the nce or ncec beyond this function.
3091 *
3092 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3093 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3094 * and will be returned back by this function, so that no extra nce_refrele
3095 * is required for the caller. The calls from nce_add_common() use this
3096 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3097 * nce_refrele of the returned nce (when it is non-null).
3098 */
3099 nce_t *
nce_fastpath(ncec_t * ncec,boolean_t trigger_fp_req,nce_t * ncec_nce)3100 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3101 {
3102 nce_t *nce;
3103 ill_t *ill = ncec->ncec_ill;
3104
3105 ASSERT(ill != NULL);
3106
3107 if (IS_IPMP(ill) && trigger_fp_req) {
3108 trigger_fp_req = B_FALSE;
3109 ipmp_ncec_refresh_nce(ncec);
3110 }
3111
3112 /*
3113 * If the caller already has the nce corresponding to the ill, use
3114 * that one. Otherwise we have to lookup/add the nce. Calls from
3115 * nce_add_common() fall in the former category, and have just done
3116 * the nce lookup/add that can be reused.
3117 */
3118 if (ncec_nce == NULL)
3119 nce = nce_fastpath_create(ill, ncec);
3120 else
3121 nce = ncec_nce;
3122
3123 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3124 return (nce);
3125
3126 if (trigger_fp_req)
3127 nce_fastpath_trigger(nce);
3128 return (nce);
3129 }
3130
3131 /*
3132 * Trigger fastpath on nce. No locks may be held.
3133 */
3134 static void
nce_fastpath_trigger(nce_t * nce)3135 nce_fastpath_trigger(nce_t *nce)
3136 {
3137 int res;
3138 ill_t *ill = nce->nce_ill;
3139 ncec_t *ncec = nce->nce_common;
3140
3141 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3142 /*
3143 * EAGAIN is an indication of a transient error
3144 * i.e. allocation failure etc. leave the ncec in the list it
3145 * will be updated when another probe happens for another ire
3146 * if not it will be taken out of the list when the ire is
3147 * deleted.
3148 */
3149 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3150 nce_fastpath_list_delete(ill, ncec, NULL);
3151 }
3152
3153 /*
3154 * Add ncec to the nce fastpath list on ill.
3155 */
3156 static nce_t *
nce_ill_lookup_then_add_locked(ill_t * ill,ncec_t * ncec)3157 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3158 {
3159 nce_t *nce = NULL;
3160
3161 ASSERT(MUTEX_HELD(&ill->ill_lock));
3162 /*
3163 * Atomically ensure that the ill is not CONDEMNED and is not going
3164 * down, before adding the NCE.
3165 */
3166 if (ill->ill_state_flags & ILL_CONDEMNED)
3167 return (NULL);
3168 mutex_enter(&ncec->ncec_lock);
3169 /*
3170 * if ncec has not been deleted and
3171 * is not already in the list add it.
3172 */
3173 if (!NCE_ISCONDEMNED(ncec)) {
3174 nce = nce_lookup(ill, &ncec->ncec_addr);
3175 if (nce != NULL)
3176 goto done;
3177 nce = nce_add(ill, ncec);
3178 }
3179 done:
3180 mutex_exit(&ncec->ncec_lock);
3181 return (nce);
3182 }
3183
3184 nce_t *
nce_ill_lookup_then_add(ill_t * ill,ncec_t * ncec)3185 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3186 {
3187 nce_t *nce;
3188
3189 mutex_enter(&ill->ill_lock);
3190 nce = nce_ill_lookup_then_add_locked(ill, ncec);
3191 mutex_exit(&ill->ill_lock);
3192 return (nce);
3193 }
3194
3195
3196 /*
3197 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3198 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3199 * entry after all locks have been dropped.
3200 */
3201 void
nce_fastpath_list_delete(ill_t * ill,ncec_t * ncec,list_t * dead)3202 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3203 {
3204 nce_t *nce;
3205
3206 ASSERT(ill != NULL);
3207
3208 /* delete any nces referencing the ncec from underlying ills */
3209 if (IS_IPMP(ill))
3210 ipmp_ncec_delete_nce(ncec);
3211
3212 /* now the ill itself */
3213 mutex_enter(&ill->ill_lock);
3214 for (nce = list_head(&ill->ill_nce); nce != NULL;
3215 nce = list_next(&ill->ill_nce, nce)) {
3216 if (nce->nce_common == ncec) {
3217 nce_refhold(nce);
3218 nce_delete(nce);
3219 break;
3220 }
3221 }
3222 mutex_exit(&ill->ill_lock);
3223 if (nce != NULL) {
3224 if (dead == NULL)
3225 nce_refrele(nce);
3226 else
3227 list_insert_tail(dead, nce);
3228 }
3229 }
3230
3231 /*
3232 * when the fastpath response does not fit in the datab
3233 * associated with the existing nce_fp_mp, we delete and
3234 * add the nce to retrigger fastpath based on the information
3235 * in the ncec_t.
3236 */
3237 static nce_t *
nce_delete_then_add(nce_t * nce)3238 nce_delete_then_add(nce_t *nce)
3239 {
3240 ill_t *ill = nce->nce_ill;
3241 nce_t *newnce = NULL;
3242
3243 ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3244 (void *)nce, ill->ill_name));
3245 mutex_enter(&ill->ill_lock);
3246 mutex_enter(&nce->nce_common->ncec_lock);
3247 nce_delete(nce);
3248 /*
3249 * Make sure that ncec is not condemned before adding. We hold the
3250 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3251 * ipmp_ncec_delete_nce()
3252 */
3253 if (!NCE_ISCONDEMNED(nce->nce_common))
3254 newnce = nce_add(ill, nce->nce_common);
3255 mutex_exit(&nce->nce_common->ncec_lock);
3256 mutex_exit(&ill->ill_lock);
3257 nce_refrele(nce);
3258 return (newnce); /* could be null if nomem */
3259 }
3260
3261 typedef struct nce_fp_match_s {
3262 nce_t *nce_fp_match_res;
3263 mblk_t *nce_fp_match_ack_mp;
3264 } nce_fp_match_t;
3265
3266 /* ARGSUSED */
3267 static int
nce_fastpath_match_dlur(ill_t * ill,nce_t * nce,void * arg)3268 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3269 {
3270 nce_fp_match_t *nce_fp_marg = arg;
3271 ncec_t *ncec = nce->nce_common;
3272 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3273 uchar_t *mp_rptr, *ud_mp_rptr;
3274 mblk_t *ud_mp = nce->nce_dlur_mp;
3275 ptrdiff_t cmplen;
3276
3277 /*
3278 * mp is the mp associated with the fastpath ack.
3279 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3280 * under consideration. If the contents match, then the
3281 * fastpath ack is used to update the nce.
3282 */
3283 if (ud_mp == NULL)
3284 return (0);
3285 mp_rptr = mp->b_rptr;
3286 cmplen = mp->b_wptr - mp_rptr;
3287 ASSERT(cmplen >= 0);
3288
3289 ud_mp_rptr = ud_mp->b_rptr;
3290 /*
3291 * The ncec is locked here to prevent any other threads from accessing
3292 * and changing nce_dlur_mp when the address becomes resolved to an
3293 * lla while we're in the middle of looking at and comparing the
3294 * hardware address (lla). It is also locked to prevent multiple
3295 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3296 * time.
3297 */
3298 mutex_enter(&ncec->ncec_lock);
3299 if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3300 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3301 nce_fp_marg->nce_fp_match_res = nce;
3302 mutex_exit(&ncec->ncec_lock);
3303 nce_refhold(nce);
3304 return (1);
3305 }
3306 mutex_exit(&ncec->ncec_lock);
3307 return (0);
3308 }
3309
3310 /*
3311 * Update all NCE's that are not in fastpath mode and
3312 * have an nce_fp_mp that matches mp. mp->b_cont contains
3313 * the fastpath header.
3314 *
3315 * This function returns no value; if no matching nce is found it does nothing.
3316 */
3317 void
nce_fastpath_update(ill_t * ill,mblk_t * mp)3318 nce_fastpath_update(ill_t *ill, mblk_t *mp)
3319 {
3320 nce_fp_match_t nce_fp_marg;
3321 nce_t *nce;
3322 mblk_t *nce_fp_mp, *fp_mp;
3323
3324 nce_fp_marg.nce_fp_match_res = NULL;
3325 nce_fp_marg.nce_fp_match_ack_mp = mp;
3326
3327 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3328
3329 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3330 return;
3331
3332 mutex_enter(&nce->nce_lock);
3333 nce_fp_mp = nce->nce_fp_mp;
3334
3335 if (nce_fp_mp != NULL) {
3336 fp_mp = mp->b_cont;
3337 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3338 nce_fp_mp->b_datap->db_lim) {
3339 mutex_exit(&nce->nce_lock);
3340 nce = nce_delete_then_add(nce);
3341 if (nce == NULL) {
3342 return;
3343 }
3344 mutex_enter(&nce->nce_lock);
3345 nce_fp_mp = nce->nce_fp_mp;
3346 }
3347 }
3348
3349 /* Matched - install mp as the fastpath mp */
3350 if (nce_fp_mp == NULL) {
3351 fp_mp = dupb(mp->b_cont);
3352 nce->nce_fp_mp = fp_mp;
3353 } else {
3354 fp_mp = mp->b_cont;
3355 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3356 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3357 + MBLKL(fp_mp);
3358 }
3359 mutex_exit(&nce->nce_lock);
3360 nce_refrele(nce);
3361 }
3362
3363 /*
3364 * Return a pointer to a given option in the packet.
3365 * Assumes that option part of the packet have already been validated.
3366 */
3367 nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t * opt,int optlen,int opt_type)3368 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3369 {
3370 while (optlen > 0) {
3371 if (opt->nd_opt_type == opt_type)
3372 return (opt);
3373 optlen -= 8 * opt->nd_opt_len;
3374 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3375 }
3376 return (NULL);
3377 }
3378
3379 /*
3380 * Verify all option lengths present are > 0, also check to see
3381 * if the option lengths and packet length are consistent.
3382 */
3383 boolean_t
ndp_verify_optlen(nd_opt_hdr_t * opt,int optlen)3384 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3385 {
3386 ASSERT(opt != NULL);
3387 while (optlen > 0) {
3388 if (opt->nd_opt_len == 0)
3389 return (B_FALSE);
3390 optlen -= 8 * opt->nd_opt_len;
3391 if (optlen < 0)
3392 return (B_FALSE);
3393 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3394 }
3395 return (B_TRUE);
3396 }
3397
3398 /*
3399 * ncec_walk function.
3400 * Free a fraction of the NCE cache entries.
3401 *
3402 * A possible optimization here would be to use ncec_last where possible, and
3403 * delete the least-frequently used entry, which would require more complex
3404 * computation as we walk through the ncec's (e.g., track ncec entries by
3405 * order of ncec_last and/or maintain state)
3406 */
3407 static void
ncec_cache_reclaim(ncec_t * ncec,char * arg)3408 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3409 {
3410 ip_stack_t *ipst = ncec->ncec_ipst;
3411 uint_t fraction = *(uint_t *)arg;
3412 uint_t rand;
3413
3414 if ((ncec->ncec_flags &
3415 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3416 return;
3417 }
3418
3419 rand = (uint_t)ddi_get_lbolt() +
3420 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3421 if ((rand/fraction)*fraction == rand) {
3422 IP_STAT(ipst, ip_nce_reclaim_deleted);
3423 ncec_delete(ncec);
3424 }
3425 }
3426
3427 /*
3428 * kmem_cache callback to free up memory.
3429 *
3430 * For now we just delete a fixed fraction.
3431 */
3432 static void
ip_nce_reclaim_stack(ip_stack_t * ipst)3433 ip_nce_reclaim_stack(ip_stack_t *ipst)
3434 {
3435 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction;
3436
3437 IP_STAT(ipst, ip_nce_reclaim_calls);
3438
3439 ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3440
3441 /*
3442 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3443 * Get them to update any stale references to drop any refholds they
3444 * have.
3445 */
3446 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3447 }
3448
3449 /*
3450 * Called by the memory allocator subsystem directly, when the system
3451 * is running low on memory.
3452 */
3453 /* ARGSUSED */
3454 void
ip_nce_reclaim(void * args)3455 ip_nce_reclaim(void *args)
3456 {
3457 netstack_handle_t nh;
3458 netstack_t *ns;
3459 ip_stack_t *ipst;
3460
3461 netstack_next_init(&nh);
3462 while ((ns = netstack_next(&nh)) != NULL) {
3463 /*
3464 * netstack_next() can return a netstack_t with a NULL
3465 * netstack_ip at boot time.
3466 */
3467 if ((ipst = ns->netstack_ip) == NULL) {
3468 netstack_rele(ns);
3469 continue;
3470 }
3471 ip_nce_reclaim_stack(ipst);
3472 netstack_rele(ns);
3473 }
3474 netstack_next_fini(&nh);
3475 }
3476
3477 #ifdef DEBUG
3478 void
ncec_trace_ref(ncec_t * ncec)3479 ncec_trace_ref(ncec_t *ncec)
3480 {
3481 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3482
3483 if (ncec->ncec_trace_disable)
3484 return;
3485
3486 if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3487 ncec->ncec_trace_disable = B_TRUE;
3488 ncec_trace_cleanup(ncec);
3489 }
3490 }
3491
3492 void
ncec_untrace_ref(ncec_t * ncec)3493 ncec_untrace_ref(ncec_t *ncec)
3494 {
3495 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3496
3497 if (!ncec->ncec_trace_disable)
3498 th_trace_unref(ncec);
3499 }
3500
3501 static void
ncec_trace_cleanup(const ncec_t * ncec)3502 ncec_trace_cleanup(const ncec_t *ncec)
3503 {
3504 th_trace_cleanup(ncec, ncec->ncec_trace_disable);
3505 }
3506 #endif
3507
3508 /*
3509 * Called when address resolution fails due to a timeout.
3510 * Send an ICMP unreachable in response to all queued packets.
3511 */
3512 void
arp_resolv_failed(ncec_t * ncec)3513 arp_resolv_failed(ncec_t *ncec)
3514 {
3515 mblk_t *mp, *nxt_mp;
3516 char buf[INET6_ADDRSTRLEN];
3517 struct in_addr ipv4addr;
3518 ill_t *ill = ncec->ncec_ill;
3519 ip_stack_t *ipst = ncec->ncec_ipst;
3520 ip_recv_attr_t iras;
3521
3522 bzero(&iras, sizeof (iras));
3523 iras.ira_flags = IRAF_IS_IPV4;
3524 /*
3525 * we are setting the ira_rill to the ipmp_ill (instead of
3526 * the actual ill on which the packet was received), but this
3527 * is ok because we don't actually need the real ira_rill.
3528 * to send the icmp unreachable to the sender.
3529 */
3530 iras.ira_ill = iras.ira_rill = ill;
3531 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3532 iras.ira_rifindex = iras.ira_ruifindex;
3533
3534 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3535 ip3dbg(("arp_resolv_failed: dst %s\n",
3536 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3537 mutex_enter(&ncec->ncec_lock);
3538 mp = ncec->ncec_qd_mp;
3539 ncec->ncec_qd_mp = NULL;
3540 ncec->ncec_nprobes = 0;
3541 mutex_exit(&ncec->ncec_lock);
3542 while (mp != NULL) {
3543 nxt_mp = mp->b_next;
3544 mp->b_next = NULL;
3545
3546 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3547 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3548 mp, ill);
3549 if (ipst->ips_ip_arp_icmp_error) {
3550 ip3dbg(("arp_resolv_failed: "
3551 "Calling icmp_unreachable\n"));
3552 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3553 } else {
3554 freemsg(mp);
3555 }
3556 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3557 mp = nxt_mp;
3558 }
3559 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3560 }
3561
3562 /*
3563 * if ill is an under_ill, translate it to the ipmp_ill and add the
3564 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3565 * one on the underlying in_ill) will be created for the
3566 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3567 */
3568 int
nce_lookup_then_add_v4(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)3569 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3570 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3571 {
3572 int err;
3573 in6_addr_t addr6;
3574 ip_stack_t *ipst = ill->ill_ipst;
3575 nce_t *nce, *upper_nce = NULL;
3576 ill_t *in_ill = ill, *under = NULL;
3577 boolean_t need_ill_refrele = B_FALSE;
3578
3579 if (flags & NCE_F_MCAST) {
3580 /*
3581 * hw_addr will be figured out in nce_set_multicast_v4;
3582 * caller needs to pass in the cast_ill for ipmp
3583 */
3584 ASSERT(hw_addr == NULL);
3585 ASSERT(!IS_IPMP(ill));
3586 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3587 return (err);
3588 }
3589
3590 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3591 ill = ipmp_ill_hold_ipmp_ill(ill);
3592 if (ill == NULL)
3593 return (ENXIO);
3594 need_ill_refrele = B_TRUE;
3595 }
3596 if ((flags & NCE_F_BCAST) != 0) {
3597 /*
3598 * IPv4 broadcast ncec: compute the hwaddr.
3599 */
3600 if (IS_IPMP(ill)) {
3601 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3602 if (under == NULL) {
3603 if (need_ill_refrele)
3604 ill_refrele(ill);
3605 return (ENETDOWN);
3606 }
3607 hw_addr = under->ill_bcast_mp->b_rptr +
3608 NCE_LL_ADDR_OFFSET(under);
3609 hw_addr_len = under->ill_phys_addr_length;
3610 } else {
3611 hw_addr = ill->ill_bcast_mp->b_rptr +
3612 NCE_LL_ADDR_OFFSET(ill),
3613 hw_addr_len = ill->ill_phys_addr_length;
3614 }
3615 }
3616
3617 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3618 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3619 nce = nce_lookup_addr(ill, &addr6);
3620 if (nce == NULL) {
3621 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3622 state, &nce);
3623 } else {
3624 err = EEXIST;
3625 }
3626 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3627 if (err == 0)
3628 err = nce_add_v4_postprocess(nce);
3629
3630 if (in_ill != ill && nce != NULL) {
3631 nce_t *under_nce = NULL;
3632
3633 /*
3634 * in_ill was the under_ill. Try to create the under_nce.
3635 * Hold the ill_g_lock to prevent changes to group membership
3636 * until we are done.
3637 */
3638 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3639 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3640 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3641 ill_t *, ill);
3642 rw_exit(&ipst->ips_ill_g_lock);
3643 err = ENXIO;
3644 nce_refrele(nce);
3645 nce = NULL;
3646 goto bail;
3647 }
3648 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3649 if (under_nce == NULL) {
3650 rw_exit(&ipst->ips_ill_g_lock);
3651 err = EINVAL;
3652 nce_refrele(nce);
3653 nce = NULL;
3654 goto bail;
3655 }
3656 rw_exit(&ipst->ips_ill_g_lock);
3657 upper_nce = nce;
3658 nce = under_nce; /* will be returned to caller */
3659 if (NCE_ISREACHABLE(nce->nce_common))
3660 nce_fastpath_trigger(under_nce);
3661 }
3662 if (nce != NULL) {
3663 if (newnce != NULL)
3664 *newnce = nce;
3665 else
3666 nce_refrele(nce);
3667 }
3668 bail:
3669 if (under != NULL)
3670 ill_refrele(under);
3671 if (upper_nce != NULL)
3672 nce_refrele(upper_nce);
3673 if (need_ill_refrele)
3674 ill_refrele(ill);
3675
3676 return (err);
3677 }
3678
3679 /*
3680 * NDP Cache Entry creation routine for IPv4.
3681 * This routine must always be called with ndp4->ndp_g_lock held.
3682 * Prior to return, ncec_refcnt is incremented.
3683 *
3684 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec)) addresses
3685 * are always added pointing at the ipmp_ill. Thus, when the ill passed
3686 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3687 * entries will be created, both pointing at the same ncec_t. The nce_t
3688 * entries will have their nce_ill set to the ipmp_ill and the under_ill
3689 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3690 * Local addresses are always created on the ill passed to nce_add_v4.
3691 */
3692 int
nce_add_v4(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)3693 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3694 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3695 {
3696 int err;
3697 boolean_t is_multicast = (flags & NCE_F_MCAST);
3698 struct in6_addr addr6;
3699 nce_t *nce;
3700
3701 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3702 ASSERT(!ill->ill_isv6);
3703 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3704
3705 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3706 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3707 &nce);
3708 ASSERT(newnce != NULL);
3709 *newnce = nce;
3710 return (err);
3711 }
3712
3713 /*
3714 * Post-processing routine to be executed after nce_add_v4(). This function
3715 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3716 * and must be called without any locks held.
3717 *
3718 * Always returns 0, but we return an int to keep this symmetric with the
3719 * IPv6 counter-part.
3720 */
3721 int
nce_add_v4_postprocess(nce_t * nce)3722 nce_add_v4_postprocess(nce_t *nce)
3723 {
3724 ncec_t *ncec = nce->nce_common;
3725 uint16_t flags = ncec->ncec_flags;
3726 boolean_t ndp_need_dad = B_FALSE;
3727 boolean_t dropped;
3728 clock_t delay;
3729 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst;
3730 uchar_t *hw_addr = ncec->ncec_lladdr;
3731 boolean_t trigger_fastpath = B_TRUE;
3732
3733 /*
3734 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3735 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3736 * We call nce_fastpath from nce_update if the link layer address of
3737 * the peer changes from nce_update
3738 */
3739 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3740 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3741 trigger_fastpath = B_FALSE;
3742
3743 if (trigger_fastpath)
3744 nce_fastpath_trigger(nce);
3745
3746 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3747 /*
3748 * Either the caller (by passing in ND_PROBE)
3749 * or nce_add_common() (by the internally computed state
3750 * based on ncec_addr and ill_net_type) has determined
3751 * that this unicast entry needs DAD. Trigger DAD.
3752 */
3753 ndp_need_dad = B_TRUE;
3754 } else if (flags & NCE_F_UNSOL_ADV) {
3755 /*
3756 * We account for the transmit below by assigning one
3757 * less than the ndd variable. Subsequent decrements
3758 * are done in nce_timer.
3759 */
3760 mutex_enter(&ncec->ncec_lock);
3761 ncec->ncec_unsolicit_count =
3762 ipst->ips_ip_arp_publish_count - 1;
3763 mutex_exit(&ncec->ncec_lock);
3764 dropped = arp_announce(ncec);
3765 mutex_enter(&ncec->ncec_lock);
3766 if (dropped)
3767 ncec->ncec_unsolicit_count++;
3768 else
3769 ncec->ncec_last_time_defended = ddi_get_lbolt();
3770 if (ncec->ncec_unsolicit_count != 0) {
3771 nce_start_timer(ncec,
3772 ipst->ips_ip_arp_publish_interval);
3773 }
3774 mutex_exit(&ncec->ncec_lock);
3775 }
3776
3777 /*
3778 * If ncec_xmit_interval is 0, user has configured us to send the first
3779 * probe right away. Do so, and set up for the subsequent probes.
3780 */
3781 if (ndp_need_dad) {
3782 mutex_enter(&ncec->ncec_lock);
3783 if (ncec->ncec_pcnt == 0) {
3784 /*
3785 * DAD probes and announce can be
3786 * administratively disabled by setting the
3787 * probe_count to zero. Restart the timer in
3788 * this case to mark the ipif as ready.
3789 */
3790 ncec->ncec_unsolicit_count = 0;
3791 mutex_exit(&ncec->ncec_lock);
3792 nce_restart_timer(ncec, 0);
3793 } else {
3794 mutex_exit(&ncec->ncec_lock);
3795 delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3796 ipst->ips_arp_probe_delay :
3797 ipst->ips_arp_fastprobe_delay);
3798 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3799 }
3800 }
3801 return (0);
3802 }
3803
3804 /*
3805 * ncec_walk routine to update all entries that have a given destination or
3806 * gateway address and cached link layer (MAC) address. This is used when ARP
3807 * informs us that a network-to-link-layer mapping may have changed.
3808 */
3809 void
nce_update_hw_changed(ncec_t * ncec,void * arg)3810 nce_update_hw_changed(ncec_t *ncec, void *arg)
3811 {
3812 nce_hw_map_t *hwm = arg;
3813 ipaddr_t ncec_addr;
3814
3815 if (ncec->ncec_state != ND_REACHABLE)
3816 return;
3817
3818 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3819 if (ncec_addr != hwm->hwm_addr)
3820 return;
3821
3822 mutex_enter(&ncec->ncec_lock);
3823 if (hwm->hwm_flags != 0)
3824 ncec->ncec_flags = hwm->hwm_flags;
3825 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3826 mutex_exit(&ncec->ncec_lock);
3827 }
3828
3829 void
ncec_refhold(ncec_t * ncec)3830 ncec_refhold(ncec_t *ncec)
3831 {
3832 mutex_enter(&(ncec)->ncec_lock);
3833 (ncec)->ncec_refcnt++;
3834 ASSERT((ncec)->ncec_refcnt != 0);
3835 #ifdef DEBUG
3836 ncec_trace_ref(ncec);
3837 #endif
3838 mutex_exit(&(ncec)->ncec_lock);
3839 }
3840
3841 void
ncec_refhold_notr(ncec_t * ncec)3842 ncec_refhold_notr(ncec_t *ncec)
3843 {
3844 mutex_enter(&(ncec)->ncec_lock);
3845 (ncec)->ncec_refcnt++;
3846 ASSERT((ncec)->ncec_refcnt != 0);
3847 mutex_exit(&(ncec)->ncec_lock);
3848 }
3849
3850 static void
ncec_refhold_locked(ncec_t * ncec)3851 ncec_refhold_locked(ncec_t *ncec)
3852 {
3853 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3854 (ncec)->ncec_refcnt++;
3855 #ifdef DEBUG
3856 ncec_trace_ref(ncec);
3857 #endif
3858 }
3859
3860 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3861 void
ncec_refrele(ncec_t * ncec)3862 ncec_refrele(ncec_t *ncec)
3863 {
3864 mutex_enter(&(ncec)->ncec_lock);
3865 #ifdef DEBUG
3866 ncec_untrace_ref(ncec);
3867 #endif
3868 ASSERT((ncec)->ncec_refcnt != 0);
3869 if (--(ncec)->ncec_refcnt == 0) {
3870 ncec_inactive(ncec);
3871 } else {
3872 mutex_exit(&(ncec)->ncec_lock);
3873 }
3874 }
3875
3876 void
ncec_refrele_notr(ncec_t * ncec)3877 ncec_refrele_notr(ncec_t *ncec)
3878 {
3879 mutex_enter(&(ncec)->ncec_lock);
3880 ASSERT((ncec)->ncec_refcnt != 0);
3881 if (--(ncec)->ncec_refcnt == 0) {
3882 ncec_inactive(ncec);
3883 } else {
3884 mutex_exit(&(ncec)->ncec_lock);
3885 }
3886 }
3887
3888 /*
3889 * Common to IPv4 and IPv6.
3890 */
3891 void
nce_restart_timer(ncec_t * ncec,uint_t ms)3892 nce_restart_timer(ncec_t *ncec, uint_t ms)
3893 {
3894 timeout_id_t tid;
3895
3896 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3897
3898 /* First cancel any running timer */
3899 mutex_enter(&ncec->ncec_lock);
3900 tid = ncec->ncec_timeout_id;
3901 ncec->ncec_timeout_id = 0;
3902 if (tid != 0) {
3903 mutex_exit(&ncec->ncec_lock);
3904 (void) untimeout(tid);
3905 mutex_enter(&ncec->ncec_lock);
3906 }
3907
3908 /* Restart timer */
3909 nce_start_timer(ncec, ms);
3910 mutex_exit(&ncec->ncec_lock);
3911 }
3912
3913 static void
nce_start_timer(ncec_t * ncec,uint_t ms)3914 nce_start_timer(ncec_t *ncec, uint_t ms)
3915 {
3916 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3917 /*
3918 * Don't start the timer if the ncec has been deleted, or if the timer
3919 * is already running
3920 */
3921 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3922 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3923 MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3924 }
3925 }
3926
3927 int
nce_set_multicast_v4(ill_t * ill,const in_addr_t * dst,uint16_t flags,nce_t ** newnce)3928 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3929 uint16_t flags, nce_t **newnce)
3930 {
3931 uchar_t *hw_addr;
3932 int err = 0;
3933 ip_stack_t *ipst = ill->ill_ipst;
3934 in6_addr_t dst6;
3935 nce_t *nce;
3936
3937 ASSERT(!ill->ill_isv6);
3938
3939 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3940 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3941 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3942 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3943 goto done;
3944 }
3945 if (ill->ill_net_type == IRE_IF_RESOLVER) {
3946 /*
3947 * For IRE_IF_RESOLVER a hardware mapping can be
3948 * generated, for IRE_IF_NORESOLVER, resolution cookie
3949 * in the ill is copied in nce_add_v4().
3950 */
3951 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3952 if (hw_addr == NULL) {
3953 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3954 return (ENOMEM);
3955 }
3956 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3957 } else {
3958 /*
3959 * IRE_IF_NORESOLVER type simply copies the resolution
3960 * cookie passed in. So no hw_addr is needed.
3961 */
3962 hw_addr = NULL;
3963 }
3964 ASSERT(flags & NCE_F_MCAST);
3965 ASSERT(flags & NCE_F_NONUD);
3966 /* nce_state will be computed by nce_add_common() */
3967 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3968 ND_UNCHANGED, &nce);
3969 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3970 if (err == 0)
3971 err = nce_add_v4_postprocess(nce);
3972 if (hw_addr != NULL)
3973 kmem_free(hw_addr, ill->ill_phys_addr_length);
3974 if (err != 0) {
3975 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3976 return (err);
3977 }
3978 done:
3979 if (newnce != NULL)
3980 *newnce = nce;
3981 else
3982 nce_refrele(nce);
3983 return (0);
3984 }
3985
3986 /*
3987 * This is used when scanning for "old" (least recently broadcast) NCEs. We
3988 * don't want to have to walk the list for every single one, so we gather up
3989 * batches at a time.
3990 */
3991 #define NCE_RESCHED_LIST_LEN 8
3992
3993 typedef struct {
3994 ill_t *ncert_ill;
3995 uint_t ncert_num;
3996 ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN];
3997 } nce_resched_t;
3998
3999 /*
4000 * Pick the longest waiting NCEs for defense.
4001 */
4002 /* ARGSUSED */
4003 static int
ncec_reschedule(ill_t * ill,nce_t * nce,void * arg)4004 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4005 {
4006 nce_resched_t *ncert = arg;
4007 ncec_t **ncecs;
4008 ncec_t **ncec_max;
4009 ncec_t *ncec_temp;
4010 ncec_t *ncec = nce->nce_common;
4011
4012 ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4013 /*
4014 * Only reachable entries that are ready for announcement are eligible.
4015 */
4016 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4017 return (0);
4018 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4019 ncec_refhold(ncec);
4020 ncert->ncert_nces[ncert->ncert_num++] = ncec;
4021 } else {
4022 ncecs = ncert->ncert_nces;
4023 ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4024 ncec_refhold(ncec);
4025 for (; ncecs < ncec_max; ncecs++) {
4026 ASSERT(ncec != NULL);
4027 if ((*ncecs)->ncec_last_time_defended >
4028 ncec->ncec_last_time_defended) {
4029 ncec_temp = *ncecs;
4030 *ncecs = ncec;
4031 ncec = ncec_temp;
4032 }
4033 }
4034 ncec_refrele(ncec);
4035 }
4036 return (0);
4037 }
4038
4039 /*
4040 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
4041 * doesn't happen very often (if at all), and thus it needn't be highly
4042 * optimized. (Note, though, that it's actually O(N) complexity, because the
4043 * outer loop is bounded by a constant rather than by the length of the list.)
4044 */
4045 static void
nce_ill_reschedule(ill_t * ill,nce_resched_t * ncert)4046 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4047 {
4048 ncec_t *ncec;
4049 ip_stack_t *ipst = ill->ill_ipst;
4050 uint_t i, defend_rate;
4051
4052 i = ill->ill_defend_count;
4053 ill->ill_defend_count = 0;
4054 if (ill->ill_isv6)
4055 defend_rate = ipst->ips_ndp_defend_rate;
4056 else
4057 defend_rate = ipst->ips_arp_defend_rate;
4058 /* If none could be sitting around, then don't reschedule */
4059 if (i < defend_rate) {
4060 DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4061 return;
4062 }
4063 ncert->ncert_ill = ill;
4064 while (ill->ill_defend_count < defend_rate) {
4065 nce_walk_common(ill, ncec_reschedule, ncert);
4066 for (i = 0; i < ncert->ncert_num; i++) {
4067
4068 ncec = ncert->ncert_nces[i];
4069 mutex_enter(&ncec->ncec_lock);
4070 ncec->ncec_flags |= NCE_F_DELAYED;
4071 mutex_exit(&ncec->ncec_lock);
4072 /*
4073 * we plan to schedule this ncec, so incr the
4074 * defend_count in anticipation.
4075 */
4076 if (++ill->ill_defend_count >= defend_rate)
4077 break;
4078 }
4079 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4080 break;
4081 }
4082 }
4083
4084 /*
4085 * Check if the current rate-limiting parameters permit the sending
4086 * of another address defense announcement for both IPv4 and IPv6.
4087 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4088 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4089 * determines how many address defense announcements are permitted
4090 * in any `defense_perio' interval.
4091 */
4092 static boolean_t
ill_defend_rate_limit(ill_t * ill,ncec_t * ncec)4093 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4094 {
4095 clock_t now = ddi_get_lbolt();
4096 ip_stack_t *ipst = ill->ill_ipst;
4097 clock_t start = ill->ill_defend_start;
4098 uint32_t elapsed, defend_period, defend_rate;
4099 nce_resched_t ncert;
4100 boolean_t ret;
4101 int i;
4102
4103 if (ill->ill_isv6) {
4104 defend_period = ipst->ips_ndp_defend_period;
4105 defend_rate = ipst->ips_ndp_defend_rate;
4106 } else {
4107 defend_period = ipst->ips_arp_defend_period;
4108 defend_rate = ipst->ips_arp_defend_rate;
4109 }
4110 if (defend_rate == 0)
4111 return (B_TRUE);
4112 bzero(&ncert, sizeof (ncert));
4113 mutex_enter(&ill->ill_lock);
4114 if (start > 0) {
4115 elapsed = now - start;
4116 if (elapsed > SEC_TO_TICK(defend_period)) {
4117 ill->ill_defend_start = now;
4118 /*
4119 * nce_ill_reschedule will attempt to
4120 * prevent starvation by reschduling the
4121 * oldest entries, which are marked with
4122 * the NCE_F_DELAYED flag.
4123 */
4124 nce_ill_reschedule(ill, &ncert);
4125 }
4126 } else {
4127 ill->ill_defend_start = now;
4128 }
4129 ASSERT(ill->ill_defend_count <= defend_rate);
4130 mutex_enter(&ncec->ncec_lock);
4131 if (ncec->ncec_flags & NCE_F_DELAYED) {
4132 /*
4133 * This ncec was rescheduled as one of the really old
4134 * entries needing on-going defense. The
4135 * ill_defend_count was already incremented in
4136 * nce_ill_reschedule. Go ahead and send the announce.
4137 */
4138 ncec->ncec_flags &= ~NCE_F_DELAYED;
4139 mutex_exit(&ncec->ncec_lock);
4140 ret = B_FALSE;
4141 goto done;
4142 }
4143 mutex_exit(&ncec->ncec_lock);
4144 if (ill->ill_defend_count < defend_rate)
4145 ill->ill_defend_count++;
4146 if (ill->ill_defend_count == defend_rate) {
4147 /*
4148 * we are no longer allowed to send unbidden defense
4149 * messages. Wait for rescheduling.
4150 */
4151 ret = B_TRUE;
4152 } else {
4153 ret = B_FALSE;
4154 }
4155 done:
4156 mutex_exit(&ill->ill_lock);
4157 /*
4158 * After all the locks have been dropped we can restart nce timer,
4159 * and refrele the delayed ncecs
4160 */
4161 for (i = 0; i < ncert.ncert_num; i++) {
4162 clock_t xmit_interval;
4163 ncec_t *tmp;
4164
4165 tmp = ncert.ncert_nces[i];
4166 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4167 B_FALSE);
4168 nce_restart_timer(tmp, xmit_interval);
4169 ncec_refrele(tmp);
4170 }
4171 return (ret);
4172 }
4173
4174 boolean_t
ndp_announce(ncec_t * ncec)4175 ndp_announce(ncec_t *ncec)
4176 {
4177 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4178 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4179 nce_advert_flags(ncec)));
4180 }
4181
4182 ill_t *
nce_resolve_src(ncec_t * ncec,in6_addr_t * src)4183 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4184 {
4185 mblk_t *mp;
4186 in6_addr_t src6;
4187 ipaddr_t src4;
4188 ill_t *ill = ncec->ncec_ill;
4189 ill_t *src_ill = NULL;
4190 ipif_t *ipif = NULL;
4191 boolean_t is_myaddr = NCE_MYADDR(ncec);
4192 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4193
4194 ASSERT(src != NULL);
4195 ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4196 src6 = *src;
4197 if (is_myaddr) {
4198 src6 = ncec->ncec_addr;
4199 if (!isv6)
4200 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4201 } else {
4202 /*
4203 * try to find one from the outgoing packet.
4204 */
4205 mutex_enter(&ncec->ncec_lock);
4206 mp = ncec->ncec_qd_mp;
4207 if (mp != NULL) {
4208 if (isv6) {
4209 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4210
4211 src6 = ip6h->ip6_src;
4212 } else {
4213 ipha_t *ipha = (ipha_t *)mp->b_rptr;
4214
4215 src4 = ipha->ipha_src;
4216 IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4217 }
4218 }
4219 mutex_exit(&ncec->ncec_lock);
4220 }
4221
4222 /*
4223 * For outgoing packets, if the src of outgoing packet is one
4224 * of the assigned interface addresses use it, otherwise we
4225 * will pick the source address below.
4226 * For local addresses (is_myaddr) doing DAD, NDP announce
4227 * messages are mcast. So we use the (IPMP) cast_ill or the
4228 * (non-IPMP) ncec_ill for these message types. The only case
4229 * of unicast DAD messages are for IPv6 ND probes, for which
4230 * we find the ipif_bound_ill corresponding to the ncec_addr.
4231 */
4232 if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4233 if (isv6) {
4234 ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4235 ill->ill_ipst);
4236 } else {
4237 ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4238 ill->ill_ipst);
4239 }
4240
4241 /*
4242 * If no relevant ipif can be found, then it's not one of our
4243 * addresses. Reset to :: and try to find a src for the NS or
4244 * ARP request using ipif_select_source_v[4,6] below.
4245 * If an ipif can be found, but it's not yet done with
4246 * DAD verification, and we are not being invoked for
4247 * DAD (i.e., !is_myaddr), then just postpone this
4248 * transmission until later.
4249 */
4250 if (ipif == NULL) {
4251 src6 = ipv6_all_zeros;
4252 src4 = INADDR_ANY;
4253 } else if (!ipif->ipif_addr_ready && !is_myaddr) {
4254 DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4255 ncec_t *, ncec, ipif_t *, ipif);
4256 ipif_refrele(ipif);
4257 return (NULL);
4258 }
4259 }
4260
4261 if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4262 /*
4263 * Pick a source address for this solicitation, but
4264 * restrict the selection to addresses assigned to the
4265 * output interface. We do this because the destination will
4266 * create a neighbor cache entry for the source address of
4267 * this packet, so the source address had better be a valid
4268 * neighbor.
4269 */
4270 if (isv6) {
4271 ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4272 B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4273 B_FALSE, NULL);
4274 } else {
4275 ipaddr_t nce_addr;
4276
4277 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4278 ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4279 B_FALSE, NULL);
4280 }
4281 if (ipif == NULL && IS_IPMP(ill)) {
4282 ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4283
4284 if (send_ill != NULL) {
4285 if (isv6) {
4286 ipif = ipif_select_source_v6(send_ill,
4287 &ncec->ncec_addr, B_TRUE,
4288 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4289 B_FALSE, NULL);
4290 } else {
4291 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4292 src4);
4293 ipif = ipif_select_source_v4(send_ill,
4294 src4, ALL_ZONES, B_TRUE, NULL);
4295 }
4296 ill_refrele(send_ill);
4297 }
4298 }
4299
4300 if (ipif == NULL) {
4301 char buf[INET6_ADDRSTRLEN];
4302
4303 ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4304 inet_ntop((isv6 ? AF_INET6 : AF_INET),
4305 (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4306 DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4307 return (NULL);
4308 }
4309 src6 = ipif->ipif_v6lcl_addr;
4310 }
4311 *src = src6;
4312 if (ipif != NULL) {
4313 src_ill = ipif->ipif_ill;
4314 if (IS_IPMP(src_ill))
4315 src_ill = ipmp_ipif_hold_bound_ill(ipif);
4316 else
4317 ill_refhold(src_ill);
4318 ipif_refrele(ipif);
4319 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4320 ill_t *, src_ill);
4321 }
4322 return (src_ill);
4323 }
4324
4325 void
ip_nce_lookup_and_update(ipaddr_t * addr,ipif_t * ipif,ip_stack_t * ipst,uchar_t * hwaddr,int hwaddr_len,int flags)4326 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4327 uchar_t *hwaddr, int hwaddr_len, int flags)
4328 {
4329 ill_t *ill;
4330 ncec_t *ncec;
4331 nce_t *nce;
4332 uint16_t new_state;
4333
4334 ill = (ipif ? ipif->ipif_ill : NULL);
4335 if (ill != NULL) {
4336 /*
4337 * only one ncec is possible
4338 */
4339 nce = nce_lookup_v4(ill, addr);
4340 if (nce != NULL) {
4341 ncec = nce->nce_common;
4342 mutex_enter(&ncec->ncec_lock);
4343 if (NCE_ISREACHABLE(ncec))
4344 new_state = ND_UNCHANGED;
4345 else
4346 new_state = ND_STALE;
4347 ncec->ncec_flags = flags;
4348 nce_update(ncec, new_state, hwaddr);
4349 mutex_exit(&ncec->ncec_lock);
4350 nce_refrele(nce);
4351 return;
4352 }
4353 } else {
4354 /*
4355 * ill is wildcard; clean up all ncec's and ire's
4356 * that match on addr.
4357 */
4358 nce_hw_map_t hwm;
4359
4360 hwm.hwm_addr = *addr;
4361 hwm.hwm_hwlen = hwaddr_len;
4362 hwm.hwm_hwaddr = hwaddr;
4363 hwm.hwm_flags = flags;
4364
4365 ncec_walk_common(ipst->ips_ndp4, NULL,
4366 (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4367 }
4368 }
4369
4370 /*
4371 * Common function to add ncec entries.
4372 * we always add the ncec with ncec_ill == ill, and always create
4373 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4374 * ncec is !reachable.
4375 *
4376 * When the caller passes in an nce_state of ND_UNCHANGED,
4377 * nce_add_common() will determine the state of the created nce based
4378 * on the ill_net_type and nce_flags used. Otherwise, the nce will
4379 * be created with state set to the passed in nce_state.
4380 */
4381 static int
nce_add_common(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * addr,uint16_t flags,uint16_t nce_state,nce_t ** retnce)4382 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4383 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4384 {
4385 static ncec_t nce_nil;
4386 uchar_t *template = NULL;
4387 int err;
4388 ncec_t *ncec;
4389 ncec_t **ncep;
4390 ip_stack_t *ipst = ill->ill_ipst;
4391 uint16_t state;
4392 boolean_t fastprobe = B_FALSE;
4393 struct ndp_g_s *ndp;
4394 nce_t *nce = NULL;
4395 mblk_t *dlur_mp = NULL;
4396
4397 if (ill->ill_isv6)
4398 ndp = ill->ill_ipst->ips_ndp6;
4399 else
4400 ndp = ill->ill_ipst->ips_ndp4;
4401
4402 *retnce = NULL;
4403
4404 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4405
4406 if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4407 ip0dbg(("nce_add_common: no addr\n"));
4408 return (EINVAL);
4409 }
4410 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4411 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4412 return (EINVAL);
4413 }
4414
4415 if (ill->ill_isv6) {
4416 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4417 } else {
4418 ipaddr_t v4addr;
4419
4420 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4421 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4422 }
4423
4424 /*
4425 * The caller has ensured that there is no nce on ill, but there could
4426 * still be an nce_common_t for the address, so that we find exisiting
4427 * ncec_t strucutures first, and atomically add a new nce_t if
4428 * one is found. The ndp_g_lock ensures that we don't cross threads
4429 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4430 * compare for matches across the illgrp because this function is
4431 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4432 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4433 * appropriate.
4434 */
4435 ncec = *ncep;
4436 for (; ncec != NULL; ncec = ncec->ncec_next) {
4437 if (ncec->ncec_ill == ill) {
4438 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4439 /*
4440 * We should never find *retnce to be
4441 * MYADDR, since the caller may then
4442 * incorrectly restart a DAD timer that's
4443 * already running. However, if we are in
4444 * forwarding mode, and the interface is
4445 * moving in/out of groups, the data
4446 * path ire lookup (e.g., ire_revalidate_nce)
4447 * may have determined that some destination
4448 * is offlink while the control path is adding
4449 * that address as a local address.
4450 * Recover from this case by failing the
4451 * lookup
4452 */
4453 if (NCE_MYADDR(ncec))
4454 return (ENXIO);
4455 *retnce = nce_ill_lookup_then_add(ill, ncec);
4456 if (*retnce != NULL)
4457 break;
4458 }
4459 }
4460 }
4461 if (*retnce != NULL) /* caller must trigger fastpath on nce */
4462 return (0);
4463
4464 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4465 if (ncec == NULL)
4466 return (ENOMEM);
4467 *ncec = nce_nil;
4468 ncec->ncec_ill = ill;
4469 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4470 ncec->ncec_flags = flags;
4471 ncec->ncec_ipst = ipst; /* No netstack_hold */
4472
4473 if (!ill->ill_isv6) {
4474 ipaddr_t addr4;
4475
4476 /*
4477 * DAD probe interval and probe count are set based on
4478 * fast/slow probe settings. If the underlying link doesn't
4479 * have reliably up/down notifications or if we're working
4480 * with IPv4 169.254.0.0/16 Link Local Address space, then
4481 * don't use the fast timers. Otherwise, use them.
4482 */
4483 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4484 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4485 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4486 fastprobe = B_TRUE;
4487 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4488 !IS_IPV4_LL_SPACE(&addr4)) {
4489 ill_t *hwaddr_ill;
4490
4491 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4492 hw_addr_len);
4493 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4494 fastprobe = B_TRUE;
4495 }
4496 if (fastprobe) {
4497 ncec->ncec_xmit_interval =
4498 ipst->ips_arp_fastprobe_interval;
4499 ncec->ncec_pcnt =
4500 ipst->ips_arp_fastprobe_count;
4501 ncec->ncec_flags |= NCE_F_FAST;
4502 } else {
4503 ncec->ncec_xmit_interval =
4504 ipst->ips_arp_probe_interval;
4505 ncec->ncec_pcnt =
4506 ipst->ips_arp_probe_count;
4507 }
4508 if (NCE_PUBLISH(ncec)) {
4509 ncec->ncec_unsolicit_count =
4510 ipst->ips_ip_arp_publish_count;
4511 }
4512 } else {
4513 /*
4514 * probe interval is constant: ILL_PROBE_INTERVAL
4515 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4516 */
4517 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4518 if (NCE_PUBLISH(ncec)) {
4519 ncec->ncec_unsolicit_count =
4520 ipst->ips_ip_ndp_unsolicit_count;
4521 }
4522 }
4523 ncec->ncec_rcnt = ill->ill_xmit_count;
4524 ncec->ncec_addr = *addr;
4525 ncec->ncec_qd_mp = NULL;
4526 ncec->ncec_refcnt = 1; /* for ncec getting created */
4527 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4528 ncec->ncec_trace_disable = B_FALSE;
4529
4530 /*
4531 * ncec_lladdr holds link layer address
4532 */
4533 if (hw_addr_len > 0) {
4534 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4535 if (template == NULL) {
4536 err = ENOMEM;
4537 goto err_ret;
4538 }
4539 ncec->ncec_lladdr = template;
4540 ncec->ncec_lladdr_length = hw_addr_len;
4541 bzero(ncec->ncec_lladdr, hw_addr_len);
4542 }
4543 if ((flags & NCE_F_BCAST) != 0) {
4544 state = ND_REACHABLE;
4545 ASSERT(hw_addr_len > 0);
4546 } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4547 state = ND_INITIAL;
4548 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4549 /*
4550 * NORESOLVER entries are always created in the REACHABLE
4551 * state.
4552 */
4553 state = ND_REACHABLE;
4554 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4555 ill->ill_mactype != DL_IPV4 &&
4556 ill->ill_mactype != DL_6TO4) {
4557 /*
4558 * We create a nce_res_mp with the IP nexthop address
4559 * as the destination address if the physical length
4560 * is exactly 4 bytes for point-to-multipoint links
4561 * that do their own resolution from IP to link-layer
4562 * address (e.g. IP over X.25).
4563 */
4564 bcopy((uchar_t *)addr,
4565 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4566 }
4567 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4568 ill->ill_mactype != DL_IPV6) {
4569 /*
4570 * We create a nce_res_mp with the IP nexthop address
4571 * as the destination address if the physical legnth
4572 * is exactly 16 bytes for point-to-multipoint links
4573 * that do their own resolution from IP to link-layer
4574 * address.
4575 */
4576 bcopy((uchar_t *)addr,
4577 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4578 }
4579 /*
4580 * Since NUD is not part of the base IPv4 protocol definition,
4581 * IPv4 neighbor entries on NORESOLVER interfaces will never
4582 * age, and are marked NCE_F_NONUD.
4583 */
4584 if (!ill->ill_isv6)
4585 ncec->ncec_flags |= NCE_F_NONUD;
4586 } else if (ill->ill_net_type == IRE_LOOPBACK) {
4587 state = ND_REACHABLE;
4588 }
4589
4590 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4591 /*
4592 * We are adding an ncec with a deterministic hw_addr,
4593 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4594 *
4595 * if we are adding a unicast ncec for the local address
4596 * it would be REACHABLE; we would be adding a ND_STALE entry
4597 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4598 * addresses are added in PROBE to trigger DAD.
4599 */
4600 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4601 ill->ill_net_type == IRE_IF_NORESOLVER)
4602 state = ND_REACHABLE;
4603 else if (!NCE_PUBLISH(ncec))
4604 state = ND_STALE;
4605 else
4606 state = ND_PROBE;
4607 if (hw_addr != NULL)
4608 nce_set_ll(ncec, hw_addr);
4609 }
4610 /* caller overrides internally computed state */
4611 if (nce_state != ND_UNCHANGED)
4612 state = nce_state;
4613
4614 if (state == ND_PROBE)
4615 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4616
4617 ncec->ncec_state = state;
4618
4619 if (state == ND_REACHABLE) {
4620 ncec->ncec_last = ncec->ncec_init_time =
4621 TICK_TO_MSEC(ddi_get_lbolt64());
4622 } else {
4623 ncec->ncec_last = 0;
4624 if (state == ND_INITIAL)
4625 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4626 }
4627 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4628 offsetof(ncec_cb_t, ncec_cb_node));
4629 /*
4630 * have all the memory allocations out of the way before taking locks
4631 * and adding the nce.
4632 */
4633 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4634 if (nce == NULL) {
4635 err = ENOMEM;
4636 goto err_ret;
4637 }
4638 if (ncec->ncec_lladdr != NULL ||
4639 ill->ill_net_type == IRE_IF_NORESOLVER) {
4640 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4641 ill->ill_phys_addr_length, ill->ill_sap,
4642 ill->ill_sap_length);
4643 if (dlur_mp == NULL) {
4644 err = ENOMEM;
4645 goto err_ret;
4646 }
4647 }
4648
4649 /*
4650 * Atomically ensure that the ill is not CONDEMNED, before
4651 * adding the NCE.
4652 */
4653 mutex_enter(&ill->ill_lock);
4654 if (ill->ill_state_flags & ILL_CONDEMNED) {
4655 mutex_exit(&ill->ill_lock);
4656 err = EINVAL;
4657 goto err_ret;
4658 }
4659 if (!NCE_MYADDR(ncec) &&
4660 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4661 mutex_exit(&ill->ill_lock);
4662 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4663 err = EINVAL;
4664 goto err_ret;
4665 }
4666 /*
4667 * Acquire the ncec_lock even before adding the ncec to the list
4668 * so that it cannot get deleted after the ncec is added, but
4669 * before we add the nce.
4670 */
4671 mutex_enter(&ncec->ncec_lock);
4672 if ((ncec->ncec_next = *ncep) != NULL)
4673 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4674 *ncep = ncec;
4675 ncec->ncec_ptpn = ncep;
4676
4677 /* Bump up the number of ncec's referencing this ill */
4678 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4679 (char *), "ncec", (void *), ncec);
4680 ill->ill_ncec_cnt++;
4681 /*
4682 * Since we hold the ncec_lock at this time, the ncec cannot be
4683 * condemned, and we can safely add the nce.
4684 */
4685 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4686 mutex_exit(&ncec->ncec_lock);
4687 mutex_exit(&ill->ill_lock);
4688
4689 /* caller must trigger fastpath on *retnce */
4690 return (0);
4691
4692 err_ret:
4693 if (ncec != NULL)
4694 kmem_cache_free(ncec_cache, ncec);
4695 if (nce != NULL)
4696 kmem_cache_free(nce_cache, nce);
4697 freemsg(dlur_mp);
4698 if (template != NULL)
4699 kmem_free(template, ill->ill_phys_addr_length);
4700 return (err);
4701 }
4702
4703 /*
4704 * take a ref on the nce
4705 */
4706 void
nce_refhold(nce_t * nce)4707 nce_refhold(nce_t *nce)
4708 {
4709 mutex_enter(&nce->nce_lock);
4710 nce->nce_refcnt++;
4711 ASSERT((nce)->nce_refcnt != 0);
4712 mutex_exit(&nce->nce_lock);
4713 }
4714
4715 /*
4716 * release a ref on the nce; In general, this
4717 * cannot be called with locks held because nce_inactive
4718 * may result in nce_inactive which will take the ill_lock,
4719 * do ipif_ill_refrele_tail etc. Thus the one exception
4720 * where this can be called with locks held is when the caller
4721 * is certain that the nce_refcnt is sufficient to prevent
4722 * the invocation of nce_inactive.
4723 */
4724 void
nce_refrele(nce_t * nce)4725 nce_refrele(nce_t *nce)
4726 {
4727 ASSERT((nce)->nce_refcnt != 0);
4728 mutex_enter(&nce->nce_lock);
4729 if (--nce->nce_refcnt == 0)
4730 nce_inactive(nce); /* destroys the mutex */
4731 else
4732 mutex_exit(&nce->nce_lock);
4733 }
4734
4735 /*
4736 * free the nce after all refs have gone away.
4737 */
4738 static void
nce_inactive(nce_t * nce)4739 nce_inactive(nce_t *nce)
4740 {
4741 ill_t *ill = nce->nce_ill;
4742
4743 ASSERT(nce->nce_refcnt == 0);
4744
4745 ncec_refrele_notr(nce->nce_common);
4746 nce->nce_common = NULL;
4747 freemsg(nce->nce_fp_mp);
4748 freemsg(nce->nce_dlur_mp);
4749
4750 mutex_enter(&ill->ill_lock);
4751 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4752 (char *), "nce", (void *), nce);
4753 ill->ill_nce_cnt--;
4754 nce->nce_ill = NULL;
4755 /*
4756 * If the number of ncec's associated with this ill have dropped
4757 * to zero, check whether we need to restart any operation that
4758 * is waiting for this to happen.
4759 */
4760 if (ILL_DOWN_OK(ill)) {
4761 /* ipif_ill_refrele_tail drops the ill_lock */
4762 ipif_ill_refrele_tail(ill);
4763 } else {
4764 mutex_exit(&ill->ill_lock);
4765 }
4766
4767 mutex_destroy(&nce->nce_lock);
4768 kmem_cache_free(nce_cache, nce);
4769 }
4770
4771 /*
4772 * Add an nce to the ill_nce list.
4773 */
4774 static nce_t *
nce_add_impl(ill_t * ill,ncec_t * ncec,nce_t * nce,mblk_t * dlur_mp)4775 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4776 {
4777 bzero(nce, sizeof (*nce));
4778 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4779 nce->nce_common = ncec;
4780 nce->nce_addr = ncec->ncec_addr;
4781 nce->nce_ill = ill;
4782 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4783 (char *), "nce", (void *), nce);
4784 ill->ill_nce_cnt++;
4785
4786 nce->nce_refcnt = 1; /* for the thread */
4787 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4788 nce->nce_dlur_mp = dlur_mp;
4789
4790 /* add nce to the ill's fastpath list. */
4791 nce->nce_refcnt++; /* for the list */
4792 list_insert_head(&ill->ill_nce, nce);
4793 return (nce);
4794 }
4795
4796 static nce_t *
nce_add(ill_t * ill,ncec_t * ncec)4797 nce_add(ill_t *ill, ncec_t *ncec)
4798 {
4799 nce_t *nce;
4800 mblk_t *dlur_mp = NULL;
4801
4802 ASSERT(MUTEX_HELD(&ill->ill_lock));
4803 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4804
4805 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4806 if (nce == NULL)
4807 return (NULL);
4808 if (ncec->ncec_lladdr != NULL ||
4809 ill->ill_net_type == IRE_IF_NORESOLVER) {
4810 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4811 ill->ill_phys_addr_length, ill->ill_sap,
4812 ill->ill_sap_length);
4813 if (dlur_mp == NULL) {
4814 kmem_cache_free(nce_cache, nce);
4815 return (NULL);
4816 }
4817 }
4818 return (nce_add_impl(ill, ncec, nce, dlur_mp));
4819 }
4820
4821 /*
4822 * remove the nce from the ill_faspath list
4823 */
4824 void
nce_delete(nce_t * nce)4825 nce_delete(nce_t *nce)
4826 {
4827 ill_t *ill = nce->nce_ill;
4828
4829 ASSERT(MUTEX_HELD(&ill->ill_lock));
4830
4831 mutex_enter(&nce->nce_lock);
4832 if (nce->nce_is_condemned) {
4833 /*
4834 * some other thread has removed this nce from the ill_nce list
4835 */
4836 mutex_exit(&nce->nce_lock);
4837 return;
4838 }
4839 nce->nce_is_condemned = B_TRUE;
4840 mutex_exit(&nce->nce_lock);
4841
4842 list_remove(&ill->ill_nce, nce);
4843 /*
4844 * even though we are holding the ill_lock, it is ok to
4845 * call nce_refrele here because we know that we should have
4846 * at least 2 refs on the nce: one for the thread, and one
4847 * for the list. The refrele below will release the one for
4848 * the list.
4849 */
4850 nce_refrele(nce);
4851 }
4852
4853 nce_t *
nce_lookup(ill_t * ill,const in6_addr_t * addr)4854 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4855 {
4856 nce_t *nce = NULL;
4857
4858 ASSERT(ill != NULL);
4859 ASSERT(MUTEX_HELD(&ill->ill_lock));
4860
4861 for (nce = list_head(&ill->ill_nce); nce != NULL;
4862 nce = list_next(&ill->ill_nce, nce)) {
4863 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4864 break;
4865 }
4866
4867 /*
4868 * if we found the nce on the ill_nce list while holding
4869 * the ill_lock, then it cannot be condemned yet.
4870 */
4871 if (nce != NULL) {
4872 ASSERT(!nce->nce_is_condemned);
4873 nce_refhold(nce);
4874 }
4875 return (nce);
4876 }
4877
4878 /*
4879 * Walk the ill_nce list on ill. The callback function func() cannot perform
4880 * any destructive actions.
4881 */
4882 static void
nce_walk_common(ill_t * ill,pfi_t func,void * arg)4883 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4884 {
4885 nce_t *nce = NULL, *nce_next;
4886
4887 ASSERT(MUTEX_HELD(&ill->ill_lock));
4888 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4889 nce_next = list_next(&ill->ill_nce, nce);
4890 if (func(ill, nce, arg) != 0)
4891 break;
4892 nce = nce_next;
4893 }
4894 }
4895
4896 void
nce_walk(ill_t * ill,pfi_t func,void * arg)4897 nce_walk(ill_t *ill, pfi_t func, void *arg)
4898 {
4899 mutex_enter(&ill->ill_lock);
4900 nce_walk_common(ill, func, arg);
4901 mutex_exit(&ill->ill_lock);
4902 }
4903
4904 void
nce_flush(ill_t * ill,boolean_t flushall)4905 nce_flush(ill_t *ill, boolean_t flushall)
4906 {
4907 nce_t *nce, *nce_next;
4908 list_t dead;
4909
4910 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4911 mutex_enter(&ill->ill_lock);
4912 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4913 nce_next = list_next(&ill->ill_nce, nce);
4914 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4915 nce = nce_next;
4916 continue;
4917 }
4918 /*
4919 * nce_delete requires that the caller should either not
4920 * be holding locks, or should hold a ref to ensure that
4921 * we wont hit ncec_inactive. So take a ref and clean up
4922 * after the list is flushed.
4923 */
4924 nce_refhold(nce);
4925 nce_delete(nce);
4926 list_insert_tail(&dead, nce);
4927 nce = nce_next;
4928 }
4929 mutex_exit(&ill->ill_lock);
4930 while ((nce = list_head(&dead)) != NULL) {
4931 list_remove(&dead, nce);
4932 nce_refrele(nce);
4933 }
4934 ASSERT(list_is_empty(&dead));
4935 list_destroy(&dead);
4936 }
4937
4938 /* Return an interval that is anywhere in the [1 .. intv] range */
4939 static clock_t
nce_fuzz_interval(clock_t intv,boolean_t initial_time)4940 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4941 {
4942 clock_t rnd, frac;
4943
4944 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4945 /* Note that clock_t is signed; must chop off bits */
4946 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4947 if (initial_time) {
4948 if (intv <= 0)
4949 intv = 1;
4950 else
4951 intv = (rnd % intv) + 1;
4952 } else {
4953 /* Compute 'frac' as 20% of the configured interval */
4954 if ((frac = intv / 5) <= 1)
4955 frac = 2;
4956 /* Set intv randomly in the range [intv-frac .. intv+frac] */
4957 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4958 intv = 1;
4959 }
4960 return (intv);
4961 }
4962
4963 void
nce_resolv_ipmp_ok(ncec_t * ncec)4964 nce_resolv_ipmp_ok(ncec_t *ncec)
4965 {
4966 mblk_t *mp;
4967 uint_t pkt_len;
4968 iaflags_t ixaflags = IXAF_NO_TRACE;
4969 nce_t *under_nce;
4970 ill_t *ill = ncec->ncec_ill;
4971 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4972 ipif_t *src_ipif = NULL;
4973 ip_stack_t *ipst = ill->ill_ipst;
4974 ill_t *send_ill;
4975 uint_t nprobes;
4976
4977 ASSERT(IS_IPMP(ill));
4978
4979 mutex_enter(&ncec->ncec_lock);
4980 nprobes = ncec->ncec_nprobes;
4981 mp = ncec->ncec_qd_mp;
4982 ncec->ncec_qd_mp = NULL;
4983 ncec->ncec_nprobes = 0;
4984 mutex_exit(&ncec->ncec_lock);
4985
4986 while (mp != NULL) {
4987 mblk_t *nxt_mp;
4988
4989 nxt_mp = mp->b_next;
4990 mp->b_next = NULL;
4991 if (isv6) {
4992 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4993
4994 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4995 src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4996 ill, ALL_ZONES, ipst);
4997 } else {
4998 ipha_t *ipha = (ipha_t *)mp->b_rptr;
4999
5000 ixaflags |= IXAF_IS_IPV4;
5001 pkt_len = ntohs(ipha->ipha_length);
5002 src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5003 ill, ALL_ZONES, ipst);
5004 }
5005
5006 /*
5007 * find a new nce based on an under_ill. The first IPMP probe
5008 * packet gets queued, so we could still find a src_ipif that
5009 * matches an IPMP test address.
5010 */
5011 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5012 /*
5013 * if src_ipif is null, this could be either a
5014 * forwarded packet or a probe whose src got deleted.
5015 * We identify the former case by looking for the
5016 * ncec_nprobes: the first ncec_nprobes packets are
5017 * probes;
5018 */
5019 if (src_ipif == NULL && nprobes > 0)
5020 goto drop_pkt;
5021
5022 /*
5023 * For forwarded packets, we use the ipmp rotor
5024 * to find send_ill.
5025 */
5026 send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5027 B_TRUE);
5028 } else {
5029 send_ill = src_ipif->ipif_ill;
5030 ill_refhold(send_ill);
5031 }
5032
5033 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5034 (ncec_t *), ncec, (ipif_t *),
5035 src_ipif, (ill_t *), send_ill);
5036
5037 if (send_ill == NULL) {
5038 if (src_ipif != NULL)
5039 ipif_refrele(src_ipif);
5040 goto drop_pkt;
5041 }
5042 /* create an under_nce on send_ill */
5043 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5044 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5045 under_nce = nce_fastpath_create(send_ill, ncec);
5046 else
5047 under_nce = NULL;
5048 rw_exit(&ipst->ips_ill_g_lock);
5049 if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5050 nce_fastpath_trigger(under_nce);
5051
5052 ill_refrele(send_ill);
5053 if (src_ipif != NULL)
5054 ipif_refrele(src_ipif);
5055
5056 if (under_nce != NULL) {
5057 (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5058 ALL_ZONES, 0, NULL);
5059 nce_refrele(under_nce);
5060 if (nprobes > 0)
5061 nprobes--;
5062 mp = nxt_mp;
5063 continue;
5064 }
5065 drop_pkt:
5066 if (isv6) {
5067 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5068 } else {
5069 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5070 }
5071 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5072 freemsg(mp);
5073 if (nprobes > 0)
5074 nprobes--;
5075 mp = nxt_mp;
5076 }
5077 ncec_cb_dispatch(ncec); /* complete callbacks */
5078 }
5079