1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/types.h>
26 #include <sys/stream.h>
27 #include <sys/stropts.h>
28 #include <sys/strsun.h>
29 #include <sys/sysmacros.h>
30 #include <sys/errno.h>
31 #include <sys/dlpi.h>
32 #include <sys/socket.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/vtrace.h>
38 #include <sys/kmem.h>
39 #include <sys/zone.h>
40 #include <sys/ethernet.h>
41 #include <sys/sdt.h>
42 #include <sys/mac.h>
43
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/sctp_ip.h>
65 #include <inet/ip_arp.h>
66 #include <inet/ip2mac_impl.h>
67
/*
 * Select the per-stack announcement interval tunable: the NDP unsolicited
 * interval when isv6 is true, the ARP publish interval otherwise.
 * Both this macro and DEFENSE_INTERVAL expect a variable named `ipst' to be
 * in scope at the point of use.  The argument is parenthesized so that a
 * compound expression (e.g. a conditional) is evaluated as a unit.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/*
 * Select the per-stack defense interval tunable: the NDP one when isv6 is
 * true, the ARP one otherwise.
 */
#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)
78
/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * True when the first two bytes at ptr are 169 and 254.  The argument is
 * parenthesized before the cast so that pointer expressions such as `p + 1'
 * are evaluated in the caller's pointer type before being viewed as bytes.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
	((uchar_t *)(ptr))[1] == 254)
85
/*
 * NCE_EXTERNAL_FLAGS_MASK is the set of ncec_flags bits that callers are
 * permitted to hand to the ncec*add* functions.
 *
 * Of these, NCE_F_PUBLISH means that we will respond to requests for the
 * protocol address, and NCE_F_AUTHORITY means that we ignore any incoming
 * adverts for that mapping (though DAD is still performed for it).
 */
#define	NCE_EXTERNAL_FLAGS_MASK						\
	(NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC |		\
	NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | NCE_F_ANYCAST |	\
	NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST)
98
99 /*
100 * Lock ordering:
101 *
102 * ndp_g_lock -> ill_lock -> ncec_lock
103 *
104 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
105 * ncec_next. ncec_lock protects the contents of the NCE (particularly
106 * ncec_refcnt).
107 */
108
109 static void nce_cleanup_list(ncec_t *ncec);
110 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
111 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
112 ncec_t *);
113 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
114 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
115 uint16_t ncec_flags, nce_t **newnce);
116 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
117 uint16_t ncec_flags, nce_t **newnce);
118 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
119 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
120 const in6_addr_t *target, int flag);
121 static void ncec_refhold_locked(ncec_t *);
122 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
123 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
124 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
125 uint16_t, uint16_t, nce_t **);
126 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
127 static nce_t *nce_add(ill_t *, ncec_t *);
128 static void nce_inactive(nce_t *);
129 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
130 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
131 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
132 uint16_t, uint16_t, nce_t **);
133 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
134 uint16_t, uint16_t, nce_t **);
135 static int nce_add_v6_postprocess(nce_t *);
136 static int nce_add_v4_postprocess(nce_t *);
137 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
138 static clock_t nce_fuzz_interval(clock_t, boolean_t);
139 static void nce_resolv_ipmp_ok(ncec_t *);
140 static void nce_walk_common(ill_t *, pfi_t, void *);
141 static void nce_start_timer(ncec_t *, uint_t);
142 static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
143 static void nce_fastpath_trigger(nce_t *);
144 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
145
146 #ifdef DEBUG
147 static void ncec_trace_cleanup(const ncec_t *);
148 #endif
149
/*
 * Address of the hash bucket head for addr in the IPv4 (ips_ndp4) or
 * IPv6 (ips_ndp6) table of the given stack.
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&(ipst)->ips_ndp4->nce_hash_tbl[				\
	    IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])

#define	NCE_HASH_PTR_V6(ipst, addr)					\
	(&(ipst)->ips_ndp6->nce_hash_tbl[				\
	    NCE_ADDR_HASH_V6(addr, NCE_TABLE_SIZE)])
156
157 extern kmem_cache_t *ncec_cache;
158 extern kmem_cache_t *nce_cache;
159
160 /*
161 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
162 * If src_ill is not null, the ncec_addr is bound to src_ill. The
163 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
164 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
165 * IPMP cast_ill (in the IPMP case).
166 *
167 * Note that the probe interval is based on the src_ill for IPv6, and
168 * the ncec_xmit_interval for IPv4.
169 */
170 static void
nce_dad(ncec_t * ncec,ill_t * src_ill,boolean_t send_probe)171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
172 {
173 boolean_t dropped;
174 uint32_t probe_interval;
175
176 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
177 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
178 if (ncec->ncec_ipversion == IPV6_VERSION) {
179 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
180 ncec->ncec_lladdr, ncec->ncec_lladdr_length,
181 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
182 probe_interval = ILL_PROBE_INTERVAL(src_ill);
183 } else {
184 /* IPv4 DAD delay the initial probe. */
185 if (send_probe)
186 dropped = arp_probe(ncec);
187 else
188 dropped = B_TRUE;
189 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
190 !send_probe);
191 }
192 if (!dropped) {
193 mutex_enter(&ncec->ncec_lock);
194 ncec->ncec_pcnt--;
195 mutex_exit(&ncec->ncec_lock);
196 }
197 nce_restart_timer(ncec, probe_interval);
198 }
199
200 /*
201 * Compute default flags to use for an advertisement of this ncec's address.
202 */
203 static int
nce_advert_flags(const ncec_t * ncec)204 nce_advert_flags(const ncec_t *ncec)
205 {
206 int flag = 0;
207
208 if (ncec->ncec_flags & NCE_F_ISROUTER)
209 flag |= NDP_ISROUTER;
210 if (!(ncec->ncec_flags & NCE_F_ANYCAST))
211 flag |= NDP_ORIDE;
212
213 return (flag);
214 }
215
216 /*
217 * NDP Cache Entry creation routine.
218 * This routine must always be called with ndp6->ndp_g_lock held.
219 */
220 int
nce_add_v6(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
222 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
223 {
224 int err;
225 nce_t *nce;
226
227 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
228 ASSERT(ill != NULL && ill->ill_isv6);
229
230 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
231 &nce);
232 if (err != 0)
233 return (err);
234 ASSERT(newnce != NULL);
235 *newnce = nce;
236 return (err);
237 }
238
239 /*
240 * Post-processing routine to be executed after nce_add_v6(). This function
241 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
242 * and must be called without any locks held.
243 */
244 int
nce_add_v6_postprocess(nce_t * nce)245 nce_add_v6_postprocess(nce_t *nce)
246 {
247 ncec_t *ncec = nce->nce_common;
248 boolean_t dropped = B_FALSE;
249 uchar_t *hw_addr = ncec->ncec_lladdr;
250 uint_t hw_addr_len = ncec->ncec_lladdr_length;
251 ill_t *ill = ncec->ncec_ill;
252 int err = 0;
253 uint16_t flags = ncec->ncec_flags;
254 ip_stack_t *ipst = ill->ill_ipst;
255 boolean_t trigger_fastpath = B_TRUE;
256
257 /*
258 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
259 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
260 * We call nce_fastpath from nce_update if the link layer address of
261 * the peer changes from nce_update
262 */
263 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
264 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
265 trigger_fastpath = B_FALSE;
266
267 if (trigger_fastpath)
268 nce_fastpath_trigger(nce);
269 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
270 ill_t *hwaddr_ill;
271 /*
272 * Unicast entry that needs DAD.
273 */
274 if (IS_IPMP(ill)) {
275 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
276 hw_addr, hw_addr_len);
277 } else {
278 hwaddr_ill = ill;
279 }
280 nce_dad(ncec, hwaddr_ill, B_TRUE);
281 err = EINPROGRESS;
282 } else if (flags & NCE_F_UNSOL_ADV) {
283 /*
284 * We account for the transmit below by assigning one
285 * less than the ndd variable. Subsequent decrements
286 * are done in nce_timer.
287 */
288 mutex_enter(&ncec->ncec_lock);
289 ncec->ncec_unsolicit_count =
290 ipst->ips_ip_ndp_unsolicit_count - 1;
291 mutex_exit(&ncec->ncec_lock);
292 dropped = ndp_xmit(ill,
293 ND_NEIGHBOR_ADVERT,
294 hw_addr,
295 hw_addr_len,
296 &ncec->ncec_addr, /* Source and target of the adv */
297 &ipv6_all_hosts_mcast, /* Destination of the packet */
298 nce_advert_flags(ncec));
299 mutex_enter(&ncec->ncec_lock);
300 if (dropped)
301 ncec->ncec_unsolicit_count++;
302 else
303 ncec->ncec_last_time_defended = ddi_get_lbolt();
304 if (ncec->ncec_unsolicit_count != 0) {
305 nce_start_timer(ncec,
306 ipst->ips_ip_ndp_unsolicit_interval);
307 }
308 mutex_exit(&ncec->ncec_lock);
309 }
310 return (err);
311 }
312
313 /*
314 * Atomically lookup and add (if needed) Neighbor Cache information for
315 * an address.
316 *
317 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
318 * are always added pointing at the ipmp_ill. Thus, when the ill passed
319 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
320 * entries will be created, both pointing at the same ncec_t. The nce_t
321 * entries will have their nce_ill set to the ipmp_ill and the under_ill
322 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
323 * Local addresses are always created on the ill passed to nce_add_v6.
324 */
325 int
nce_lookup_then_add_v6(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)326 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
327 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
328 {
329 int err = 0;
330 ip_stack_t *ipst = ill->ill_ipst;
331 nce_t *nce, *upper_nce = NULL;
332 ill_t *in_ill = ill;
333 boolean_t need_ill_refrele = B_FALSE;
334
335 if (flags & NCE_F_MCAST) {
336 /*
337 * hw_addr will be figured out in nce_set_multicast_v6;
338 * caller has to select the cast_ill
339 */
340 ASSERT(hw_addr == NULL);
341 ASSERT(!IS_IPMP(ill));
342 err = nce_set_multicast_v6(ill, addr, flags, newnce);
343 return (err);
344 }
345 ASSERT(ill->ill_isv6);
346 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
347 ill = ipmp_ill_hold_ipmp_ill(ill);
348 if (ill == NULL)
349 return (ENXIO);
350 need_ill_refrele = B_TRUE;
351 }
352
353 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
354 nce = nce_lookup_addr(ill, addr);
355 if (nce == NULL) {
356 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
357 &nce);
358 } else {
359 err = EEXIST;
360 }
361 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
362 if (err == 0)
363 err = nce_add_v6_postprocess(nce);
364 if (in_ill != ill && nce != NULL) {
365 nce_t *under_nce = NULL;
366
367 /*
368 * in_ill was the under_ill. Try to create the under_nce.
369 * Hold the ill_g_lock to prevent changes to group membership
370 * until we are done.
371 */
372 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
373 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
374 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
375 ill_t *, ill);
376 rw_exit(&ipst->ips_ill_g_lock);
377 err = ENXIO;
378 nce_refrele(nce);
379 nce = NULL;
380 goto bail;
381 }
382 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
383 if (under_nce == NULL) {
384 rw_exit(&ipst->ips_ill_g_lock);
385 err = EINVAL;
386 nce_refrele(nce);
387 nce = NULL;
388 goto bail;
389 }
390 rw_exit(&ipst->ips_ill_g_lock);
391 upper_nce = nce;
392 nce = under_nce; /* will be returned to caller */
393 if (NCE_ISREACHABLE(nce->nce_common))
394 nce_fastpath_trigger(under_nce);
395 }
396 /* nce_refrele is deferred until the lock is dropped */
397 if (nce != NULL) {
398 if (newnce != NULL)
399 *newnce = nce;
400 else
401 nce_refrele(nce);
402 }
403 bail:
404 if (upper_nce != NULL)
405 nce_refrele(upper_nce);
406 if (need_ill_refrele)
407 ill_refrele(ill);
408 return (err);
409 }
410
411 /*
412 * Remove all the CONDEMNED nces from the appropriate hash table.
413 * We create a private list of NCEs, these may have ires pointing
414 * to them, so the list will be passed through to clean up dependent
415 * ires and only then we can do ncec_refrele() which can make NCE inactive.
416 */
417 static void
nce_remove(ndp_g_t * ndp,ncec_t * ncec,ncec_t ** free_nce_list)418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
419 {
420 ncec_t *ncec1;
421 ncec_t **ptpn;
422
423 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
424 ASSERT(ndp->ndp_g_walker == 0);
425 for (; ncec; ncec = ncec1) {
426 ncec1 = ncec->ncec_next;
427 mutex_enter(&ncec->ncec_lock);
428 if (NCE_ISCONDEMNED(ncec)) {
429 ptpn = ncec->ncec_ptpn;
430 ncec1 = ncec->ncec_next;
431 if (ncec1 != NULL)
432 ncec1->ncec_ptpn = ptpn;
433 *ptpn = ncec1;
434 ncec->ncec_ptpn = NULL;
435 ncec->ncec_next = NULL;
436 ncec->ncec_next = *free_nce_list;
437 *free_nce_list = ncec;
438 }
439 mutex_exit(&ncec->ncec_lock);
440 }
441 }
442
443 /*
444 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
445 * will return this NCE. Also no new timeouts will
446 * be started (See nce_restart_timer).
447 * 2. Cancel any currently running timeouts.
448 * 3. If there is an ndp walker, return. The walker will do the cleanup.
449 * This ensures that walkers see a consistent list of NCEs while walking.
450 * 4. Otherwise remove the NCE from the list of NCEs
451 */
452 void
ncec_delete(ncec_t * ncec)453 ncec_delete(ncec_t *ncec)
454 {
455 ncec_t **ptpn;
456 ncec_t *ncec1;
457 int ipversion = ncec->ncec_ipversion;
458 ndp_g_t *ndp;
459 ip_stack_t *ipst = ncec->ncec_ipst;
460
461 if (ipversion == IPV4_VERSION)
462 ndp = ipst->ips_ndp4;
463 else
464 ndp = ipst->ips_ndp6;
465
466 /* Serialize deletes */
467 mutex_enter(&ncec->ncec_lock);
468 if (NCE_ISCONDEMNED(ncec)) {
469 /* Some other thread is doing the delete */
470 mutex_exit(&ncec->ncec_lock);
471 return;
472 }
473 /*
474 * Caller has a refhold. Also 1 ref for being in the list. Thus
475 * refcnt has to be >= 2
476 */
477 ASSERT(ncec->ncec_refcnt >= 2);
478 ncec->ncec_flags |= NCE_F_CONDEMNED;
479 mutex_exit(&ncec->ncec_lock);
480
481 /* Count how many condemned ires for kmem_cache callback */
482 atomic_inc_32(&ipst->ips_num_nce_condemned);
483 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
484
485 /* Complete any waiting callbacks */
486 ncec_cb_dispatch(ncec);
487
488 /*
489 * Cancel any running timer. Timeout can't be restarted
490 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
491 * Passing invalid timeout id is fine.
492 */
493 if (ncec->ncec_timeout_id != 0) {
494 (void) untimeout(ncec->ncec_timeout_id);
495 ncec->ncec_timeout_id = 0;
496 }
497
498 mutex_enter(&ndp->ndp_g_lock);
499 if (ncec->ncec_ptpn == NULL) {
500 /*
501 * The last ndp walker has already removed this ncec from
502 * the list after we marked the ncec CONDEMNED and before
503 * we grabbed the global lock.
504 */
505 mutex_exit(&ndp->ndp_g_lock);
506 return;
507 }
508 if (ndp->ndp_g_walker > 0) {
509 /*
510 * Can't unlink. The walker will clean up
511 */
512 ndp->ndp_g_walker_cleanup = B_TRUE;
513 mutex_exit(&ndp->ndp_g_lock);
514 return;
515 }
516
517 /*
518 * Now remove the ncec from the list. nce_restart_timer won't restart
519 * the timer since it is marked CONDEMNED.
520 */
521 ptpn = ncec->ncec_ptpn;
522 ncec1 = ncec->ncec_next;
523 if (ncec1 != NULL)
524 ncec1->ncec_ptpn = ptpn;
525 *ptpn = ncec1;
526 ncec->ncec_ptpn = NULL;
527 ncec->ncec_next = NULL;
528 mutex_exit(&ndp->ndp_g_lock);
529
530 /* Removed from ncec_ptpn/ncec_next list */
531 ncec_refrele_notr(ncec);
532 }
533
534 void
ncec_inactive(ncec_t * ncec)535 ncec_inactive(ncec_t *ncec)
536 {
537 mblk_t **mpp;
538 ill_t *ill = ncec->ncec_ill;
539 ip_stack_t *ipst = ncec->ncec_ipst;
540
541 ASSERT(ncec->ncec_refcnt == 0);
542 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
543
544 /* Count how many condemned nces for kmem_cache callback */
545 if (NCE_ISCONDEMNED(ncec))
546 atomic_add_32(&ipst->ips_num_nce_condemned, -1);
547
548 /* Free all allocated messages */
549 mpp = &ncec->ncec_qd_mp;
550 while (*mpp != NULL) {
551 mblk_t *mp;
552
553 mp = *mpp;
554 *mpp = mp->b_next;
555
556 inet_freemsg(mp);
557 }
558 /*
559 * must have been cleaned up in ncec_delete
560 */
561 ASSERT(list_is_empty(&ncec->ncec_cb));
562 list_destroy(&ncec->ncec_cb);
563 /*
564 * free the ncec_lladdr if one was allocated in nce_add_common()
565 */
566 if (ncec->ncec_lladdr_length > 0)
567 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
568
569 #ifdef DEBUG
570 ncec_trace_cleanup(ncec);
571 #endif
572
573 mutex_enter(&ill->ill_lock);
574 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
575 (char *), "ncec", (void *), ncec);
576 ill->ill_ncec_cnt--;
577 ncec->ncec_ill = NULL;
578 /*
579 * If the number of ncec's associated with this ill have dropped
580 * to zero, check whether we need to restart any operation that
581 * is waiting for this to happen.
582 */
583 if (ILL_DOWN_OK(ill)) {
584 /* ipif_ill_refrele_tail drops the ill_lock */
585 ipif_ill_refrele_tail(ill);
586 } else {
587 mutex_exit(&ill->ill_lock);
588 }
589
590 mutex_destroy(&ncec->ncec_lock);
591 kmem_cache_free(ncec_cache, ncec);
592 }
593
594 /*
595 * ncec_walk routine. Delete the ncec if it is associated with the ill
596 * that is going away. Always called as a writer.
597 */
598 void
ncec_delete_per_ill(ncec_t * ncec,uchar_t * arg)599 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
600 {
601 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
602 ncec_delete(ncec);
603 }
604 }
605
606 /*
607 * Neighbor Cache cleanup logic for a list of ncec_t entries.
608 */
609 static void
nce_cleanup_list(ncec_t * ncec)610 nce_cleanup_list(ncec_t *ncec)
611 {
612 ncec_t *ncec_next;
613
614 ASSERT(ncec != NULL);
615 while (ncec != NULL) {
616 ncec_next = ncec->ncec_next;
617 ncec->ncec_next = NULL;
618
619 /*
620 * It is possible for the last ndp walker (this thread)
621 * to come here after ncec_delete has marked the ncec CONDEMNED
622 * and before it has removed the ncec from the fastpath list
623 * or called untimeout. So we need to do it here. It is safe
624 * for both ncec_delete and this thread to do it twice or
625 * even simultaneously since each of the threads has a
626 * reference on the ncec.
627 */
628 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
629 /*
630 * Cancel any running timer. Timeout can't be restarted
631 * since CONDEMNED is set. The ncec_lock can't be
632 * held across untimeout though passing invalid timeout
633 * id is fine.
634 */
635 if (ncec->ncec_timeout_id != 0) {
636 (void) untimeout(ncec->ncec_timeout_id);
637 ncec->ncec_timeout_id = 0;
638 }
639 /* Removed from ncec_ptpn/ncec_next list */
640 ncec_refrele_notr(ncec);
641 ncec = ncec_next;
642 }
643 }
644
645 /*
646 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
647 */
648 boolean_t
nce_restart_dad(ncec_t * ncec)649 nce_restart_dad(ncec_t *ncec)
650 {
651 boolean_t started;
652 ill_t *ill, *hwaddr_ill;
653
654 if (ncec == NULL)
655 return (B_FALSE);
656 ill = ncec->ncec_ill;
657 mutex_enter(&ncec->ncec_lock);
658 if (ncec->ncec_state == ND_PROBE) {
659 mutex_exit(&ncec->ncec_lock);
660 started = B_TRUE;
661 } else if (ncec->ncec_state == ND_REACHABLE) {
662 ASSERT(ncec->ncec_lladdr != NULL);
663 ncec->ncec_state = ND_PROBE;
664 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
665 /*
666 * Slight cheat here: we don't use the initial probe delay
667 * for IPv4 in this obscure case.
668 */
669 mutex_exit(&ncec->ncec_lock);
670 if (IS_IPMP(ill)) {
671 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
672 ncec->ncec_lladdr, ncec->ncec_lladdr_length);
673 } else {
674 hwaddr_ill = ill;
675 }
676 nce_dad(ncec, hwaddr_ill, B_TRUE);
677 started = B_TRUE;
678 } else {
679 mutex_exit(&ncec->ncec_lock);
680 started = B_FALSE;
681 }
682 return (started);
683 }
684
685 /*
686 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
687 * If one is found, the refcnt on the ncec will be incremented.
688 */
689 ncec_t *
ncec_lookup_illgrp_v6(ill_t * ill,const in6_addr_t * addr)690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
691 {
692 ncec_t *ncec;
693 ip_stack_t *ipst = ill->ill_ipst;
694
695 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
696 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
697
698 /* Get head of v6 hash table */
699 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
700 ncec = ncec_lookup_illgrp(ill, addr, ncec);
701 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
702 rw_exit(&ipst->ips_ill_g_lock);
703 return (ncec);
704 }
705 /*
706 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
707 * If one is found, the refcnt on the ncec will be incremented.
708 */
709 ncec_t *
ncec_lookup_illgrp_v4(ill_t * ill,const in_addr_t * addr)710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
711 {
712 ncec_t *ncec = NULL;
713 in6_addr_t addr6;
714 ip_stack_t *ipst = ill->ill_ipst;
715
716 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
717 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
718
719 /* Get head of v4 hash table */
720 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
721 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
722 ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
723 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
724 rw_exit(&ipst->ips_ill_g_lock);
725 return (ncec);
726 }
727
728 /*
729 * Cache entry lookup. Try to find an ncec matching the parameters passed.
730 * If an ncec is found, increment the hold count on that ncec.
731 * The caller passes in the start of the appropriate hash table, and must
732 * be holding the appropriate global lock (ndp_g_lock). In addition, since
733 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
734 * must be held as reader.
735 *
736 * This function always matches across the ipmp group.
737 */
738 ncec_t *
ncec_lookup_illgrp(ill_t * ill,const in6_addr_t * addr,ncec_t * ncec)739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
740 {
741 ndp_g_t *ndp;
742 ip_stack_t *ipst = ill->ill_ipst;
743
744 if (ill->ill_isv6)
745 ndp = ipst->ips_ndp6;
746 else
747 ndp = ipst->ips_ndp4;
748
749 ASSERT(ill != NULL);
750 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
751 if (IN6_IS_ADDR_UNSPECIFIED(addr))
752 return (NULL);
753 for (; ncec != NULL; ncec = ncec->ncec_next) {
754 if (ncec->ncec_ill == ill ||
755 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
756 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
757 mutex_enter(&ncec->ncec_lock);
758 if (!NCE_ISCONDEMNED(ncec)) {
759 ncec_refhold_locked(ncec);
760 mutex_exit(&ncec->ncec_lock);
761 break;
762 }
763 mutex_exit(&ncec->ncec_lock);
764 }
765 }
766 }
767 return (ncec);
768 }
769
770 /*
771 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
772 * entries for ill only, i.e., when ill is part of an ipmp group,
773 * nce_lookup_v4 will never try to match across the group.
774 */
775 nce_t *
nce_lookup_v4(ill_t * ill,const in_addr_t * addr)776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
777 {
778 nce_t *nce;
779 in6_addr_t addr6;
780 ip_stack_t *ipst = ill->ill_ipst;
781
782 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
783 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
784 nce = nce_lookup_addr(ill, &addr6);
785 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
786 return (nce);
787 }
788
789 /*
790 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
791 * entries for ill only, i.e., when ill is part of an ipmp group,
792 * nce_lookup_v6 will never try to match across the group.
793 */
794 nce_t *
nce_lookup_v6(ill_t * ill,const in6_addr_t * addr6)795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
796 {
797 nce_t *nce;
798 ip_stack_t *ipst = ill->ill_ipst;
799
800 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
801 nce = nce_lookup_addr(ill, addr6);
802 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
803 return (nce);
804 }
805
806 static nce_t *
nce_lookup_addr(ill_t * ill,const in6_addr_t * addr)807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
808 {
809 nce_t *nce;
810
811 ASSERT(ill != NULL);
812 #ifdef DEBUG
813 if (ill->ill_isv6)
814 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
815 else
816 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
817 #endif
818 mutex_enter(&ill->ill_lock);
819 nce = nce_lookup(ill, addr);
820 mutex_exit(&ill->ill_lock);
821 return (nce);
822 }
823
824
825 /*
826 * Router turned to host. We need to make sure that cached copies of the ncec
827 * are not used for forwarding packets if they were derived from the default
828 * route, and that the default route itself is removed, as required by
829 * section 7.2.5 of RFC 2461.
830 *
831 * Note that the ncec itself probably has valid link-layer information for the
832 * nexthop, so that there is no reason to delete the ncec, as long as the
833 * ISROUTER flag is turned off.
834 */
835 static void
ncec_router_to_host(ncec_t * ncec)836 ncec_router_to_host(ncec_t *ncec)
837 {
838 ire_t *ire;
839 ip_stack_t *ipst = ncec->ncec_ipst;
840
841 mutex_enter(&ncec->ncec_lock);
842 ncec->ncec_flags &= ~NCE_F_ISROUTER;
843 mutex_exit(&ncec->ncec_lock);
844
845 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
846 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
847 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
848 if (ire != NULL) {
849 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
850 ire_delete(ire);
851 ire_refrele(ire);
852 }
853 }
854
855 /*
856 * Process passed in parameters either from an incoming packet or via
857 * user ioctl.
858 */
859 void
nce_process(ncec_t * ncec,uchar_t * hw_addr,uint32_t flag,boolean_t is_adv)860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
861 {
862 ill_t *ill = ncec->ncec_ill;
863 uint32_t hw_addr_len = ill->ill_phys_addr_length;
864 boolean_t ll_updated = B_FALSE;
865 boolean_t ll_changed;
866 nce_t *nce;
867
868 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
869 /*
870 * No updates of link layer address or the neighbor state is
871 * allowed, when the cache is in NONUD state. This still
872 * allows for responding to reachability solicitation.
873 */
874 mutex_enter(&ncec->ncec_lock);
875 if (ncec->ncec_state == ND_INCOMPLETE) {
876 if (hw_addr == NULL) {
877 mutex_exit(&ncec->ncec_lock);
878 return;
879 }
880 nce_set_ll(ncec, hw_addr);
881 /*
882 * Update ncec state and send the queued packets
883 * back to ip this time ire will be added.
884 */
885 if (flag & ND_NA_FLAG_SOLICITED) {
886 nce_update(ncec, ND_REACHABLE, NULL);
887 } else {
888 nce_update(ncec, ND_STALE, NULL);
889 }
890 mutex_exit(&ncec->ncec_lock);
891 nce = nce_fastpath(ncec, B_TRUE, NULL);
892 nce_resolv_ok(ncec);
893 if (nce != NULL)
894 nce_refrele(nce);
895 return;
896 }
897 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
898 if (!is_adv) {
899 /* If this is a SOLICITATION request only */
900 if (ll_changed)
901 nce_update(ncec, ND_STALE, hw_addr);
902 mutex_exit(&ncec->ncec_lock);
903 ncec_cb_dispatch(ncec);
904 return;
905 }
906 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
907 /* If in any other state than REACHABLE, ignore */
908 if (ncec->ncec_state == ND_REACHABLE) {
909 nce_update(ncec, ND_STALE, NULL);
910 }
911 mutex_exit(&ncec->ncec_lock);
912 ncec_cb_dispatch(ncec);
913 return;
914 } else {
915 if (ll_changed) {
916 nce_update(ncec, ND_UNCHANGED, hw_addr);
917 ll_updated = B_TRUE;
918 }
919 if (flag & ND_NA_FLAG_SOLICITED) {
920 nce_update(ncec, ND_REACHABLE, NULL);
921 } else {
922 if (ll_updated) {
923 nce_update(ncec, ND_STALE, NULL);
924 }
925 }
926 mutex_exit(&ncec->ncec_lock);
927 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
928 NCE_F_ISROUTER)) {
929 ncec_router_to_host(ncec);
930 } else {
931 ncec_cb_dispatch(ncec);
932 }
933 }
934 }
935
936 /*
937 * Pass arg1 to the pfi supplied, along with each ncec in existence.
938 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
939 * walking the hash list.
940 */
941 void
ncec_walk_common(ndp_g_t * ndp,ill_t * ill,pfi_t pfi,void * arg1,boolean_t trace)942 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
943 boolean_t trace)
944 {
945 ncec_t *ncec;
946 ncec_t *ncec1;
947 ncec_t **ncep;
948 ncec_t *free_nce_list = NULL;
949
950 mutex_enter(&ndp->ndp_g_lock);
951 /* Prevent ncec_delete from unlink and free of NCE */
952 ndp->ndp_g_walker++;
953 mutex_exit(&ndp->ndp_g_lock);
954 for (ncep = ndp->nce_hash_tbl;
955 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
956 for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
957 ncec1 = ncec->ncec_next;
958 if (ill == NULL || ncec->ncec_ill == ill) {
959 if (trace) {
960 ncec_refhold(ncec);
961 (*pfi)(ncec, arg1);
962 ncec_refrele(ncec);
963 } else {
964 ncec_refhold_notr(ncec);
965 (*pfi)(ncec, arg1);
966 ncec_refrele_notr(ncec);
967 }
968 }
969 }
970 }
971 mutex_enter(&ndp->ndp_g_lock);
972 ndp->ndp_g_walker--;
973 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
974 /* Time to delete condemned entries */
975 for (ncep = ndp->nce_hash_tbl;
976 ncep < A_END(ndp->nce_hash_tbl); ncep++) {
977 ncec = *ncep;
978 if (ncec != NULL) {
979 nce_remove(ndp, ncec, &free_nce_list);
980 }
981 }
982 ndp->ndp_g_walker_cleanup = B_FALSE;
983 }
984
985 mutex_exit(&ndp->ndp_g_lock);
986
987 if (free_nce_list != NULL) {
988 nce_cleanup_list(free_nce_list);
989 }
990 }
991
992 /*
993 * Walk everything.
994 * Note that ill can be NULL hence can't derive the ipst from it.
995 */
996 void
ncec_walk(ill_t * ill,pfi_t pfi,void * arg1,ip_stack_t * ipst)997 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
998 {
999 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1000 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1001 }
1002
1003 /*
1004 * For each interface an entry is added for the unspecified multicast group.
1005 * Here that mapping is used to form the multicast cache entry for a particular
1006 * multicast destination.
1007 */
1008 static int
nce_set_multicast_v6(ill_t * ill,const in6_addr_t * dst,uint16_t flags,nce_t ** newnce)1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1010 uint16_t flags, nce_t **newnce)
1011 {
1012 uchar_t *hw_addr;
1013 int err = 0;
1014 ip_stack_t *ipst = ill->ill_ipst;
1015 nce_t *nce;
1016
1017 ASSERT(ill != NULL);
1018 ASSERT(ill->ill_isv6);
1019 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1020
1021 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1022 nce = nce_lookup_addr(ill, dst);
1023 if (nce != NULL) {
1024 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1025 goto done;
1026 }
1027 if (ill->ill_net_type == IRE_IF_RESOLVER) {
1028 /*
1029 * For IRE_IF_RESOLVER a hardware mapping can be
1030 * generated.
1031 */
1032 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1033 if (hw_addr == NULL) {
1034 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1035 return (ENOMEM);
1036 }
1037 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1038 } else {
1039 /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1040 hw_addr = NULL;
1041 }
1042 ASSERT((flags & NCE_F_MCAST) != 0);
1043 ASSERT((flags & NCE_F_NONUD) != 0);
1044 /* nce_state will be computed by nce_add_common() */
1045 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1046 ND_UNCHANGED, &nce);
1047 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1048 if (err == 0)
1049 err = nce_add_v6_postprocess(nce);
1050 if (hw_addr != NULL)
1051 kmem_free(hw_addr, ill->ill_nd_lla_len);
1052 if (err != 0) {
1053 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1054 return (err);
1055 }
1056 done:
1057 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1058 if (newnce != NULL)
1059 *newnce = nce;
1060 else
1061 nce_refrele(nce);
1062 return (0);
1063 }
1064
1065 /*
1066 * Return the link layer address, and any flags of a ncec.
1067 */
1068 int
ndp_query(ill_t * ill,struct lif_nd_req * lnr)1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1070 {
1071 ncec_t *ncec;
1072 in6_addr_t *addr;
1073 sin6_t *sin6;
1074
1075 ASSERT(ill != NULL && ill->ill_isv6);
1076 sin6 = (sin6_t *)&lnr->lnr_addr;
1077 addr = &sin6->sin6_addr;
1078
1079 /*
1080 * NOTE: if the ill is an IPMP interface, then match against the whole
1081 * illgrp. This e.g. allows in.ndpd to retrieve the link layer
1082 * addresses for the data addresses on an IPMP interface even though
1083 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1084 */
1085 ncec = ncec_lookup_illgrp_v6(ill, addr);
1086 if (ncec == NULL)
1087 return (ESRCH);
1088 /* If no link layer address is available yet, return ESRCH */
1089 if (!NCE_ISREACHABLE(ncec)) {
1090 ncec_refrele(ncec);
1091 return (ESRCH);
1092 }
1093 lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1094 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1095 lnr->lnr_hdw_len);
1096 if (ncec->ncec_flags & NCE_F_ISROUTER)
1097 lnr->lnr_flags = NDF_ISROUTER_ON;
1098 if (ncec->ncec_flags & NCE_F_ANYCAST)
1099 lnr->lnr_flags |= NDF_ANYCAST_ON;
1100 ncec_refrele(ncec);
1101 return (0);
1102 }
1103
1104 /*
1105 * Finish setting up the Enable/Disable multicast for the driver.
1106 */
1107 mblk_t *
ndp_mcastreq(ill_t * ill,const in6_addr_t * v6group,uint32_t hw_addr_len,uint32_t hw_addr_offset,mblk_t * mp)1108 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1109 uint32_t hw_addr_offset, mblk_t *mp)
1110 {
1111 uchar_t *hw_addr;
1112 ipaddr_t v4group;
1113 uchar_t *addr;
1114
1115 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1116 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1117 IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1118
1119 ASSERT(CLASSD(v4group));
1120 ASSERT(!(ill->ill_isv6));
1121
1122 addr = (uchar_t *)&v4group;
1123 } else {
1124 ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1125 ASSERT(ill->ill_isv6);
1126
1127 addr = (uchar_t *)v6group;
1128 }
1129 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1130 if (hw_addr == NULL) {
1131 ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1132 freemsg(mp);
1133 return (NULL);
1134 }
1135
1136 ip_mcast_mapping(ill, addr, hw_addr);
1137 return (mp);
1138 }
1139
1140 void
ip_ndp_resolve(ncec_t * ncec)1141 ip_ndp_resolve(ncec_t *ncec)
1142 {
1143 in_addr_t sender4 = INADDR_ANY;
1144 in6_addr_t sender6 = ipv6_all_zeros;
1145 ill_t *src_ill;
1146 uint32_t ms;
1147
1148 src_ill = nce_resolve_src(ncec, &sender6);
1149 if (src_ill == NULL) {
1150 /* Make sure we try again later */
1151 ms = ncec->ncec_ill->ill_reachable_retrans_time;
1152 nce_restart_timer(ncec, (clock_t)ms);
1153 return;
1154 }
1155 if (ncec->ncec_ipversion == IPV4_VERSION)
1156 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1157 mutex_enter(&ncec->ncec_lock);
1158 if (ncec->ncec_ipversion == IPV6_VERSION)
1159 ms = ndp_solicit(ncec, sender6, src_ill);
1160 else
1161 ms = arp_request(ncec, sender4, src_ill);
1162 mutex_exit(&ncec->ncec_lock);
1163 if (ms == 0) {
1164 if (ncec->ncec_state != ND_REACHABLE) {
1165 if (ncec->ncec_ipversion == IPV6_VERSION)
1166 ndp_resolv_failed(ncec);
1167 else
1168 arp_resolv_failed(ncec);
1169 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1170 nce_make_unreachable(ncec);
1171 ncec_delete(ncec);
1172 }
1173 } else {
1174 nce_restart_timer(ncec, (clock_t)ms);
1175 }
1176 done:
1177 ill_refrele(src_ill);
1178 }
1179
1180 /*
1181 * Send an IPv6 neighbor solicitation.
1182 * Returns number of milliseconds after which we should either rexmit or abort.
1183 * Return of zero means we should abort.
1184 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1185 * The optional source address is used as a hint to ndp_solicit for
1186 * which source to use in the packet.
1187 *
1188 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1189 * the packet.
1190 */
1191 uint32_t
ndp_solicit(ncec_t * ncec,in6_addr_t src,ill_t * ill)1192 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1193 {
1194 in6_addr_t dst;
1195 boolean_t dropped = B_FALSE;
1196
1197 ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1198 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1199
1200 if (ncec->ncec_rcnt == 0)
1201 return (0);
1202
1203 dst = ncec->ncec_addr;
1204 ncec->ncec_rcnt--;
1205 mutex_exit(&ncec->ncec_lock);
1206 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1207 ill->ill_phys_addr_length, &src, &dst, 0);
1208 mutex_enter(&ncec->ncec_lock);
1209 if (dropped)
1210 ncec->ncec_rcnt++;
1211 return (ncec->ncec_ill->ill_reachable_retrans_time);
1212 }
1213
1214 /*
1215 * Attempt to recover an address on an interface that's been marked as a
1216 * duplicate. Because NCEs are destroyed when the interface goes down, there's
1217 * no easy way to just probe the address and have the right thing happen if
1218 * it's no longer in use. Instead, we just bring it up normally and allow the
1219 * regular interface start-up logic to probe for a remaining duplicate and take
1220 * us back down if necessary.
1221 * Neither DHCP nor temporary addresses arrive here; they're excluded by
1222 * ip_ndp_excl.
1223 */
1224 /* ARGSUSED */
1225 void
ip_addr_recover(ipsq_t * ipsq,queue_t * rq,mblk_t * mp,void * dummy_arg)1226 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1227 {
1228 ill_t *ill = rq->q_ptr;
1229 ipif_t *ipif;
1230 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1231 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1232 boolean_t addr_equal;
1233
1234 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1235 /*
1236 * We do not support recovery of proxy ARP'd interfaces,
1237 * because the system lacks a complete proxy ARP mechanism.
1238 */
1239 if (ill->ill_isv6) {
1240 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1241 addr6);
1242 } else {
1243 addr_equal = (ipif->ipif_lcl_addr == *addr4);
1244 }
1245
1246 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1247 continue;
1248
1249 /*
1250 * If we have already recovered or if the interface is going
1251 * away, then ignore.
1252 */
1253 mutex_enter(&ill->ill_lock);
1254 if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1255 (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1256 mutex_exit(&ill->ill_lock);
1257 continue;
1258 }
1259
1260 ipif->ipif_flags &= ~IPIF_DUPLICATE;
1261 ill->ill_ipif_dup_count--;
1262 mutex_exit(&ill->ill_lock);
1263 ipif->ipif_was_dup = B_TRUE;
1264
1265 if (ill->ill_isv6) {
1266 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1267 (void) ipif_up_done_v6(ipif);
1268 } else {
1269 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1270 EINPROGRESS);
1271 (void) ipif_up_done(ipif);
1272 }
1273 }
1274 freeb(mp);
1275 }
1276
1277 /*
1278 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1279 * As long as someone else holds the address, the interface will stay down.
1280 * When that conflict goes away, the interface is brought back up. This is
1281 * done so that accidental shutdowns of addresses aren't made permanent. Your
1282 * server will recover from a failure.
1283 *
1284 * For DHCP and temporary addresses, recovery is not done in the kernel.
1285 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1286 *
1287 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1288 */
1289 void
ipif_dup_recovery(void * arg)1290 ipif_dup_recovery(void *arg)
1291 {
1292 ipif_t *ipif = arg;
1293
1294 ipif->ipif_recovery_id = 0;
1295 if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1296 return;
1297
1298 /*
1299 * No lock, because this is just an optimization.
1300 */
1301 if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1302 return;
1303
1304 /* If the link is down, we'll retry this later */
1305 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1306 return;
1307
1308 ipif_do_recovery(ipif);
1309 }
1310
1311 /*
1312 * Perform interface recovery by forcing the duplicate interfaces up and
1313 * allowing the system to determine which ones should stay up.
1314 *
1315 * Called both by recovery timer expiry and link-up notification.
1316 */
1317 void
ipif_do_recovery(ipif_t * ipif)1318 ipif_do_recovery(ipif_t *ipif)
1319 {
1320 ill_t *ill = ipif->ipif_ill;
1321 mblk_t *mp;
1322 ip_stack_t *ipst = ill->ill_ipst;
1323 size_t mp_size;
1324
1325 if (ipif->ipif_isv6)
1326 mp_size = sizeof (ipif->ipif_v6lcl_addr);
1327 else
1328 mp_size = sizeof (ipif->ipif_lcl_addr);
1329 mp = allocb(mp_size, BPRI_MED);
1330 if (mp == NULL) {
1331 mutex_enter(&ill->ill_lock);
1332 if (ipst->ips_ip_dup_recovery > 0 &&
1333 ipif->ipif_recovery_id == 0 &&
1334 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1335 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1336 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1337 }
1338 mutex_exit(&ill->ill_lock);
1339 } else {
1340 /*
1341 * A recovery timer may still be running if we got here from
1342 * ill_restart_dad(); cancel that timer.
1343 */
1344 if (ipif->ipif_recovery_id != 0)
1345 (void) untimeout(ipif->ipif_recovery_id);
1346 ipif->ipif_recovery_id = 0;
1347
1348 if (ipif->ipif_isv6) {
1349 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1350 sizeof (ipif->ipif_v6lcl_addr));
1351 } else {
1352 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1353 sizeof (ipif->ipif_lcl_addr));
1354 }
1355 ill_refhold(ill);
1356 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1357 B_FALSE);
1358 }
1359 }
1360
1361 /*
1362 * Find the MAC and IP addresses in an NA/NS message.
1363 */
1364 static void
ip_ndp_find_addresses(mblk_t * mp,ip_recv_attr_t * ira,ill_t * ill,in6_addr_t * targp,uchar_t ** haddr,uint_t * haddrlenp)1365 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1366 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1367 {
1368 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1369 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1370 uchar_t *addr;
1371 int alen;
1372
1373 /* icmp_inbound_v6 ensures this */
1374 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1375
1376 addr = ira->ira_l2src;
1377 alen = ill->ill_phys_addr_length;
1378 if (alen > 0) {
1379 *haddr = addr;
1380 *haddrlenp = alen;
1381 } else {
1382 *haddr = NULL;
1383 *haddrlenp = 0;
1384 }
1385
1386 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1387 *targp = ns->nd_ns_target;
1388 }
1389
1390 /*
1391 * This is for exclusive changes due to NDP duplicate address detection
1392 * failure.
1393 */
1394 /* ARGSUSED */
1395 static void
ip_ndp_excl(ipsq_t * ipsq,queue_t * rq,mblk_t * mp,void * dummy_arg)1396 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1397 {
1398 ill_t *ill = rq->q_ptr;
1399 ipif_t *ipif;
1400 uchar_t *haddr;
1401 uint_t haddrlen;
1402 ip_stack_t *ipst = ill->ill_ipst;
1403 in6_addr_t targ;
1404 ip_recv_attr_t iras;
1405 mblk_t *attrmp;
1406
1407 attrmp = mp;
1408 mp = mp->b_cont;
1409 attrmp->b_cont = NULL;
1410 if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1411 /* The ill or ip_stack_t disappeared on us */
1412 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1413 ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1414 freemsg(mp);
1415 ira_cleanup(&iras, B_TRUE);
1416 return;
1417 }
1418
1419 ASSERT(ill == iras.ira_rill);
1420
1421 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1422 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1423 /*
1424 * Ignore conflicts generated by misbehaving switches that
1425 * just reflect our own messages back to us. For IPMP, we may
1426 * see reflections across any ill in the illgrp.
1427 *
1428 * RFC2462 and revisions tried to detect both the case
1429 * when a statically configured IPv6 address is a duplicate,
1430 * and the case when the L2 address itself is a duplicate. The
1431 * later is important because, with stateles address autoconf,
1432 * if the L2 address is a duplicate, the resulting IPv6
1433 * address(es) would also be duplicates. We rely on DAD of the
1434 * IPv6 address itself to detect the latter case.
1435 */
1436 /* For an under ill_grp can change under lock */
1437 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1438 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1439 IS_UNDER_IPMP(ill) &&
1440 ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1441 haddrlen) != NULL) {
1442 rw_exit(&ipst->ips_ill_g_lock);
1443 goto ignore_conflict;
1444 }
1445 rw_exit(&ipst->ips_ill_g_lock);
1446 }
1447
1448 /*
1449 * Look up the appropriate ipif.
1450 */
1451 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1452 if (ipif == NULL)
1453 goto ignore_conflict;
1454
1455 /* Reload the ill to match the ipif */
1456 ill = ipif->ipif_ill;
1457
1458 /* If it's already duplicate or ineligible, then don't do anything. */
1459 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1460 ipif_refrele(ipif);
1461 goto ignore_conflict;
1462 }
1463
1464 /*
1465 * If this is a failure during duplicate recovery, then don't
1466 * complain. It may take a long time to recover.
1467 */
1468 if (!ipif->ipif_was_dup) {
1469 char ibuf[LIFNAMSIZ];
1470 char hbuf[MAC_STR_LEN];
1471 char sbuf[INET6_ADDRSTRLEN];
1472
1473 ipif_get_name(ipif, ibuf, sizeof (ibuf));
1474 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1475 " disabled", ibuf,
1476 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1477 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1478 }
1479 mutex_enter(&ill->ill_lock);
1480 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1481 ipif->ipif_flags |= IPIF_DUPLICATE;
1482 ill->ill_ipif_dup_count++;
1483 mutex_exit(&ill->ill_lock);
1484 (void) ipif_down(ipif, NULL, NULL);
1485 (void) ipif_down_tail(ipif);
1486 mutex_enter(&ill->ill_lock);
1487 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1488 ill->ill_net_type == IRE_IF_RESOLVER &&
1489 !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1490 ipst->ips_ip_dup_recovery > 0) {
1491 ASSERT(ipif->ipif_recovery_id == 0);
1492 ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1493 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1494 }
1495 mutex_exit(&ill->ill_lock);
1496 ipif_refrele(ipif);
1497
1498 ignore_conflict:
1499 freemsg(mp);
1500 ira_cleanup(&iras, B_TRUE);
1501 }
1502
1503 /*
1504 * Handle failure by tearing down the ipifs with the specified address. Note
1505 * that tearing down the ipif also means deleting the ncec through ipif_down, so
1506 * it's not possible to do recovery by just restarting the ncec timer. Instead,
1507 * we start a timer on the ipif.
1508 * Caller has to free mp;
1509 */
1510 static void
ndp_failure(mblk_t * mp,ip_recv_attr_t * ira)1511 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1512 {
1513 const uchar_t *haddr;
1514 ill_t *ill = ira->ira_rill;
1515
1516 /*
1517 * Ignore conflicts generated by misbehaving switches that just
1518 * reflect our own messages back to us.
1519 */
1520
1521 /* icmp_inbound_v6 ensures this */
1522 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1523 haddr = ira->ira_l2src;
1524 if (haddr != NULL &&
1525 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1526 return;
1527 }
1528
1529 if ((mp = copymsg(mp)) != NULL) {
1530 mblk_t *attrmp;
1531
1532 attrmp = ip_recv_attr_to_mblk(ira);
1533 if (attrmp == NULL) {
1534 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1535 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1536 freemsg(mp);
1537 } else {
1538 ASSERT(attrmp->b_cont == NULL);
1539 attrmp->b_cont = mp;
1540 mp = attrmp;
1541 ill_refhold(ill);
1542 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1543 B_FALSE);
1544 }
1545 }
1546 }
1547
1548 /*
1549 * Handle a discovered conflict: some other system is advertising that it owns
1550 * one of our IP addresses. We need to defend ourselves, or just shut down the
1551 * interface.
1552 *
1553 * Handles both IPv4 and IPv6
1554 */
1555 boolean_t
ip_nce_conflict(mblk_t * mp,ip_recv_attr_t * ira,ncec_t * ncec)1556 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1557 {
1558 ipif_t *ipif;
1559 clock_t now;
1560 uint_t maxdefense;
1561 uint_t defs;
1562 ill_t *ill = ira->ira_ill;
1563 ip_stack_t *ipst = ill->ill_ipst;
1564 uint32_t elapsed;
1565 boolean_t isv6 = ill->ill_isv6;
1566 ipaddr_t ncec_addr;
1567
1568 if (isv6) {
1569 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1570 ipst);
1571 } else {
1572 if (arp_no_defense) {
1573 /*
1574 * Yes, there is a conflict, but no, we do not
1575 * defend ourself.
1576 */
1577 return (B_TRUE);
1578 }
1579 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1580 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1581 ipst);
1582 }
1583 if (ipif == NULL)
1584 return (B_FALSE);
1585
1586 /*
1587 * First, figure out if this address is disposable.
1588 */
1589 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1590 maxdefense = ipst->ips_ip_max_temp_defend;
1591 else
1592 maxdefense = ipst->ips_ip_max_defend;
1593
1594 /*
1595 * Now figure out how many times we've defended ourselves. Ignore
1596 * defenses that happened long in the past.
1597 */
1598 now = ddi_get_lbolt();
1599 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1600 mutex_enter(&ncec->ncec_lock);
1601 if ((defs = ncec->ncec_defense_count) > 0 &&
1602 elapsed > ipst->ips_ip_defend_interval) {
1603 /*
1604 * ip_defend_interval has elapsed.
1605 * reset the defense count.
1606 */
1607 ncec->ncec_defense_count = defs = 0;
1608 }
1609 ncec->ncec_defense_count++;
1610 ncec->ncec_last_time_defended = now;
1611 mutex_exit(&ncec->ncec_lock);
1612 ipif_refrele(ipif);
1613
1614 /*
1615 * If we've defended ourselves too many times already, then give up and
1616 * tear down the interface(s) using this address.
1617 * Otherwise, caller has to defend by sending out an announce.
1618 */
1619 if (defs >= maxdefense) {
1620 if (isv6)
1621 ndp_failure(mp, ira);
1622 else
1623 arp_failure(mp, ira);
1624 } else {
1625 return (B_TRUE); /* caller must defend this address */
1626 }
1627 return (B_FALSE);
1628 }
1629
1630 /*
1631 * Handle reception of Neighbor Solicitation messages.
1632 */
1633 static void
ndp_input_solicit(mblk_t * mp,ip_recv_attr_t * ira)1634 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1635 {
1636 ill_t *ill = ira->ira_ill, *under_ill;
1637 nd_neighbor_solicit_t *ns;
1638 uint32_t hlen = ill->ill_phys_addr_length;
1639 uchar_t *haddr = NULL;
1640 icmp6_t *icmp_nd;
1641 ip6_t *ip6h;
1642 ncec_t *our_ncec = NULL;
1643 in6_addr_t target;
1644 in6_addr_t src;
1645 int len;
1646 int flag = 0;
1647 nd_opt_hdr_t *opt = NULL;
1648 boolean_t bad_solicit = B_FALSE;
1649 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1650 boolean_t need_ill_refrele = B_FALSE;
1651
1652 ip6h = (ip6_t *)mp->b_rptr;
1653 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1654 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1655 src = ip6h->ip6_src;
1656 ns = (nd_neighbor_solicit_t *)icmp_nd;
1657 target = ns->nd_ns_target;
1658 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1659 IN6_IS_ADDR_LOOPBACK(&target)) {
1660 if (ip_debug > 2) {
1661 /* ip1dbg */
1662 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1663 AF_INET6, &target);
1664 }
1665 bad_solicit = B_TRUE;
1666 goto done;
1667 }
1668 if (len > sizeof (nd_neighbor_solicit_t)) {
1669 /* Options present */
1670 opt = (nd_opt_hdr_t *)&ns[1];
1671 len -= sizeof (nd_neighbor_solicit_t);
1672 if (!ndp_verify_optlen(opt, len)) {
1673 ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1674 bad_solicit = B_TRUE;
1675 goto done;
1676 }
1677 }
1678 if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1679 /* Check to see if this is a valid DAD solicitation */
1680 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1681 if (ip_debug > 2) {
1682 /* ip1dbg */
1683 pr_addr_dbg("ndp_input_solicit: IPv6 "
1684 "Destination is not solicited node "
1685 "multicast %s\n", AF_INET6,
1686 &ip6h->ip6_dst);
1687 }
1688 bad_solicit = B_TRUE;
1689 goto done;
1690 }
1691 }
1692
1693 /*
1694 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1695 * received this packet if it's multicast) is not the ill tied to
1696 * e.g. the IPMP ill's data link-local. So we match across the illgrp
1697 * to ensure we find the associated NCE.
1698 */
1699 our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1700 /*
1701 * If this is a valid Solicitation for an address we are publishing,
1702 * then a PUBLISH entry should exist in the cache
1703 */
1704 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1705 ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1706 "ifname=%s ", ill->ill_name));
1707 if (ip_debug > 2) {
1708 /* ip1dbg */
1709 pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1710 }
1711 if (our_ncec == NULL)
1712 bad_solicit = B_TRUE;
1713 goto done;
1714 }
1715
1716 /* At this point we should have a verified NS per spec */
1717 if (opt != NULL) {
1718 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1719 if (opt != NULL) {
1720 haddr = (uchar_t *)&opt[1];
1721 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1722 hlen == 0) {
1723 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1724 bad_solicit = B_TRUE;
1725 goto done;
1726 }
1727 }
1728 }
1729
1730 /* If sending directly to peer, set the unicast flag */
1731 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1732 flag |= NDP_UNICAST;
1733
1734 /*
1735 * Create/update the entry for the soliciting node on the ipmp_ill.
1736 * or respond to outstanding queries, don't if
1737 * the source is unspecified address.
1738 */
1739 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1740 int err;
1741 nce_t *nnce;
1742
1743 ASSERT(ill->ill_isv6);
1744 /*
1745 * Regular solicitations *must* include the Source Link-Layer
1746 * Address option. Ignore messages that do not.
1747 */
1748 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1749 ip1dbg(("ndp_input_solicit: source link-layer address "
1750 "option missing with a specified source.\n"));
1751 bad_solicit = B_TRUE;
1752 goto done;
1753 }
1754
1755 /*
1756 * This is a regular solicitation. If we're still in the
1757 * process of verifying the address, then don't respond at all
1758 * and don't keep track of the sender.
1759 */
1760 if (our_ncec->ncec_state == ND_PROBE)
1761 goto done;
1762
1763 /*
1764 * If the solicitation doesn't have sender hardware address
1765 * (legal for unicast solicitation), then process without
1766 * installing the return NCE. Either we already know it, or
1767 * we'll be forced to look it up when (and if) we reply to the
1768 * packet.
1769 */
1770 if (haddr == NULL)
1771 goto no_source;
1772
1773 under_ill = ill;
1774 if (IS_UNDER_IPMP(under_ill)) {
1775 ill = ipmp_ill_hold_ipmp_ill(under_ill);
1776 if (ill == NULL)
1777 ill = under_ill;
1778 else
1779 need_ill_refrele = B_TRUE;
1780 }
1781 err = nce_lookup_then_add_v6(ill,
1782 haddr, hlen,
1783 &src, /* Soliciting nodes address */
1784 0,
1785 ND_STALE,
1786 &nnce);
1787
1788 if (need_ill_refrele) {
1789 ill_refrele(ill);
1790 ill = under_ill;
1791 need_ill_refrele = B_FALSE;
1792 }
1793 switch (err) {
1794 case 0:
1795 /* done with this entry */
1796 nce_refrele(nnce);
1797 break;
1798 case EEXIST:
1799 /*
1800 * B_FALSE indicates this is not an an advertisement.
1801 */
1802 nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1803 nce_refrele(nnce);
1804 break;
1805 default:
1806 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1807 err));
1808 goto done;
1809 }
1810 no_source:
1811 flag |= NDP_SOLICITED;
1812 } else {
1813 /*
1814 * No source link layer address option should be present in a
1815 * valid DAD request.
1816 */
1817 if (haddr != NULL) {
1818 ip1dbg(("ndp_input_solicit: source link-layer address "
1819 "option present with an unspecified source.\n"));
1820 bad_solicit = B_TRUE;
1821 goto done;
1822 }
1823 if (our_ncec->ncec_state == ND_PROBE) {
1824 /*
1825 * Internally looped-back probes will have
1826 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1827 * transmissions.
1828 */
1829 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1830 /*
1831 * If someone else is probing our address, then
1832 * we've crossed wires. Declare failure.
1833 */
1834 ndp_failure(mp, ira);
1835 }
1836 goto done;
1837 }
1838 /*
1839 * This is a DAD probe. Multicast the advertisement to the
1840 * all-nodes address.
1841 */
1842 src = ipv6_all_hosts_mcast;
1843 }
1844 flag |= nce_advert_flags(our_ncec);
1845 (void) ndp_xmit(ill,
1846 ND_NEIGHBOR_ADVERT,
1847 our_ncec->ncec_lladdr,
1848 our_ncec->ncec_lladdr_length,
1849 &target, /* Source and target of the advertisement pkt */
1850 &src, /* IP Destination (source of original pkt) */
1851 flag);
1852 done:
1853 if (bad_solicit)
1854 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1855 if (our_ncec != NULL)
1856 ncec_refrele(our_ncec);
1857 }
1858
1859 /*
1860 * Handle reception of Neighbor Solicitation messages
1861 */
1862 void
ndp_input_advert(mblk_t * mp,ip_recv_attr_t * ira)1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1864 {
1865 ill_t *ill = ira->ira_ill;
1866 nd_neighbor_advert_t *na;
1867 uint32_t hlen = ill->ill_phys_addr_length;
1868 uchar_t *haddr = NULL;
1869 icmp6_t *icmp_nd;
1870 ip6_t *ip6h;
1871 ncec_t *dst_ncec = NULL;
1872 in6_addr_t target;
1873 nd_opt_hdr_t *opt = NULL;
1874 int len;
1875 ip_stack_t *ipst = ill->ill_ipst;
1876 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
1877
1878 ip6h = (ip6_t *)mp->b_rptr;
1879 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1880 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1881 na = (nd_neighbor_advert_t *)icmp_nd;
1882
1883 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1884 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1885 ip1dbg(("ndp_input_advert: Target is multicast but the "
1886 "solicited flag is not zero\n"));
1887 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1888 return;
1889 }
1890 target = na->nd_na_target;
1891 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1892 IN6_IS_ADDR_LOOPBACK(&target)) {
1893 if (ip_debug > 2) {
1894 /* ip1dbg */
1895 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1896 AF_INET6, &target);
1897 }
1898 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1899 return;
1900 }
1901 if (len > sizeof (nd_neighbor_advert_t)) {
1902 opt = (nd_opt_hdr_t *)&na[1];
1903 if (!ndp_verify_optlen(opt,
1904 len - sizeof (nd_neighbor_advert_t))) {
1905 ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1906 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1907 return;
1908 }
1909 /* At this point we have a verified NA per spec */
1910 len -= sizeof (nd_neighbor_advert_t);
1911 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1912 if (opt != NULL) {
1913 haddr = (uchar_t *)&opt[1];
1914 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1915 hlen == 0) {
1916 ip1dbg(("ndp_input_advert: bad SLLA\n"));
1917 BUMP_MIB(mib,
1918 ipv6IfIcmpInBadNeighborAdvertisements);
1919 return;
1920 }
1921 }
1922 }
1923
1924 /*
1925 * NOTE: we match across the illgrp since we need to do DAD for all of
1926 * our local addresses, and those are spread across all the active
1927 * ills in the group.
1928 */
1929 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1930 return;
1931
1932 if (NCE_PUBLISH(dst_ncec)) {
1933 /*
1934 * Someone just advertised an addresses that we publish. First,
1935 * check it it was us -- if so, we can safely ignore it.
1936 * We don't get the haddr from the ira_l2src because, in the
1937 * case that the packet originated from us, on an IPMP group,
1938 * the ira_l2src may would be the link-layer address of the
1939 * cast_ill used to send the packet, which may not be the same
1940 * as the dst_ncec->ncec_lladdr of the address.
1941 */
1942 if (haddr != NULL) {
1943 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1944 goto out;
1945
1946 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1947 goto out; /* from us -- no conflict */
1948
1949 /*
1950 * If we're in an IPMP group, check if this is an echo
1951 * from another ill in the group. Use the double-
1952 * checked locking pattern to avoid grabbing
1953 * ill_g_lock in the non-IPMP case.
1954 */
1955 if (IS_UNDER_IPMP(ill)) {
1956 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1957 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1958 ill->ill_grp, haddr, hlen) != NULL) {
1959 rw_exit(&ipst->ips_ill_g_lock);
1960 goto out;
1961 }
1962 rw_exit(&ipst->ips_ill_g_lock);
1963 }
1964 }
1965
1966 /*
1967 * This appears to be a real conflict. If we're trying to
1968 * configure this NCE (ND_PROBE), then shut it down.
1969 * Otherwise, handle the discovered conflict.
1970 */
1971 if (dst_ncec->ncec_state == ND_PROBE) {
1972 ndp_failure(mp, ira);
1973 } else {
1974 if (ip_nce_conflict(mp, ira, dst_ncec)) {
1975 char hbuf[MAC_STR_LEN];
1976 char sbuf[INET6_ADDRSTRLEN];
1977
1978 cmn_err(CE_WARN,
1979 "node '%s' is using %s on %s",
1980 inet_ntop(AF_INET6, &target, sbuf,
1981 sizeof (sbuf)),
1982 haddr == NULL ? "<none>" :
1983 mac_colon_addr(haddr, hlen, hbuf,
1984 sizeof (hbuf)), ill->ill_name);
1985 /*
1986 * RFC 4862, Section 5.4.4 does not mandate
1987 * any specific behavior when an NA matches
1988 * a non-tentative address assigned to the
1989 * receiver. We make the choice of defending
1990 * our address, based on the assumption that
1991 * the sender has not detected the Duplicate.
1992 *
1993 * ncec_last_time_defended has been adjusted
1994 * in ip_nce_conflict()
1995 */
1996 (void) ndp_announce(dst_ncec);
1997 }
1998 }
1999 } else {
2000 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2001 dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2002
2003 /* B_TRUE indicates this an advertisement */
2004 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2005 }
2006 out:
2007 ncec_refrele(dst_ncec);
2008 }
2009
2010 /*
2011 * Process NDP neighbor solicitation/advertisement messages.
2012 * The checksum has already checked o.k before reaching here.
2013 * Information about the datalink header is contained in ira_l2src, but
2014 * that should be ignored for loopback packets.
2015 */
2016 void
ndp_input(mblk_t * mp,ip_recv_attr_t * ira)2017 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2018 {
2019 ill_t *ill = ira->ira_rill;
2020 icmp6_t *icmp_nd;
2021 ip6_t *ip6h;
2022 int len;
2023 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
2024 ill_t *orig_ill = NULL;
2025
2026 /*
2027 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2028 * and make it be the IPMP upper so avoid being confused by a packet
2029 * addressed to a unicast address on a different ill.
2030 */
2031 if (IS_UNDER_IPMP(ill)) {
2032 orig_ill = ill;
2033 ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2034 if (ill == NULL) {
2035 ill = orig_ill;
2036 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2037 ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2038 mp, ill);
2039 freemsg(mp);
2040 return;
2041 }
2042 ASSERT(ill != orig_ill);
2043 orig_ill = ira->ira_ill;
2044 ira->ira_ill = ill;
2045 mib = ill->ill_icmp6_mib;
2046 }
2047 if (!pullupmsg(mp, -1)) {
2048 ip1dbg(("ndp_input: pullupmsg failed\n"));
2049 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2050 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2051 goto done;
2052 }
2053 ip6h = (ip6_t *)mp->b_rptr;
2054 if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2055 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2056 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2057 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2058 goto done;
2059 }
2060 /*
2061 * NDP does not accept any extension headers between the
2062 * IP header and the ICMP header since e.g. a routing
2063 * header could be dangerous.
2064 * This assumes that any AH or ESP headers are removed
2065 * by ip prior to passing the packet to ndp_input.
2066 */
2067 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2068 ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2069 ip6h->ip6_nxt));
2070 ip_drop_input("Wrong next header", mp, ill);
2071 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2072 goto done;
2073 }
2074 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2075 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2076 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2077 if (icmp_nd->icmp6_code != 0) {
2078 ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2079 ip_drop_input("code non-zero", mp, ill);
2080 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2081 goto done;
2082 }
2083 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2084 /*
2085 * Make sure packet length is large enough for either
2086 * a NS or a NA icmp packet.
2087 */
2088 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2089 ip1dbg(("ndp_input: packet too short\n"));
2090 ip_drop_input("packet too short", mp, ill);
2091 BUMP_MIB(mib, ipv6IfIcmpInErrors);
2092 goto done;
2093 }
2094 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2095 ndp_input_solicit(mp, ira);
2096 } else {
2097 ndp_input_advert(mp, ira);
2098 }
2099 done:
2100 freemsg(mp);
2101 if (orig_ill != NULL) {
2102 ill_refrele(ill);
2103 ira->ira_ill = orig_ill;
2104 }
2105 }
2106
2107 /*
2108 * ndp_xmit is called to form and transmit a ND solicitation or
2109 * advertisement ICMP packet.
2110 *
2111 * If the source address is unspecified and this isn't a probe (used for
2112 * duplicate address detection), an appropriate source address and link layer
2113 * address will be chosen here. The link layer address option is included if
2114 * the source is specified (i.e., all non-probe packets), and omitted (per the
2115 * specification) otherwise.
2116 *
2117 * It returns B_FALSE only if it does a successful put() to the
2118 * corresponding ill's ill_wq otherwise returns B_TRUE.
2119 */
2120 static boolean_t
ndp_xmit(ill_t * ill,uint32_t operation,uint8_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * sender,const in6_addr_t * target,int flag)2121 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
2122 const in6_addr_t *sender, const in6_addr_t *target, int flag)
2123 {
2124 uint32_t len;
2125 icmp6_t *icmp6;
2126 mblk_t *mp;
2127 ip6_t *ip6h;
2128 nd_opt_hdr_t *opt;
2129 uint_t plen;
2130 zoneid_t zoneid = GLOBAL_ZONEID;
2131 ill_t *hwaddr_ill = ill;
2132 ip_xmit_attr_t ixas;
2133 ip_stack_t *ipst = ill->ill_ipst;
2134 boolean_t need_refrele = B_FALSE;
2135 boolean_t probe = B_FALSE;
2136
2137 if (IS_UNDER_IPMP(ill)) {
2138 probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
2139 /*
2140 * We send non-probe packets on the upper IPMP interface.
2141 * ip_output_simple() will use cast_ill for sending any
2142 * multicast packets. Note that we can't follow the same
2143 * logic for probe packets because all interfaces in the ipmp
2144 * group may have failed, so that we really want to only try
2145 * to send the ND packet on the ill corresponding to the src
2146 * address.
2147 */
2148 if (!probe) {
2149 ill = ipmp_ill_hold_ipmp_ill(ill);
2150 if (ill != NULL)
2151 need_refrele = B_TRUE;
2152 else
2153 ill = hwaddr_ill;
2154 }
2155 }
2156
2157 /*
2158 * If we have a unspecified source(sender) address, select a
2159 * proper source address for the solicitation here itself so
2160 * that we can initialize the h/w address correctly.
2161 *
2162 * If the sender is specified then we use this address in order
2163 * to lookup the zoneid before calling ip_output_v6(). This is to
2164 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
2165 * by IP (we cannot guarantee that the global zone has an interface
2166 * route to the destination).
2167 *
2168 * Note that the NA never comes here with the unspecified source
2169 * address.
2170 */
2171
2172 /*
2173 * Probes will have unspec src at this point.
2174 */
2175 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
2176 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
2177 /*
2178 * It's possible for ipif_lookup_addr_zoneid_v6() to return
2179 * ALL_ZONES if it cannot find a matching ipif for the address
2180 * we are trying to use. In this case we err on the side of
2181 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
2182 */
2183 if (zoneid == ALL_ZONES)
2184 zoneid = GLOBAL_ZONEID;
2185 }
2186
2187 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
2188 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
2189 mp = allocb(len, BPRI_LO);
2190 if (mp == NULL) {
2191 if (need_refrele)
2192 ill_refrele(ill);
2193 return (B_TRUE);
2194 }
2195
2196 bzero((char *)mp->b_rptr, len);
2197 mp->b_wptr = mp->b_rptr + len;
2198
2199 bzero(&ixas, sizeof (ixas));
2200 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;
2201
2202 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
2203 ixas.ixa_ipst = ipst;
2204 ixas.ixa_cred = kcred;
2205 ixas.ixa_cpid = NOPID;
2206 ixas.ixa_tsl = NULL;
2207 ixas.ixa_zoneid = zoneid;
2208
2209 ip6h = (ip6_t *)mp->b_rptr;
2210 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2211 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2212 ip6h->ip6_nxt = IPPROTO_ICMPV6;
2213 ip6h->ip6_hops = IPV6_MAX_HOPS;
2214 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
2215 ip6h->ip6_dst = *target;
2216 icmp6 = (icmp6_t *)&ip6h[1];
2217
2218 if (hw_addr_len != 0) {
2219 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
2220 sizeof (nd_neighbor_advert_t));
2221 } else {
2222 opt = NULL;
2223 }
2224 if (operation == ND_NEIGHBOR_SOLICIT) {
2225 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
2226
2227 if (opt != NULL && !(flag & NDP_PROBE)) {
2228 /*
2229 * Note that we don't send out SLLA for ND probes
2230 * per RFC 4862, even though we do send out the src
2231 * haddr for IPv4 DAD probes, even though both IPv4
2232 * and IPv6 go out with the unspecified/INADDR_ANY
2233 * src IP addr.
2234 */
2235 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
2236 }
2237 ip6h->ip6_src = *sender;
2238 ns->nd_ns_target = *target;
2239 if (!(flag & NDP_UNICAST)) {
2240 /* Form multicast address of the target */
2241 ip6h->ip6_dst = ipv6_solicited_node_mcast;
2242 ip6h->ip6_dst.s6_addr32[3] |=
2243 ns->nd_ns_target.s6_addr32[3];
2244 }
2245 } else {
2246 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
2247
2248 ASSERT(!(flag & NDP_PROBE));
2249 if (opt != NULL)
2250 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
2251 ip6h->ip6_src = *sender;
2252 na->nd_na_target = *sender;
2253 if (flag & NDP_ISROUTER)
2254 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
2255 if (flag & NDP_SOLICITED)
2256 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
2257 if (flag & NDP_ORIDE)
2258 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
2259 }
2260
2261 if (!(flag & NDP_PROBE)) {
2262 if (hw_addr != NULL && opt != NULL) {
2263 /* Fill in link layer address and option len */
2264 opt->nd_opt_len = (uint8_t)plen;
2265 bcopy(hw_addr, &opt[1], hw_addr_len);
2266 }
2267 }
2268 if (opt != NULL && opt->nd_opt_type == 0) {
2269 /* If there's no link layer address option, then strip it. */
2270 len -= plen * 8;
2271 mp->b_wptr = mp->b_rptr + len;
2272 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
2273 }
2274
2275 icmp6->icmp6_type = (uint8_t)operation;
2276 icmp6->icmp6_code = 0;
2277 /*
2278 * Prepare for checksum by putting icmp length in the icmp
2279 * checksum field. The checksum is calculated in ip_output.c.
2280 */
2281 icmp6->icmp6_cksum = ip6h->ip6_plen;
2282
2283 (void) ip_output_simple(mp, &ixas);
2284 ixa_cleanup(&ixas);
2285 if (need_refrele)
2286 ill_refrele(ill);
2287 return (B_FALSE);
2288 }
2289
2290 /*
2291 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
2292 * The datapath uses this as an indication that there
2293 * is a problem (as opposed to a NCE that was just
2294 * reclaimed due to lack of memory.
2295 * Note that static ARP entries never become unreachable.
2296 */
2297 void
nce_make_unreachable(ncec_t * ncec)2298 nce_make_unreachable(ncec_t *ncec)
2299 {
2300 mutex_enter(&ncec->ncec_lock);
2301 ncec->ncec_state = ND_UNREACHABLE;
2302 mutex_exit(&ncec->ncec_lock);
2303 }
2304
2305 /*
2306 * NCE retransmit timer. Common to IPv4 and IPv6.
2307 * This timer goes off when:
2308 * a. It is time to retransmit a resolution for resolver.
2309 * b. It is time to send reachability probes.
2310 */
2311 void
nce_timer(void * arg)2312 nce_timer(void *arg)
2313 {
2314 ncec_t *ncec = arg;
2315 ill_t *ill = ncec->ncec_ill, *src_ill;
2316 char addrbuf[INET6_ADDRSTRLEN];
2317 boolean_t dropped = B_FALSE;
2318 ip_stack_t *ipst = ncec->ncec_ipst;
2319 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2320 in_addr_t sender4 = INADDR_ANY;
2321 in6_addr_t sender6 = ipv6_all_zeros;
2322
2323 /*
2324 * The timer has to be cancelled by ncec_delete before doing the final
2325 * refrele. So the NCE is guaranteed to exist when the timer runs
2326 * until it clears the timeout_id. Before clearing the timeout_id
2327 * bump up the refcnt so that we can continue to use the ncec
2328 */
2329 ASSERT(ncec != NULL);
2330 mutex_enter(&ncec->ncec_lock);
2331 ncec_refhold_locked(ncec);
2332 ncec->ncec_timeout_id = 0;
2333 mutex_exit(&ncec->ncec_lock);
2334
2335 src_ill = nce_resolve_src(ncec, &sender6);
2336 /* if we could not find a sender address, return */
2337 if (src_ill == NULL) {
2338 if (!isv6) {
2339 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
2340 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
2341 &sender4, addrbuf, sizeof (addrbuf))));
2342 } else {
2343 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
2344 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2345 }
2346 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2347 ncec_refrele(ncec);
2348 return;
2349 }
2350 if (!isv6)
2351 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
2352
2353 mutex_enter(&ncec->ncec_lock);
2354 /*
2355 * Check the reachability state.
2356 */
2357 switch (ncec->ncec_state) {
2358 case ND_DELAY:
2359 ASSERT(ncec->ncec_lladdr != NULL);
2360 ncec->ncec_state = ND_PROBE;
2361 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2362 if (isv6) {
2363 mutex_exit(&ncec->ncec_lock);
2364 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
2365 src_ill->ill_phys_addr,
2366 src_ill->ill_phys_addr_length,
2367 &sender6, &ncec->ncec_addr,
2368 NDP_UNICAST);
2369 } else {
2370 dropped = (arp_request(ncec, sender4, src_ill) == 0);
2371 mutex_exit(&ncec->ncec_lock);
2372 }
2373 if (!dropped) {
2374 mutex_enter(&ncec->ncec_lock);
2375 ncec->ncec_pcnt--;
2376 mutex_exit(&ncec->ncec_lock);
2377 }
2378 if (ip_debug > 3) {
2379 /* ip2dbg */
2380 pr_addr_dbg("nce_timer: state for %s changed "
2381 "to PROBE\n", AF_INET6, &ncec->ncec_addr);
2382 }
2383 nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
2384 break;
2385 case ND_PROBE:
2386 /* must be retransmit timer */
2387 ASSERT(ncec->ncec_pcnt >= -1);
2388 if (ncec->ncec_pcnt > 0) {
2389 /*
2390 * As per RFC2461, the ncec gets deleted after
2391 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
2392 * Note that the first unicast solicitation is sent
2393 * during the DELAY state.
2394 */
2395 ip2dbg(("nce_timer: pcount=%x dst %s\n",
2396 ncec->ncec_pcnt,
2397 inet_ntop((isv6? AF_INET6 : AF_INET),
2398 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2399 if (NCE_PUBLISH(ncec)) {
2400 mutex_exit(&ncec->ncec_lock);
2401 /*
2402 * send out a probe; note that src_ill
2403 * is ignored by nce_dad() for all
2404 * DAD message types other than IPv6
2405 * unicast probes
2406 */
2407 nce_dad(ncec, src_ill, B_TRUE);
2408 } else {
2409 ASSERT(src_ill != NULL);
2410 if (isv6) {
2411 mutex_exit(&ncec->ncec_lock);
2412 dropped = ndp_xmit(src_ill,
2413 ND_NEIGHBOR_SOLICIT,
2414 src_ill->ill_phys_addr,
2415 src_ill->ill_phys_addr_length,
2416 &sender6, &ncec->ncec_addr,
2417 NDP_UNICAST);
2418 } else {
2419 /*
2420 * since the nce is REACHABLE,
2421 * the ARP request will be sent out
2422 * as a link-layer unicast.
2423 */
2424 dropped = (arp_request(ncec, sender4,
2425 src_ill) == 0);
2426 mutex_exit(&ncec->ncec_lock);
2427 }
2428 if (!dropped) {
2429 mutex_enter(&ncec->ncec_lock);
2430 ncec->ncec_pcnt--;
2431 mutex_exit(&ncec->ncec_lock);
2432 }
2433 nce_restart_timer(ncec,
2434 ill->ill_reachable_retrans_time);
2435 }
2436 } else if (ncec->ncec_pcnt < 0) {
2437 /* No hope, delete the ncec */
2438 /* Tell datapath it went bad */
2439 ncec->ncec_state = ND_UNREACHABLE;
2440 mutex_exit(&ncec->ncec_lock);
2441 if (ip_debug > 2) {
2442 /* ip1dbg */
2443 pr_addr_dbg("nce_timer: Delete NCE for"
2444 " dst %s\n", (isv6? AF_INET6: AF_INET),
2445 &ncec->ncec_addr);
2446 }
2447 /* if static ARP can't delete. */
2448 if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
2449 ncec_delete(ncec);
2450
2451 } else if (!NCE_PUBLISH(ncec)) {
2452 /*
2453 * Probe count is 0 for a dynamic entry (one that we
2454 * ourselves are not publishing). We should never get
2455 * here if NONUD was requested, hence the ASSERT below.
2456 */
2457 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
2458 ip2dbg(("nce_timer: pcount=%x dst %s\n",
2459 ncec->ncec_pcnt, inet_ntop(AF_INET6,
2460 &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
2461 ncec->ncec_pcnt--;
2462 mutex_exit(&ncec->ncec_lock);
2463 /* Wait one interval before killing */
2464 nce_restart_timer(ncec,
2465 ill->ill_reachable_retrans_time);
2466 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
2467 ipif_t *ipif;
2468 ipaddr_t ncec_addr;
2469
2470 /*
2471 * We're done probing, and we can now declare this
2472 * address to be usable. Let IP know that it's ok to
2473 * use.
2474 */
2475 ncec->ncec_state = ND_REACHABLE;
2476 ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
2477 mutex_exit(&ncec->ncec_lock);
2478 if (isv6) {
2479 ipif = ipif_lookup_addr_exact_v6(
2480 &ncec->ncec_addr, ill, ipst);
2481 } else {
2482 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
2483 ncec_addr);
2484 ipif = ipif_lookup_addr_exact(ncec_addr, ill,
2485 ipst);
2486 }
2487 if (ipif != NULL) {
2488 if (ipif->ipif_was_dup) {
2489 char ibuf[LIFNAMSIZ];
2490 char sbuf[INET6_ADDRSTRLEN];
2491
2492 ipif->ipif_was_dup = B_FALSE;
2493 (void) inet_ntop(AF_INET6,
2494 &ipif->ipif_v6lcl_addr,
2495 sbuf, sizeof (sbuf));
2496 ipif_get_name(ipif, ibuf,
2497 sizeof (ibuf));
2498 cmn_err(CE_NOTE, "recovered address "
2499 "%s on %s", sbuf, ibuf);
2500 }
2501 if ((ipif->ipif_flags & IPIF_UP) &&
2502 !ipif->ipif_addr_ready)
2503 ipif_up_notify(ipif);
2504 ipif->ipif_addr_ready = 1;
2505 ipif_refrele(ipif);
2506 }
2507 if (!isv6 && arp_no_defense)
2508 break;
2509 /* Begin defending our new address */
2510 if (ncec->ncec_unsolicit_count > 0) {
2511 ncec->ncec_unsolicit_count--;
2512 if (isv6) {
2513 dropped = ndp_announce(ncec);
2514 } else {
2515 dropped = arp_announce(ncec);
2516 }
2517
2518 if (dropped)
2519 ncec->ncec_unsolicit_count++;
2520 else
2521 ncec->ncec_last_time_defended =
2522 ddi_get_lbolt();
2523 }
2524 if (ncec->ncec_unsolicit_count > 0) {
2525 nce_restart_timer(ncec,
2526 ANNOUNCE_INTERVAL(isv6));
2527 } else if (DEFENSE_INTERVAL(isv6) != 0) {
2528 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2529 }
2530 } else {
2531 /*
2532 * This is an address we're probing to be our own, but
2533 * the ill is down. Wait until it comes back before
2534 * doing anything, but switch to reachable state so
2535 * that the restart will work.
2536 */
2537 ncec->ncec_state = ND_REACHABLE;
2538 mutex_exit(&ncec->ncec_lock);
2539 }
2540 break;
2541 case ND_INCOMPLETE: {
2542 mblk_t *mp, *nextmp;
2543 mblk_t **prevmpp;
2544
2545 /*
2546 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
2547 * for any IPMP probe packets, and toss them. IPMP probe
2548 * packets will always be at the head of ncec_qd_mp, so that
2549 * we can stop at the first queued ND packet that is
2550 * not a probe packet.
2551 */
2552 prevmpp = &ncec->ncec_qd_mp;
2553 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
2554 nextmp = mp->b_next;
2555
2556 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
2557 inet_freemsg(mp);
2558 ncec->ncec_nprobes--;
2559 *prevmpp = nextmp;
2560 } else {
2561 prevmpp = &mp->b_next;
2562 }
2563 }
2564
2565 /*
2566 * Must be resolver's retransmit timer.
2567 */
2568 mutex_exit(&ncec->ncec_lock);
2569 ip_ndp_resolve(ncec);
2570 break;
2571 }
2572 case ND_REACHABLE:
2573 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
2574 ncec->ncec_unsolicit_count != 0) ||
2575 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
2576 if (ncec->ncec_unsolicit_count > 0) {
2577 ncec->ncec_unsolicit_count--;
2578 mutex_exit(&ncec->ncec_lock);
2579 /*
2580 * When we get to zero announcements left,
2581 * switch to address defense
2582 */
2583 } else {
2584 boolean_t rate_limit;
2585
2586 mutex_exit(&ncec->ncec_lock);
2587 rate_limit = ill_defend_rate_limit(ill, ncec);
2588 if (rate_limit) {
2589 nce_restart_timer(ncec,
2590 DEFENSE_INTERVAL(isv6));
2591 break;
2592 }
2593 }
2594 if (isv6) {
2595 dropped = ndp_announce(ncec);
2596 } else {
2597 dropped = arp_announce(ncec);
2598 }
2599 mutex_enter(&ncec->ncec_lock);
2600 if (dropped) {
2601 ncec->ncec_unsolicit_count++;
2602 } else {
2603 ncec->ncec_last_time_defended =
2604 ddi_get_lbolt();
2605 }
2606 mutex_exit(&ncec->ncec_lock);
2607 if (ncec->ncec_unsolicit_count != 0) {
2608 nce_restart_timer(ncec,
2609 ANNOUNCE_INTERVAL(isv6));
2610 } else {
2611 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
2612 }
2613 } else {
2614 mutex_exit(&ncec->ncec_lock);
2615 }
2616 break;
2617 default:
2618 mutex_exit(&ncec->ncec_lock);
2619 break;
2620 }
2621 done:
2622 ncec_refrele(ncec);
2623 ill_refrele(src_ill);
2624 }
2625
2626 /*
2627 * Set a link layer address from the ll_addr passed in.
2628 * Copy SAP from ill.
2629 */
2630 static void
nce_set_ll(ncec_t * ncec,uchar_t * ll_addr)2631 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2632 {
2633 ill_t *ill = ncec->ncec_ill;
2634
2635 ASSERT(ll_addr != NULL);
2636 if (ill->ill_phys_addr_length > 0) {
2637 /*
2638 * The bcopy() below used to be called for the physical address
2639 * length rather than the link layer address length. For
2640 * ethernet and many other media, the phys_addr and lla are
2641 * identical.
2642 *
2643 * The phys_addr and lla may not be the same for devices that
2644 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2645 * no known instances of these.
2646 *
2647 * For PPP or other interfaces with a zero length
2648 * physical address, don't do anything here.
2649 * The bcopy() with a zero phys_addr length was previously
2650 * a no-op for interfaces with a zero-length physical address.
2651 * Using the lla for them would change the way they operate.
2652 * Doing nothing in such cases preserves expected behavior.
2653 */
2654 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2655 }
2656 }
2657
2658 boolean_t
nce_cmp_ll_addr(const ncec_t * ncec,const uchar_t * ll_addr,uint32_t ll_addr_len)2659 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2660 uint32_t ll_addr_len)
2661 {
2662 ASSERT(ncec->ncec_lladdr != NULL);
2663 if (ll_addr == NULL)
2664 return (B_FALSE);
2665 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2666 return (B_TRUE);
2667 return (B_FALSE);
2668 }
2669
2670 /*
2671 * Updates the link layer address or the reachability state of
2672 * a cache entry. Reset probe counter if needed.
2673 */
2674 void
nce_update(ncec_t * ncec,uint16_t new_state,uchar_t * new_ll_addr)2675 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2676 {
2677 ill_t *ill = ncec->ncec_ill;
2678 boolean_t need_stop_timer = B_FALSE;
2679 boolean_t need_fastpath_update = B_FALSE;
2680 nce_t *nce = NULL;
2681 timeout_id_t tid;
2682
2683 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2684 /*
2685 * If this interface does not do NUD, there is no point
2686 * in allowing an update to the cache entry. Although
2687 * we will respond to NS.
2688 * The only time we accept an update for a resolver when
2689 * NUD is turned off is when it has just been created.
2690 * Non-Resolvers will always be created as REACHABLE.
2691 */
2692 if (new_state != ND_UNCHANGED) {
2693 if ((ncec->ncec_flags & NCE_F_NONUD) &&
2694 (ncec->ncec_state != ND_INCOMPLETE))
2695 return;
2696 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2697 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2698 need_stop_timer = B_TRUE;
2699 if (new_state == ND_REACHABLE)
2700 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2701 else {
2702 /* We force NUD in this case */
2703 ncec->ncec_last = 0;
2704 }
2705 ncec->ncec_state = new_state;
2706 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2707 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2708 new_state == ND_INCOMPLETE);
2709 }
2710 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2711 tid = ncec->ncec_timeout_id;
2712 ncec->ncec_timeout_id = 0;
2713 }
2714 /*
2715 * Re-trigger fastpath probe and
2716 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2717 * whatever packets that happens to be transmitting at the time.
2718 */
2719 if (new_ll_addr != NULL) {
2720 bcopy(new_ll_addr, ncec->ncec_lladdr,
2721 ill->ill_phys_addr_length);
2722 need_fastpath_update = B_TRUE;
2723 }
2724 mutex_exit(&ncec->ncec_lock);
2725 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2726 if (tid != 0)
2727 (void) untimeout(tid);
2728 }
2729 if (need_fastpath_update) {
2730 /*
2731 * Delete any existing existing dlur_mp and fp_mp information.
2732 * For IPMP interfaces, all underlying ill's must be checked
2733 * and purged.
2734 */
2735 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2736 /*
2737 * add the new dlur_mp and fp_mp
2738 */
2739 nce = nce_fastpath(ncec, B_TRUE, NULL);
2740 if (nce != NULL)
2741 nce_refrele(nce);
2742 }
2743 mutex_enter(&ncec->ncec_lock);
2744 }
2745
2746 static void
nce_queue_mp_common(ncec_t * ncec,mblk_t * mp,boolean_t head_insert)2747 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2748 {
2749 uint_t count = 0;
2750 mblk_t **mpp, *tmp;
2751
2752 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2753
2754 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2755 if (++count > ncec->ncec_ill->ill_max_buf) {
2756 tmp = ncec->ncec_qd_mp->b_next;
2757 ncec->ncec_qd_mp->b_next = NULL;
2758 /*
2759 * if we never create data addrs on the under_ill
2760 * does this matter?
2761 */
2762 BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2763 ipIfStatsOutDiscards);
2764 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2765 ncec->ncec_ill);
2766 freemsg(ncec->ncec_qd_mp);
2767 ncec->ncec_qd_mp = tmp;
2768 }
2769 }
2770
2771 if (head_insert) {
2772 ncec->ncec_nprobes++;
2773 mp->b_next = ncec->ncec_qd_mp;
2774 ncec->ncec_qd_mp = mp;
2775 } else {
2776 *mpp = mp;
2777 }
2778 }
2779
/*
 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
 * queued at the head or tail of the queue based on the input argument
 * 'head_insert'. The caller should specify this argument as B_TRUE if this
 * packet is an IPMP probe packet, in which case the following happens:
 *
 *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
 *	(non-ipmp_probe) load-spreading case where the source address of the
 *	ND packet is not tied to ncec_ill. If the ill bound to the source
 *	address cannot receive, the response to the ND packet will not be
 *	received.  However, if ND packets for ncec_ill's probes are queued
 *	behind that ND packet, those probes will also fail to be sent, and
 *	thus in.mpathd will erroneously conclude that ncec_ill has also
 *	failed.
 *
 *   2. Drop the ipmp_probe packet in nce_timer() if the ND did not succeed
 *	on the first attempt.  This ensures that ND problems do not manifest
 *	as probe RTT spikes.
 *
 * We achieve this by inserting ipmp_probe() packets at the head of the
 * nce_queue.
 *
 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
 *
 * The caller must hold ncec_lock.
 */
void
nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
	/* The queue is protected by ncec_lock, which the caller owns. */
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	nce_queue_mp_common(ncec, mp, head_insert);
}
2810
2811 /*
2812 * Called when address resolution failed due to a timeout.
2813 * Send an ICMP unreachable in response to all queued packets.
2814 */
2815 void
ndp_resolv_failed(ncec_t * ncec)2816 ndp_resolv_failed(ncec_t *ncec)
2817 {
2818 mblk_t *mp, *nxt_mp;
2819 char buf[INET6_ADDRSTRLEN];
2820 ill_t *ill = ncec->ncec_ill;
2821 ip_recv_attr_t iras;
2822
2823 bzero(&iras, sizeof (iras));
2824 iras.ira_flags = 0;
2825 /*
2826 * we are setting the ira_rill to the ipmp_ill (instead of
2827 * the actual ill on which the packet was received), but this
2828 * is ok because we don't actually need the real ira_rill.
2829 * to send the icmp unreachable to the sender.
2830 */
2831 iras.ira_ill = iras.ira_rill = ill;
2832 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2833 iras.ira_rifindex = iras.ira_ruifindex;
2834
2835 ip1dbg(("ndp_resolv_failed: dst %s\n",
2836 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2837 mutex_enter(&ncec->ncec_lock);
2838 mp = ncec->ncec_qd_mp;
2839 ncec->ncec_qd_mp = NULL;
2840 ncec->ncec_nprobes = 0;
2841 mutex_exit(&ncec->ncec_lock);
2842 while (mp != NULL) {
2843 nxt_mp = mp->b_next;
2844 mp->b_next = NULL;
2845
2846 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2847 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2848 mp, ill);
2849 icmp_unreachable_v6(mp,
2850 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2851 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2852 mp = nxt_mp;
2853 }
2854 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2855 }
2856
2857 /*
2858 * Handle the completion of NDP and ARP resolution.
2859 */
2860 void
nce_resolv_ok(ncec_t * ncec)2861 nce_resolv_ok(ncec_t *ncec)
2862 {
2863 mblk_t *mp;
2864 uint_t pkt_len;
2865 iaflags_t ixaflags = IXAF_NO_TRACE;
2866 nce_t *nce;
2867 ill_t *ill = ncec->ncec_ill;
2868 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2869 ip_stack_t *ipst = ill->ill_ipst;
2870
2871 if (IS_IPMP(ncec->ncec_ill)) {
2872 nce_resolv_ipmp_ok(ncec);
2873 return;
2874 }
2875 /* non IPMP case */
2876
2877 mutex_enter(&ncec->ncec_lock);
2878 ASSERT(ncec->ncec_nprobes == 0);
2879 mp = ncec->ncec_qd_mp;
2880 ncec->ncec_qd_mp = NULL;
2881 mutex_exit(&ncec->ncec_lock);
2882
2883 while (mp != NULL) {
2884 mblk_t *nxt_mp;
2885
2886 if (ill->ill_isv6) {
2887 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2888
2889 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2890 } else {
2891 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2892
2893 ixaflags |= IXAF_IS_IPV4;
2894 pkt_len = ntohs(ipha->ipha_length);
2895 }
2896 nxt_mp = mp->b_next;
2897 mp->b_next = NULL;
2898 /*
2899 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2900 * longer available, but it's ok to drop this flag because TCP
2901 * has its own flow-control in effect, so TCP packets
2902 * are not likely to get here when flow-control is in effect.
2903 */
2904 mutex_enter(&ill->ill_lock);
2905 nce = nce_lookup(ill, &ncec->ncec_addr);
2906 mutex_exit(&ill->ill_lock);
2907
2908 if (nce == NULL) {
2909 if (isv6) {
2910 BUMP_MIB(&ipst->ips_ip6_mib,
2911 ipIfStatsOutDiscards);
2912 } else {
2913 BUMP_MIB(&ipst->ips_ip_mib,
2914 ipIfStatsOutDiscards);
2915 }
2916 ip_drop_output("ipIfStatsOutDiscards - no nce",
2917 mp, NULL);
2918 freemsg(mp);
2919 } else {
2920 /*
2921 * We don't know the zoneid, but
2922 * ip_xmit does not care since IXAF_NO_TRACE
2923 * is set. (We traced the packet the first
2924 * time through ip_xmit.)
2925 */
2926 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2927 ALL_ZONES, 0, NULL);
2928 nce_refrele(nce);
2929 }
2930 mp = nxt_mp;
2931 }
2932
2933 ncec_cb_dispatch(ncec); /* complete callbacks */
2934 }
2935
2936 /*
2937 * Called by SIOCSNDP* ioctl to add/change an ncec entry
2938 * and the corresponding attributes.
2939 * Disallow states other than ND_REACHABLE or ND_STALE.
2940 */
2941 int
ndp_sioc_update(ill_t * ill,lif_nd_req_t * lnr)2942 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2943 {
2944 sin6_t *sin6;
2945 in6_addr_t *addr;
2946 ncec_t *ncec;
2947 nce_t *nce;
2948 int err = 0;
2949 uint16_t new_flags = 0;
2950 uint16_t old_flags = 0;
2951 int inflags = lnr->lnr_flags;
2952 ip_stack_t *ipst = ill->ill_ipst;
2953 boolean_t do_postprocess = B_FALSE;
2954
2955 ASSERT(ill->ill_isv6);
2956 if ((lnr->lnr_state_create != ND_REACHABLE) &&
2957 (lnr->lnr_state_create != ND_STALE))
2958 return (EINVAL);
2959
2960 sin6 = (sin6_t *)&lnr->lnr_addr;
2961 addr = &sin6->sin6_addr;
2962
2963 mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2964 ASSERT(!IS_UNDER_IPMP(ill));
2965 nce = nce_lookup_addr(ill, addr);
2966 if (nce != NULL)
2967 new_flags = nce->nce_common->ncec_flags;
2968
2969 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2970 case NDF_ISROUTER_ON:
2971 new_flags |= NCE_F_ISROUTER;
2972 break;
2973 case NDF_ISROUTER_OFF:
2974 new_flags &= ~NCE_F_ISROUTER;
2975 break;
2976 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2977 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2978 if (nce != NULL)
2979 nce_refrele(nce);
2980 return (EINVAL);
2981 }
2982 if (inflags & NDF_STATIC)
2983 new_flags |= NCE_F_STATIC;
2984
2985 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2986 case NDF_ANYCAST_ON:
2987 new_flags |= NCE_F_ANYCAST;
2988 break;
2989 case NDF_ANYCAST_OFF:
2990 new_flags &= ~NCE_F_ANYCAST;
2991 break;
2992 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2993 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2994 if (nce != NULL)
2995 nce_refrele(nce);
2996 return (EINVAL);
2997 }
2998
2999 if (nce == NULL) {
3000 err = nce_add_v6(ill,
3001 (uchar_t *)lnr->lnr_hdw_addr,
3002 ill->ill_phys_addr_length,
3003 addr,
3004 new_flags,
3005 lnr->lnr_state_create,
3006 &nce);
3007 if (err != 0) {
3008 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3009 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3010 return (err);
3011 } else {
3012 do_postprocess = B_TRUE;
3013 }
3014 }
3015 ncec = nce->nce_common;
3016 old_flags = ncec->ncec_flags;
3017 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3018 ncec_router_to_host(ncec);
3019 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3020 if (do_postprocess)
3021 err = nce_add_v6_postprocess(nce);
3022 nce_refrele(nce);
3023 return (0);
3024 }
3025 mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3026
3027 if (do_postprocess)
3028 err = nce_add_v6_postprocess(nce);
3029 /*
3030 * err cannot be anything other than 0 because we don't support
3031 * proxy arp of static addresses.
3032 */
3033 ASSERT(err == 0);
3034
3035 mutex_enter(&ncec->ncec_lock);
3036 ncec->ncec_flags = new_flags;
3037 mutex_exit(&ncec->ncec_lock);
3038 /*
3039 * Note that we ignore the state at this point, which
3040 * should be either STALE or REACHABLE. Instead we let
3041 * the link layer address passed in to determine the state
3042 * much like incoming packets.
3043 */
3044 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3045 nce_refrele(nce);
3046 return (0);
3047 }
3048
3049 /*
3050 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3051 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3052 * be held to ensure that they are in the same group.
3053 */
3054 static nce_t *
nce_fastpath_create(ill_t * ill,ncec_t * ncec)3055 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3056 {
3057
3058 nce_t *nce;
3059
3060 nce = nce_ill_lookup_then_add(ill, ncec);
3061
3062 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3063 return (nce);
3064
3065 /*
3066 * hold the ncec_lock to synchronize with nce_update() so that,
3067 * at the end of this function, the contents of nce_dlur_mp are
3068 * consistent with ncec->ncec_lladdr, even though some intermediate
3069 * packet may have been sent out with a mangled address, which would
3070 * only be a transient condition.
3071 */
3072 mutex_enter(&ncec->ncec_lock);
3073 if (ncec->ncec_lladdr != NULL) {
3074 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3075 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3076 } else {
3077 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3078 ill->ill_sap_length);
3079 }
3080 mutex_exit(&ncec->ncec_lock);
3081 return (nce);
3082 }
3083
3084 /*
3085 * we make nce_fp_mp to have an M_DATA prepend.
3086 * The caller ensures there is hold on ncec for this function.
3087 * Note that since ill_fastpath_probe() copies the mblk there is
3088 * no need to hold the nce or ncec beyond this function.
3089 *
3090 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3091 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3092 * and will be returned back by this function, so that no extra nce_refrele
3093 * is required for the caller. The calls from nce_add_common() use this
3094 * method. All other callers (that pass in NULL ncec_nce) will have to do a
3095 * nce_refrele of the returned nce (when it is non-null).
3096 */
3097 nce_t *
nce_fastpath(ncec_t * ncec,boolean_t trigger_fp_req,nce_t * ncec_nce)3098 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3099 {
3100 nce_t *nce;
3101 ill_t *ill = ncec->ncec_ill;
3102
3103 ASSERT(ill != NULL);
3104
3105 if (IS_IPMP(ill) && trigger_fp_req) {
3106 trigger_fp_req = B_FALSE;
3107 ipmp_ncec_refresh_nce(ncec);
3108 }
3109
3110 /*
3111 * If the caller already has the nce corresponding to the ill, use
3112 * that one. Otherwise we have to lookup/add the nce. Calls from
3113 * nce_add_common() fall in the former category, and have just done
3114 * the nce lookup/add that can be reused.
3115 */
3116 if (ncec_nce == NULL)
3117 nce = nce_fastpath_create(ill, ncec);
3118 else
3119 nce = ncec_nce;
3120
3121 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3122 return (nce);
3123
3124 if (trigger_fp_req)
3125 nce_fastpath_trigger(nce);
3126 return (nce);
3127 }
3128
3129 /*
3130 * Trigger fastpath on nce. No locks may be held.
3131 */
3132 static void
nce_fastpath_trigger(nce_t * nce)3133 nce_fastpath_trigger(nce_t *nce)
3134 {
3135 int res;
3136 ill_t *ill = nce->nce_ill;
3137 ncec_t *ncec = nce->nce_common;
3138
3139 res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3140 /*
3141 * EAGAIN is an indication of a transient error
3142 * i.e. allocation failure etc. leave the ncec in the list it
3143 * will be updated when another probe happens for another ire
3144 * if not it will be taken out of the list when the ire is
3145 * deleted.
3146 */
3147 if (res != 0 && res != EAGAIN && res != ENOTSUP)
3148 nce_fastpath_list_delete(ill, ncec, NULL);
3149 }
3150
3151 /*
3152 * Add ncec to the nce fastpath list on ill.
3153 */
3154 static nce_t *
nce_ill_lookup_then_add_locked(ill_t * ill,ncec_t * ncec)3155 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3156 {
3157 nce_t *nce = NULL;
3158
3159 ASSERT(MUTEX_HELD(&ill->ill_lock));
3160 /*
3161 * Atomically ensure that the ill is not CONDEMNED and is not going
3162 * down, before adding the NCE.
3163 */
3164 if (ill->ill_state_flags & ILL_CONDEMNED)
3165 return (NULL);
3166 mutex_enter(&ncec->ncec_lock);
3167 /*
3168 * if ncec has not been deleted and
3169 * is not already in the list add it.
3170 */
3171 if (!NCE_ISCONDEMNED(ncec)) {
3172 nce = nce_lookup(ill, &ncec->ncec_addr);
3173 if (nce != NULL)
3174 goto done;
3175 nce = nce_add(ill, ncec);
3176 }
3177 done:
3178 mutex_exit(&ncec->ncec_lock);
3179 return (nce);
3180 }
3181
3182 nce_t *
nce_ill_lookup_then_add(ill_t * ill,ncec_t * ncec)3183 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3184 {
3185 nce_t *nce;
3186
3187 mutex_enter(&ill->ill_lock);
3188 nce = nce_ill_lookup_then_add_locked(ill, ncec);
3189 mutex_exit(&ill->ill_lock);
3190 return (nce);
3191 }
3192
3193
3194 /*
3195 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3196 * nce is added to the 'dead' list, and the caller must nce_refrele() the
3197 * entry after all locks have been dropped.
3198 */
3199 void
nce_fastpath_list_delete(ill_t * ill,ncec_t * ncec,list_t * dead)3200 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3201 {
3202 nce_t *nce;
3203
3204 ASSERT(ill != NULL);
3205
3206 /* delete any nces referencing the ncec from underlying ills */
3207 if (IS_IPMP(ill))
3208 ipmp_ncec_delete_nce(ncec);
3209
3210 /* now the ill itself */
3211 mutex_enter(&ill->ill_lock);
3212 for (nce = list_head(&ill->ill_nce); nce != NULL;
3213 nce = list_next(&ill->ill_nce, nce)) {
3214 if (nce->nce_common == ncec) {
3215 nce_refhold(nce);
3216 nce_delete(nce);
3217 break;
3218 }
3219 }
3220 mutex_exit(&ill->ill_lock);
3221 if (nce != NULL) {
3222 if (dead == NULL)
3223 nce_refrele(nce);
3224 else
3225 list_insert_tail(dead, nce);
3226 }
3227 }
3228
3229 /*
3230 * when the fastpath response does not fit in the datab
3231 * associated with the existing nce_fp_mp, we delete and
3232 * add the nce to retrigger fastpath based on the information
3233 * in the ncec_t.
3234 */
3235 static nce_t *
nce_delete_then_add(nce_t * nce)3236 nce_delete_then_add(nce_t *nce)
3237 {
3238 ill_t *ill = nce->nce_ill;
3239 nce_t *newnce = NULL;
3240
3241 ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3242 (void *)nce, ill->ill_name));
3243 mutex_enter(&ill->ill_lock);
3244 mutex_enter(&nce->nce_common->ncec_lock);
3245 nce_delete(nce);
3246 /*
3247 * Make sure that ncec is not condemned before adding. We hold the
3248 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3249 * ipmp_ncec_delete_nce()
3250 */
3251 if (!NCE_ISCONDEMNED(nce->nce_common))
3252 newnce = nce_add(ill, nce->nce_common);
3253 mutex_exit(&nce->nce_common->ncec_lock);
3254 mutex_exit(&ill->ill_lock);
3255 nce_refrele(nce);
3256 return (newnce); /* could be null if nomem */
3257 }
3258
/*
 * Argument block passed through nce_walk() to nce_fastpath_match_dlur()
 * by nce_fastpath_update().
 */
typedef struct nce_fp_match_s {
	/* out: nce whose nce_dlur_mp matched the ack; NULL if none did */
	nce_t	*nce_fp_match_res;
	/* in: the fastpath ack mblk each nce_dlur_mp is compared with */
	mblk_t	*nce_fp_match_ack_mp;
} nce_fp_match_t;
3263
3264 /* ARGSUSED */
3265 static int
nce_fastpath_match_dlur(ill_t * ill,nce_t * nce,void * arg)3266 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3267 {
3268 nce_fp_match_t *nce_fp_marg = arg;
3269 ncec_t *ncec = nce->nce_common;
3270 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
3271 uchar_t *mp_rptr, *ud_mp_rptr;
3272 mblk_t *ud_mp = nce->nce_dlur_mp;
3273 ptrdiff_t cmplen;
3274
3275 /*
3276 * mp is the mp associated with the fastpath ack.
3277 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3278 * under consideration. If the contents match, then the
3279 * fastpath ack is used to update the nce.
3280 */
3281 if (ud_mp == NULL)
3282 return (0);
3283 mp_rptr = mp->b_rptr;
3284 cmplen = mp->b_wptr - mp_rptr;
3285 ASSERT(cmplen >= 0);
3286
3287 ud_mp_rptr = ud_mp->b_rptr;
3288 /*
3289 * The ncec is locked here to prevent any other threads from accessing
3290 * and changing nce_dlur_mp when the address becomes resolved to an
3291 * lla while we're in the middle of looking at and comparing the
3292 * hardware address (lla). It is also locked to prevent multiple
3293 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3294 * time.
3295 */
3296 mutex_enter(&ncec->ncec_lock);
3297 if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3298 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3299 nce_fp_marg->nce_fp_match_res = nce;
3300 mutex_exit(&ncec->ncec_lock);
3301 nce_refhold(nce);
3302 return (1);
3303 }
3304 mutex_exit(&ncec->ncec_lock);
3305 return (0);
3306 }
3307
3308 /*
3309 * Update all NCE's that are not in fastpath mode and
3310 * have an nce_fp_mp that matches mp. mp->b_cont contains
3311 * the fastpath header.
3312 *
3313 * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3314 */
3315 void
nce_fastpath_update(ill_t * ill,mblk_t * mp)3316 nce_fastpath_update(ill_t *ill, mblk_t *mp)
3317 {
3318 nce_fp_match_t nce_fp_marg;
3319 nce_t *nce;
3320 mblk_t *nce_fp_mp, *fp_mp;
3321
3322 nce_fp_marg.nce_fp_match_res = NULL;
3323 nce_fp_marg.nce_fp_match_ack_mp = mp;
3324
3325 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3326
3327 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3328 return;
3329
3330 mutex_enter(&nce->nce_lock);
3331 nce_fp_mp = nce->nce_fp_mp;
3332
3333 if (nce_fp_mp != NULL) {
3334 fp_mp = mp->b_cont;
3335 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3336 nce_fp_mp->b_datap->db_lim) {
3337 mutex_exit(&nce->nce_lock);
3338 nce = nce_delete_then_add(nce);
3339 if (nce == NULL) {
3340 return;
3341 }
3342 mutex_enter(&nce->nce_lock);
3343 nce_fp_mp = nce->nce_fp_mp;
3344 }
3345 }
3346
3347 /* Matched - install mp as the fastpath mp */
3348 if (nce_fp_mp == NULL) {
3349 fp_mp = dupb(mp->b_cont);
3350 nce->nce_fp_mp = fp_mp;
3351 } else {
3352 fp_mp = mp->b_cont;
3353 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3354 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3355 + MBLKL(fp_mp);
3356 }
3357 mutex_exit(&nce->nce_lock);
3358 nce_refrele(nce);
3359 }
3360
3361 /*
3362 * Return a pointer to a given option in the packet.
3363 * Assumes that option part of the packet have already been validated.
3364 */
3365 nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t * opt,int optlen,int opt_type)3366 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3367 {
3368 while (optlen > 0) {
3369 if (opt->nd_opt_type == opt_type)
3370 return (opt);
3371 optlen -= 8 * opt->nd_opt_len;
3372 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3373 }
3374 return (NULL);
3375 }
3376
3377 /*
3378 * Verify all option lengths present are > 0, also check to see
3379 * if the option lengths and packet length are consistent.
3380 */
3381 boolean_t
ndp_verify_optlen(nd_opt_hdr_t * opt,int optlen)3382 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3383 {
3384 ASSERT(opt != NULL);
3385 while (optlen > 0) {
3386 if (opt->nd_opt_len == 0)
3387 return (B_FALSE);
3388 optlen -= 8 * opt->nd_opt_len;
3389 if (optlen < 0)
3390 return (B_FALSE);
3391 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3392 }
3393 return (B_TRUE);
3394 }
3395
3396 /*
3397 * ncec_walk function.
3398 * Free a fraction of the NCE cache entries.
3399 *
3400 * A possible optimization here would be to use ncec_last where possible, and
3401 * delete the least-frequently used entry, which would require more complex
3402 * computation as we walk through the ncec's (e.g., track ncec entries by
3403 * order of ncec_last and/or maintain state)
3404 */
3405 static void
ncec_cache_reclaim(ncec_t * ncec,char * arg)3406 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3407 {
3408 ip_stack_t *ipst = ncec->ncec_ipst;
3409 uint_t fraction = *(uint_t *)arg;
3410 uint_t rand;
3411
3412 if ((ncec->ncec_flags &
3413 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3414 return;
3415 }
3416
3417 rand = (uint_t)ddi_get_lbolt() +
3418 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3419 if ((rand/fraction)*fraction == rand) {
3420 IP_STAT(ipst, ip_nce_reclaim_deleted);
3421 ncec_delete(ncec);
3422 }
3423 }
3424
3425 /*
3426 * kmem_cache callback to free up memory.
3427 *
3428 * For now we just delete a fixed fraction.
3429 */
3430 static void
ip_nce_reclaim_stack(ip_stack_t * ipst)3431 ip_nce_reclaim_stack(ip_stack_t *ipst)
3432 {
3433 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction;
3434
3435 IP_STAT(ipst, ip_nce_reclaim_calls);
3436
3437 ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3438
3439 /*
3440 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3441 * Get them to update any stale references to drop any refholds they
3442 * have.
3443 */
3444 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3445 }
3446
3447 /*
3448 * Called by the memory allocator subsystem directly, when the system
3449 * is running low on memory.
3450 */
3451 /* ARGSUSED */
3452 void
ip_nce_reclaim(void * args)3453 ip_nce_reclaim(void *args)
3454 {
3455 netstack_handle_t nh;
3456 netstack_t *ns;
3457 ip_stack_t *ipst;
3458
3459 netstack_next_init(&nh);
3460 while ((ns = netstack_next(&nh)) != NULL) {
3461 /*
3462 * netstack_next() can return a netstack_t with a NULL
3463 * netstack_ip at boot time.
3464 */
3465 if ((ipst = ns->netstack_ip) == NULL) {
3466 netstack_rele(ns);
3467 continue;
3468 }
3469 ip_nce_reclaim_stack(ipst);
3470 netstack_rele(ns);
3471 }
3472 netstack_next_fini(&nh);
3473 }
3474
3475 #ifdef DEBUG
3476 void
ncec_trace_ref(ncec_t * ncec)3477 ncec_trace_ref(ncec_t *ncec)
3478 {
3479 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3480
3481 if (ncec->ncec_trace_disable)
3482 return;
3483
3484 if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
3485 ncec->ncec_trace_disable = B_TRUE;
3486 ncec_trace_cleanup(ncec);
3487 }
3488 }
3489
3490 void
ncec_untrace_ref(ncec_t * ncec)3491 ncec_untrace_ref(ncec_t *ncec)
3492 {
3493 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3494
3495 if (!ncec->ncec_trace_disable)
3496 th_trace_unref(ncec);
3497 }
3498
/*
 * DEBUG only: hand ncec and its current ncec_trace_disable setting to
 * th_trace_cleanup().
 */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
3504 #endif
3505
3506 /*
3507 * Called when address resolution fails due to a timeout.
3508 * Send an ICMP unreachable in response to all queued packets.
3509 */
3510 void
arp_resolv_failed(ncec_t * ncec)3511 arp_resolv_failed(ncec_t *ncec)
3512 {
3513 mblk_t *mp, *nxt_mp;
3514 char buf[INET6_ADDRSTRLEN];
3515 struct in_addr ipv4addr;
3516 ill_t *ill = ncec->ncec_ill;
3517 ip_stack_t *ipst = ncec->ncec_ipst;
3518 ip_recv_attr_t iras;
3519
3520 bzero(&iras, sizeof (iras));
3521 iras.ira_flags = IRAF_IS_IPV4;
3522 /*
3523 * we are setting the ira_rill to the ipmp_ill (instead of
3524 * the actual ill on which the packet was received), but this
3525 * is ok because we don't actually need the real ira_rill.
3526 * to send the icmp unreachable to the sender.
3527 */
3528 iras.ira_ill = iras.ira_rill = ill;
3529 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3530 iras.ira_rifindex = iras.ira_ruifindex;
3531
3532 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3533 ip3dbg(("arp_resolv_failed: dst %s\n",
3534 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3535 mutex_enter(&ncec->ncec_lock);
3536 mp = ncec->ncec_qd_mp;
3537 ncec->ncec_qd_mp = NULL;
3538 ncec->ncec_nprobes = 0;
3539 mutex_exit(&ncec->ncec_lock);
3540 while (mp != NULL) {
3541 nxt_mp = mp->b_next;
3542 mp->b_next = NULL;
3543
3544 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3545 ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3546 mp, ill);
3547 if (ipst->ips_ip_arp_icmp_error) {
3548 ip3dbg(("arp_resolv_failed: "
3549 "Calling icmp_unreachable\n"));
3550 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3551 } else {
3552 freemsg(mp);
3553 }
3554 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3555 mp = nxt_mp;
3556 }
3557 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3558 }
3559
3560 /*
3561 * if ill is an under_ill, translate it to the ipmp_ill and add the
3562 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3563 * one on the underlying in_ill) will be created for the
3564 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3565 */
3566 int
nce_lookup_then_add_v4(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)3567 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3568 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3569 {
3570 int err;
3571 in6_addr_t addr6;
3572 ip_stack_t *ipst = ill->ill_ipst;
3573 nce_t *nce, *upper_nce = NULL;
3574 ill_t *in_ill = ill, *under = NULL;
3575 boolean_t need_ill_refrele = B_FALSE;
3576
3577 if (flags & NCE_F_MCAST) {
3578 /*
3579 * hw_addr will be figured out in nce_set_multicast_v4;
3580 * caller needs to pass in the cast_ill for ipmp
3581 */
3582 ASSERT(hw_addr == NULL);
3583 ASSERT(!IS_IPMP(ill));
3584 err = nce_set_multicast_v4(ill, addr, flags, newnce);
3585 return (err);
3586 }
3587
3588 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3589 ill = ipmp_ill_hold_ipmp_ill(ill);
3590 if (ill == NULL)
3591 return (ENXIO);
3592 need_ill_refrele = B_TRUE;
3593 }
3594 if ((flags & NCE_F_BCAST) != 0) {
3595 /*
3596 * IPv4 broadcast ncec: compute the hwaddr.
3597 */
3598 if (IS_IPMP(ill)) {
3599 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3600 if (under == NULL) {
3601 if (need_ill_refrele)
3602 ill_refrele(ill);
3603 return (ENETDOWN);
3604 }
3605 hw_addr = under->ill_bcast_mp->b_rptr +
3606 NCE_LL_ADDR_OFFSET(under);
3607 hw_addr_len = under->ill_phys_addr_length;
3608 } else {
3609 hw_addr = ill->ill_bcast_mp->b_rptr +
3610 NCE_LL_ADDR_OFFSET(ill),
3611 hw_addr_len = ill->ill_phys_addr_length;
3612 }
3613 }
3614
3615 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3616 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3617 nce = nce_lookup_addr(ill, &addr6);
3618 if (nce == NULL) {
3619 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3620 state, &nce);
3621 } else {
3622 err = EEXIST;
3623 }
3624 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3625 if (err == 0)
3626 err = nce_add_v4_postprocess(nce);
3627
3628 if (in_ill != ill && nce != NULL) {
3629 nce_t *under_nce = NULL;
3630
3631 /*
3632 * in_ill was the under_ill. Try to create the under_nce.
3633 * Hold the ill_g_lock to prevent changes to group membership
3634 * until we are done.
3635 */
3636 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3637 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3638 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3639 ill_t *, ill);
3640 rw_exit(&ipst->ips_ill_g_lock);
3641 err = ENXIO;
3642 nce_refrele(nce);
3643 nce = NULL;
3644 goto bail;
3645 }
3646 under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3647 if (under_nce == NULL) {
3648 rw_exit(&ipst->ips_ill_g_lock);
3649 err = EINVAL;
3650 nce_refrele(nce);
3651 nce = NULL;
3652 goto bail;
3653 }
3654 rw_exit(&ipst->ips_ill_g_lock);
3655 upper_nce = nce;
3656 nce = under_nce; /* will be returned to caller */
3657 if (NCE_ISREACHABLE(nce->nce_common))
3658 nce_fastpath_trigger(under_nce);
3659 }
3660 if (nce != NULL) {
3661 if (newnce != NULL)
3662 *newnce = nce;
3663 else
3664 nce_refrele(nce);
3665 }
3666 bail:
3667 if (under != NULL)
3668 ill_refrele(under);
3669 if (upper_nce != NULL)
3670 nce_refrele(upper_nce);
3671 if (need_ill_refrele)
3672 ill_refrele(ill);
3673
3674 return (err);
3675 }
3676
3677 /*
3678 * NDP Cache Entry creation routine for IPv4.
3679 * This routine must always be called with ndp4->ndp_g_lock held.
3680 * Prior to return, ncec_refcnt is incremented.
3681 *
3682 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3683 * are always added pointing at the ipmp_ill. Thus, when the ill passed
3684 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3685 * entries will be created, both pointing at the same ncec_t. The nce_t
3686 * entries will have their nce_ill set to the ipmp_ill and the under_ill
3687 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3688 * Local addresses are always created on the ill passed to nce_add_v4.
3689 */
3690 int
nce_add_v4(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in_addr_t * addr,uint16_t flags,uint16_t state,nce_t ** newnce)3691 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3692 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3693 {
3694 int err;
3695 boolean_t is_multicast = (flags & NCE_F_MCAST);
3696 struct in6_addr addr6;
3697 nce_t *nce;
3698
3699 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3700 ASSERT(!ill->ill_isv6);
3701 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3702
3703 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3704 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3705 &nce);
3706 ASSERT(newnce != NULL);
3707 *newnce = nce;
3708 return (err);
3709 }
3710
3711 /*
3712 * Post-processing routine to be executed after nce_add_v4(). This function
3713 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3714 * and must be called without any locks held.
3715 *
3716 * Always returns 0, but we return an int to keep this symmetric with the
3717 * IPv6 counter-part.
3718 */
3719 int
nce_add_v4_postprocess(nce_t * nce)3720 nce_add_v4_postprocess(nce_t *nce)
3721 {
3722 ncec_t *ncec = nce->nce_common;
3723 uint16_t flags = ncec->ncec_flags;
3724 boolean_t ndp_need_dad = B_FALSE;
3725 boolean_t dropped;
3726 clock_t delay;
3727 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst;
3728 uchar_t *hw_addr = ncec->ncec_lladdr;
3729 boolean_t trigger_fastpath = B_TRUE;
3730
3731 /*
3732 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3733 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3734 * We call nce_fastpath from nce_update if the link layer address of
3735 * the peer changes from nce_update
3736 */
3737 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3738 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3739 trigger_fastpath = B_FALSE;
3740
3741 if (trigger_fastpath)
3742 nce_fastpath_trigger(nce);
3743
3744 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3745 /*
3746 * Either the caller (by passing in ND_PROBE)
3747 * or nce_add_common() (by the internally computed state
3748 * based on ncec_addr and ill_net_type) has determined
3749 * that this unicast entry needs DAD. Trigger DAD.
3750 */
3751 ndp_need_dad = B_TRUE;
3752 } else if (flags & NCE_F_UNSOL_ADV) {
3753 /*
3754 * We account for the transmit below by assigning one
3755 * less than the ndd variable. Subsequent decrements
3756 * are done in nce_timer.
3757 */
3758 mutex_enter(&ncec->ncec_lock);
3759 ncec->ncec_unsolicit_count =
3760 ipst->ips_ip_arp_publish_count - 1;
3761 mutex_exit(&ncec->ncec_lock);
3762 dropped = arp_announce(ncec);
3763 mutex_enter(&ncec->ncec_lock);
3764 if (dropped)
3765 ncec->ncec_unsolicit_count++;
3766 else
3767 ncec->ncec_last_time_defended = ddi_get_lbolt();
3768 if (ncec->ncec_unsolicit_count != 0) {
3769 nce_start_timer(ncec,
3770 ipst->ips_ip_arp_publish_interval);
3771 }
3772 mutex_exit(&ncec->ncec_lock);
3773 }
3774
3775 /*
3776 * If ncec_xmit_interval is 0, user has configured us to send the first
3777 * probe right away. Do so, and set up for the subsequent probes.
3778 */
3779 if (ndp_need_dad) {
3780 mutex_enter(&ncec->ncec_lock);
3781 if (ncec->ncec_pcnt == 0) {
3782 /*
3783 * DAD probes and announce can be
3784 * administratively disabled by setting the
3785 * probe_count to zero. Restart the timer in
3786 * this case to mark the ipif as ready.
3787 */
3788 ncec->ncec_unsolicit_count = 0;
3789 mutex_exit(&ncec->ncec_lock);
3790 nce_restart_timer(ncec, 0);
3791 } else {
3792 mutex_exit(&ncec->ncec_lock);
3793 delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3794 ipst->ips_arp_probe_delay :
3795 ipst->ips_arp_fastprobe_delay);
3796 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3797 }
3798 }
3799 return (0);
3800 }
3801
3802 /*
3803 * ncec_walk routine to update all entries that have a given destination or
3804 * gateway address and cached link layer (MAC) address. This is used when ARP
3805 * informs us that a network-to-link-layer mapping may have changed.
3806 */
3807 void
nce_update_hw_changed(ncec_t * ncec,void * arg)3808 nce_update_hw_changed(ncec_t *ncec, void *arg)
3809 {
3810 nce_hw_map_t *hwm = arg;
3811 ipaddr_t ncec_addr;
3812
3813 if (ncec->ncec_state != ND_REACHABLE)
3814 return;
3815
3816 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3817 if (ncec_addr != hwm->hwm_addr)
3818 return;
3819
3820 mutex_enter(&ncec->ncec_lock);
3821 if (hwm->hwm_flags != 0)
3822 ncec->ncec_flags = hwm->hwm_flags;
3823 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3824 mutex_exit(&ncec->ncec_lock);
3825 }
3826
3827 void
ncec_refhold(ncec_t * ncec)3828 ncec_refhold(ncec_t *ncec)
3829 {
3830 mutex_enter(&(ncec)->ncec_lock);
3831 (ncec)->ncec_refcnt++;
3832 ASSERT((ncec)->ncec_refcnt != 0);
3833 #ifdef DEBUG
3834 ncec_trace_ref(ncec);
3835 #endif
3836 mutex_exit(&(ncec)->ncec_lock);
3837 }
3838
3839 void
ncec_refhold_notr(ncec_t * ncec)3840 ncec_refhold_notr(ncec_t *ncec)
3841 {
3842 mutex_enter(&(ncec)->ncec_lock);
3843 (ncec)->ncec_refcnt++;
3844 ASSERT((ncec)->ncec_refcnt != 0);
3845 mutex_exit(&(ncec)->ncec_lock);
3846 }
3847
3848 static void
ncec_refhold_locked(ncec_t * ncec)3849 ncec_refhold_locked(ncec_t *ncec)
3850 {
3851 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3852 (ncec)->ncec_refcnt++;
3853 #ifdef DEBUG
3854 ncec_trace_ref(ncec);
3855 #endif
3856 }
3857
3858 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3859 void
ncec_refrele(ncec_t * ncec)3860 ncec_refrele(ncec_t *ncec)
3861 {
3862 mutex_enter(&(ncec)->ncec_lock);
3863 #ifdef DEBUG
3864 ncec_untrace_ref(ncec);
3865 #endif
3866 ASSERT((ncec)->ncec_refcnt != 0);
3867 if (--(ncec)->ncec_refcnt == 0) {
3868 ncec_inactive(ncec);
3869 } else {
3870 mutex_exit(&(ncec)->ncec_lock);
3871 }
3872 }
3873
3874 void
ncec_refrele_notr(ncec_t * ncec)3875 ncec_refrele_notr(ncec_t *ncec)
3876 {
3877 mutex_enter(&(ncec)->ncec_lock);
3878 ASSERT((ncec)->ncec_refcnt != 0);
3879 if (--(ncec)->ncec_refcnt == 0) {
3880 ncec_inactive(ncec);
3881 } else {
3882 mutex_exit(&(ncec)->ncec_lock);
3883 }
3884 }
3885
3886 /*
3887 * Common to IPv4 and IPv6.
3888 */
3889 void
nce_restart_timer(ncec_t * ncec,uint_t ms)3890 nce_restart_timer(ncec_t *ncec, uint_t ms)
3891 {
3892 timeout_id_t tid;
3893
3894 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3895
3896 /* First cancel any running timer */
3897 mutex_enter(&ncec->ncec_lock);
3898 tid = ncec->ncec_timeout_id;
3899 ncec->ncec_timeout_id = 0;
3900 if (tid != 0) {
3901 mutex_exit(&ncec->ncec_lock);
3902 (void) untimeout(tid);
3903 mutex_enter(&ncec->ncec_lock);
3904 }
3905
3906 /* Restart timer */
3907 nce_start_timer(ncec, ms);
3908 mutex_exit(&ncec->ncec_lock);
3909 }
3910
3911 static void
nce_start_timer(ncec_t * ncec,uint_t ms)3912 nce_start_timer(ncec_t *ncec, uint_t ms)
3913 {
3914 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3915 /*
3916 * Don't start the timer if the ncec has been deleted, or if the timer
3917 * is already running
3918 */
3919 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3920 ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3921 MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3922 }
3923 }
3924
3925 int
nce_set_multicast_v4(ill_t * ill,const in_addr_t * dst,uint16_t flags,nce_t ** newnce)3926 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3927 uint16_t flags, nce_t **newnce)
3928 {
3929 uchar_t *hw_addr;
3930 int err = 0;
3931 ip_stack_t *ipst = ill->ill_ipst;
3932 in6_addr_t dst6;
3933 nce_t *nce;
3934
3935 ASSERT(!ill->ill_isv6);
3936
3937 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3938 mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3939 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3940 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3941 goto done;
3942 }
3943 if (ill->ill_net_type == IRE_IF_RESOLVER) {
3944 /*
3945 * For IRE_IF_RESOLVER a hardware mapping can be
3946 * generated, for IRE_IF_NORESOLVER, resolution cookie
3947 * in the ill is copied in nce_add_v4().
3948 */
3949 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3950 if (hw_addr == NULL) {
3951 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3952 return (ENOMEM);
3953 }
3954 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3955 } else {
3956 /*
3957 * IRE_IF_NORESOLVER type simply copies the resolution
3958 * cookie passed in. So no hw_addr is needed.
3959 */
3960 hw_addr = NULL;
3961 }
3962 ASSERT(flags & NCE_F_MCAST);
3963 ASSERT(flags & NCE_F_NONUD);
3964 /* nce_state will be computed by nce_add_common() */
3965 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3966 ND_UNCHANGED, &nce);
3967 mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3968 if (err == 0)
3969 err = nce_add_v4_postprocess(nce);
3970 if (hw_addr != NULL)
3971 kmem_free(hw_addr, ill->ill_phys_addr_length);
3972 if (err != 0) {
3973 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3974 return (err);
3975 }
3976 done:
3977 if (newnce != NULL)
3978 *newnce = nce;
3979 else
3980 nce_refrele(nce);
3981 return (0);
3982 }
3983
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.  NCE_RESCHED_LIST_LEN is the size of one batch.
 */
#define	NCE_RESCHED_LIST_LEN	8

/*
 * Batch state shared between nce_ill_reschedule() and its walker callback
 * ncec_reschedule().
 */
typedef struct {
	/* ill whose entries are being rescheduled */
	ill_t	*ncert_ill;
	/* number of slots of ncert_nces[] filled in so far */
	uint_t	ncert_num;
	/* collected entries; ncec_reschedule() holds a reference on each */
	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;
3996
3997 /*
3998 * Pick the longest waiting NCEs for defense.
3999 */
4000 /* ARGSUSED */
4001 static int
ncec_reschedule(ill_t * ill,nce_t * nce,void * arg)4002 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4003 {
4004 nce_resched_t *ncert = arg;
4005 ncec_t **ncecs;
4006 ncec_t **ncec_max;
4007 ncec_t *ncec_temp;
4008 ncec_t *ncec = nce->nce_common;
4009
4010 ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4011 /*
4012 * Only reachable entries that are ready for announcement are eligible.
4013 */
4014 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4015 return (0);
4016 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4017 ncec_refhold(ncec);
4018 ncert->ncert_nces[ncert->ncert_num++] = ncec;
4019 } else {
4020 ncecs = ncert->ncert_nces;
4021 ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4022 ncec_refhold(ncec);
4023 for (; ncecs < ncec_max; ncecs++) {
4024 ASSERT(ncec != NULL);
4025 if ((*ncecs)->ncec_last_time_defended >
4026 ncec->ncec_last_time_defended) {
4027 ncec_temp = *ncecs;
4028 *ncecs = ncec;
4029 ncec = ncec_temp;
4030 }
4031 }
4032 ncec_refrele(ncec);
4033 }
4034 return (0);
4035 }
4036
4037 /*
4038 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
4039 * doesn't happen very often (if at all), and thus it needn't be highly
4040 * optimized. (Note, though, that it's actually O(N) complexity, because the
4041 * outer loop is bounded by a constant rather than by the length of the list.)
4042 */
4043 static void
nce_ill_reschedule(ill_t * ill,nce_resched_t * ncert)4044 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4045 {
4046 ncec_t *ncec;
4047 ip_stack_t *ipst = ill->ill_ipst;
4048 uint_t i, defend_rate;
4049
4050 i = ill->ill_defend_count;
4051 ill->ill_defend_count = 0;
4052 if (ill->ill_isv6)
4053 defend_rate = ipst->ips_ndp_defend_rate;
4054 else
4055 defend_rate = ipst->ips_arp_defend_rate;
4056 /* If none could be sitting around, then don't reschedule */
4057 if (i < defend_rate) {
4058 DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4059 return;
4060 }
4061 ncert->ncert_ill = ill;
4062 while (ill->ill_defend_count < defend_rate) {
4063 nce_walk_common(ill, ncec_reschedule, ncert);
4064 for (i = 0; i < ncert->ncert_num; i++) {
4065
4066 ncec = ncert->ncert_nces[i];
4067 mutex_enter(&ncec->ncec_lock);
4068 ncec->ncec_flags |= NCE_F_DELAYED;
4069 mutex_exit(&ncec->ncec_lock);
4070 /*
4071 * we plan to schedule this ncec, so incr the
4072 * defend_count in anticipation.
4073 */
4074 if (++ill->ill_defend_count >= defend_rate)
4075 break;
4076 }
4077 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4078 break;
4079 }
4080 }
4081
4082 /*
4083 * Check if the current rate-limiting parameters permit the sending
4084 * of another address defense announcement for both IPv4 and IPv6.
4085 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4086 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4087 * determines how many address defense announcements are permitted
4088 * in any `defense_perio' interval.
4089 */
4090 static boolean_t
ill_defend_rate_limit(ill_t * ill,ncec_t * ncec)4091 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4092 {
4093 clock_t now = ddi_get_lbolt();
4094 ip_stack_t *ipst = ill->ill_ipst;
4095 clock_t start = ill->ill_defend_start;
4096 uint32_t elapsed, defend_period, defend_rate;
4097 nce_resched_t ncert;
4098 boolean_t ret;
4099 int i;
4100
4101 if (ill->ill_isv6) {
4102 defend_period = ipst->ips_ndp_defend_period;
4103 defend_rate = ipst->ips_ndp_defend_rate;
4104 } else {
4105 defend_period = ipst->ips_arp_defend_period;
4106 defend_rate = ipst->ips_arp_defend_rate;
4107 }
4108 if (defend_rate == 0)
4109 return (B_TRUE);
4110 bzero(&ncert, sizeof (ncert));
4111 mutex_enter(&ill->ill_lock);
4112 if (start > 0) {
4113 elapsed = now - start;
4114 if (elapsed > SEC_TO_TICK(defend_period)) {
4115 ill->ill_defend_start = now;
4116 /*
4117 * nce_ill_reschedule will attempt to
4118 * prevent starvation by reschduling the
4119 * oldest entries, which are marked with
4120 * the NCE_F_DELAYED flag.
4121 */
4122 nce_ill_reschedule(ill, &ncert);
4123 }
4124 } else {
4125 ill->ill_defend_start = now;
4126 }
4127 ASSERT(ill->ill_defend_count <= defend_rate);
4128 mutex_enter(&ncec->ncec_lock);
4129 if (ncec->ncec_flags & NCE_F_DELAYED) {
4130 /*
4131 * This ncec was rescheduled as one of the really old
4132 * entries needing on-going defense. The
4133 * ill_defend_count was already incremented in
4134 * nce_ill_reschedule. Go ahead and send the announce.
4135 */
4136 ncec->ncec_flags &= ~NCE_F_DELAYED;
4137 mutex_exit(&ncec->ncec_lock);
4138 ret = B_FALSE;
4139 goto done;
4140 }
4141 mutex_exit(&ncec->ncec_lock);
4142 if (ill->ill_defend_count < defend_rate)
4143 ill->ill_defend_count++;
4144 if (ill->ill_defend_count == defend_rate) {
4145 /*
4146 * we are no longer allowed to send unbidden defense
4147 * messages. Wait for rescheduling.
4148 */
4149 ret = B_TRUE;
4150 } else {
4151 ret = B_FALSE;
4152 }
4153 done:
4154 mutex_exit(&ill->ill_lock);
4155 /*
4156 * After all the locks have been dropped we can restart nce timer,
4157 * and refrele the delayed ncecs
4158 */
4159 for (i = 0; i < ncert.ncert_num; i++) {
4160 clock_t xmit_interval;
4161 ncec_t *tmp;
4162
4163 tmp = ncert.ncert_nces[i];
4164 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4165 B_FALSE);
4166 nce_restart_timer(tmp, xmit_interval);
4167 ncec_refrele(tmp);
4168 }
4169 return (ret);
4170 }
4171
4172 boolean_t
ndp_announce(ncec_t * ncec)4173 ndp_announce(ncec_t *ncec)
4174 {
4175 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
4176 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
4177 nce_advert_flags(ncec)));
4178 }
4179
4180 ill_t *
nce_resolve_src(ncec_t * ncec,in6_addr_t * src)4181 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4182 {
4183 mblk_t *mp;
4184 in6_addr_t src6;
4185 ipaddr_t src4;
4186 ill_t *ill = ncec->ncec_ill;
4187 ill_t *src_ill = NULL;
4188 ipif_t *ipif = NULL;
4189 boolean_t is_myaddr = NCE_MYADDR(ncec);
4190 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4191
4192 ASSERT(src != NULL);
4193 ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4194 src6 = *src;
4195 if (is_myaddr) {
4196 src6 = ncec->ncec_addr;
4197 if (!isv6)
4198 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4199 } else {
4200 /*
4201 * try to find one from the outgoing packet.
4202 */
4203 mutex_enter(&ncec->ncec_lock);
4204 mp = ncec->ncec_qd_mp;
4205 if (mp != NULL) {
4206 if (isv6) {
4207 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4208
4209 src6 = ip6h->ip6_src;
4210 } else {
4211 ipha_t *ipha = (ipha_t *)mp->b_rptr;
4212
4213 src4 = ipha->ipha_src;
4214 IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4215 }
4216 }
4217 mutex_exit(&ncec->ncec_lock);
4218 }
4219
4220 /*
4221 * For outgoing packets, if the src of outgoing packet is one
4222 * of the assigned interface addresses use it, otherwise we
4223 * will pick the source address below.
4224 * For local addresses (is_myaddr) doing DAD, NDP announce
4225 * messages are mcast. So we use the (IPMP) cast_ill or the
4226 * (non-IPMP) ncec_ill for these message types. The only case
4227 * of unicast DAD messages are for IPv6 ND probes, for which
4228 * we find the ipif_bound_ill corresponding to the ncec_addr.
4229 */
4230 if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4231 if (isv6) {
4232 ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4233 ill->ill_ipst);
4234 } else {
4235 ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4236 ill->ill_ipst);
4237 }
4238
4239 /*
4240 * If no relevant ipif can be found, then it's not one of our
4241 * addresses. Reset to :: and try to find a src for the NS or
4242 * ARP request using ipif_select_source_v[4,6] below.
4243 * If an ipif can be found, but it's not yet done with
4244 * DAD verification, and we are not being invoked for
4245 * DAD (i.e., !is_myaddr), then just postpone this
4246 * transmission until later.
4247 */
4248 if (ipif == NULL) {
4249 src6 = ipv6_all_zeros;
4250 src4 = INADDR_ANY;
4251 } else if (!ipif->ipif_addr_ready && !is_myaddr) {
4252 DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4253 ncec_t *, ncec, ipif_t *, ipif);
4254 ipif_refrele(ipif);
4255 return (NULL);
4256 }
4257 }
4258
4259 if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4260 /*
4261 * Pick a source address for this solicitation, but
4262 * restrict the selection to addresses assigned to the
4263 * output interface. We do this because the destination will
4264 * create a neighbor cache entry for the source address of
4265 * this packet, so the source address had better be a valid
4266 * neighbor.
4267 */
4268 if (isv6) {
4269 ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4270 B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4271 B_FALSE, NULL);
4272 } else {
4273 ipaddr_t nce_addr;
4274
4275 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4276 ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4277 B_FALSE, NULL);
4278 }
4279 if (ipif == NULL && IS_IPMP(ill)) {
4280 ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4281
4282 if (send_ill != NULL) {
4283 if (isv6) {
4284 ipif = ipif_select_source_v6(send_ill,
4285 &ncec->ncec_addr, B_TRUE,
4286 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4287 B_FALSE, NULL);
4288 } else {
4289 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4290 src4);
4291 ipif = ipif_select_source_v4(send_ill,
4292 src4, ALL_ZONES, B_TRUE, NULL);
4293 }
4294 ill_refrele(send_ill);
4295 }
4296 }
4297
4298 if (ipif == NULL) {
4299 char buf[INET6_ADDRSTRLEN];
4300
4301 ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4302 inet_ntop((isv6 ? AF_INET6 : AF_INET),
4303 (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4304 DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4305 return (NULL);
4306 }
4307 src6 = ipif->ipif_v6lcl_addr;
4308 }
4309 *src = src6;
4310 if (ipif != NULL) {
4311 src_ill = ipif->ipif_ill;
4312 if (IS_IPMP(src_ill))
4313 src_ill = ipmp_ipif_hold_bound_ill(ipif);
4314 else
4315 ill_refhold(src_ill);
4316 ipif_refrele(ipif);
4317 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4318 ill_t *, src_ill);
4319 }
4320 return (src_ill);
4321 }
4322
4323 void
ip_nce_lookup_and_update(ipaddr_t * addr,ipif_t * ipif,ip_stack_t * ipst,uchar_t * hwaddr,int hwaddr_len,int flags)4324 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4325 uchar_t *hwaddr, int hwaddr_len, int flags)
4326 {
4327 ill_t *ill;
4328 ncec_t *ncec;
4329 nce_t *nce;
4330 uint16_t new_state;
4331
4332 ill = (ipif ? ipif->ipif_ill : NULL);
4333 if (ill != NULL) {
4334 /*
4335 * only one ncec is possible
4336 */
4337 nce = nce_lookup_v4(ill, addr);
4338 if (nce != NULL) {
4339 ncec = nce->nce_common;
4340 mutex_enter(&ncec->ncec_lock);
4341 if (NCE_ISREACHABLE(ncec))
4342 new_state = ND_UNCHANGED;
4343 else
4344 new_state = ND_STALE;
4345 ncec->ncec_flags = flags;
4346 nce_update(ncec, new_state, hwaddr);
4347 mutex_exit(&ncec->ncec_lock);
4348 nce_refrele(nce);
4349 return;
4350 }
4351 } else {
4352 /*
4353 * ill is wildcard; clean up all ncec's and ire's
4354 * that match on addr.
4355 */
4356 nce_hw_map_t hwm;
4357
4358 hwm.hwm_addr = *addr;
4359 hwm.hwm_hwlen = hwaddr_len;
4360 hwm.hwm_hwaddr = hwaddr;
4361 hwm.hwm_flags = flags;
4362
4363 ncec_walk_common(ipst->ips_ndp4, NULL,
4364 (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4365 }
4366 }
4367
4368 /*
4369 * Common function to add ncec entries.
4370 * we always add the ncec with ncec_ill == ill, and always create
4371 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4372 * ncec is !reachable.
4373 *
4374 * When the caller passes in an nce_state of ND_UNCHANGED,
4375 * nce_add_common() will determine the state of the created nce based
4376 * on the ill_net_type and nce_flags used. Otherwise, the nce will
4377 * be created with state set to the passed in nce_state.
4378 */
4379 static int
nce_add_common(ill_t * ill,uchar_t * hw_addr,uint_t hw_addr_len,const in6_addr_t * addr,uint16_t flags,uint16_t nce_state,nce_t ** retnce)4380 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4381 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4382 {
4383 static ncec_t nce_nil;
4384 uchar_t *template = NULL;
4385 int err;
4386 ncec_t *ncec;
4387 ncec_t **ncep;
4388 ip_stack_t *ipst = ill->ill_ipst;
4389 uint16_t state;
4390 boolean_t fastprobe = B_FALSE;
4391 struct ndp_g_s *ndp;
4392 nce_t *nce = NULL;
4393 mblk_t *dlur_mp = NULL;
4394
4395 if (ill->ill_isv6)
4396 ndp = ill->ill_ipst->ips_ndp6;
4397 else
4398 ndp = ill->ill_ipst->ips_ndp4;
4399
4400 *retnce = NULL;
4401
4402 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4403
4404 if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4405 ip0dbg(("nce_add_common: no addr\n"));
4406 return (EINVAL);
4407 }
4408 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4409 ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4410 return (EINVAL);
4411 }
4412
4413 if (ill->ill_isv6) {
4414 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4415 } else {
4416 ipaddr_t v4addr;
4417
4418 IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4419 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4420 }
4421
4422 /*
4423 * The caller has ensured that there is no nce on ill, but there could
4424 * still be an nce_common_t for the address, so that we find exisiting
4425 * ncec_t strucutures first, and atomically add a new nce_t if
4426 * one is found. The ndp_g_lock ensures that we don't cross threads
4427 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4428 * compare for matches across the illgrp because this function is
4429 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4430 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4431 * appropriate.
4432 */
4433 ncec = *ncep;
4434 for (; ncec != NULL; ncec = ncec->ncec_next) {
4435 if (ncec->ncec_ill == ill) {
4436 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4437 /*
4438 * We should never find *retnce to be
4439 * MYADDR, since the caller may then
4440 * incorrectly restart a DAD timer that's
4441 * already running. However, if we are in
4442 * forwarding mode, and the interface is
4443 * moving in/out of groups, the data
4444 * path ire lookup (e.g., ire_revalidate_nce)
4445 * may have determined that some destination
4446 * is offlink while the control path is adding
4447 * that address as a local address.
4448 * Recover from this case by failing the
4449 * lookup
4450 */
4451 if (NCE_MYADDR(ncec))
4452 return (ENXIO);
4453 *retnce = nce_ill_lookup_then_add(ill, ncec);
4454 if (*retnce != NULL)
4455 break;
4456 }
4457 }
4458 }
4459 if (*retnce != NULL) /* caller must trigger fastpath on nce */
4460 return (0);
4461
4462 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4463 if (ncec == NULL)
4464 return (ENOMEM);
4465 *ncec = nce_nil;
4466 ncec->ncec_ill = ill;
4467 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4468 ncec->ncec_flags = flags;
4469 ncec->ncec_ipst = ipst; /* No netstack_hold */
4470
4471 if (!ill->ill_isv6) {
4472 ipaddr_t addr4;
4473
4474 /*
4475 * DAD probe interval and probe count are set based on
4476 * fast/slow probe settings. If the underlying link doesn't
4477 * have reliably up/down notifications or if we're working
4478 * with IPv4 169.254.0.0/16 Link Local Address space, then
4479 * don't use the fast timers. Otherwise, use them.
4480 */
4481 ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4482 IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4483 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4484 fastprobe = B_TRUE;
4485 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4486 !IS_IPV4_LL_SPACE(&addr4)) {
4487 ill_t *hwaddr_ill;
4488
4489 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4490 hw_addr_len);
4491 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4492 fastprobe = B_TRUE;
4493 }
4494 if (fastprobe) {
4495 ncec->ncec_xmit_interval =
4496 ipst->ips_arp_fastprobe_interval;
4497 ncec->ncec_pcnt =
4498 ipst->ips_arp_fastprobe_count;
4499 ncec->ncec_flags |= NCE_F_FAST;
4500 } else {
4501 ncec->ncec_xmit_interval =
4502 ipst->ips_arp_probe_interval;
4503 ncec->ncec_pcnt =
4504 ipst->ips_arp_probe_count;
4505 }
4506 if (NCE_PUBLISH(ncec)) {
4507 ncec->ncec_unsolicit_count =
4508 ipst->ips_ip_arp_publish_count;
4509 }
4510 } else {
4511 /*
4512 * probe interval is constant: ILL_PROBE_INTERVAL
4513 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4514 */
4515 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4516 if (NCE_PUBLISH(ncec)) {
4517 ncec->ncec_unsolicit_count =
4518 ipst->ips_ip_ndp_unsolicit_count;
4519 }
4520 }
4521 ncec->ncec_rcnt = ill->ill_xmit_count;
4522 ncec->ncec_addr = *addr;
4523 ncec->ncec_qd_mp = NULL;
4524 ncec->ncec_refcnt = 1; /* for ncec getting created */
4525 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4526 ncec->ncec_trace_disable = B_FALSE;
4527
4528 /*
4529 * ncec_lladdr holds link layer address
4530 */
4531 if (hw_addr_len > 0) {
4532 template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4533 if (template == NULL) {
4534 err = ENOMEM;
4535 goto err_ret;
4536 }
4537 ncec->ncec_lladdr = template;
4538 ncec->ncec_lladdr_length = hw_addr_len;
4539 bzero(ncec->ncec_lladdr, hw_addr_len);
4540 }
4541 if ((flags & NCE_F_BCAST) != 0) {
4542 state = ND_REACHABLE;
4543 ASSERT(hw_addr_len > 0);
4544 } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4545 state = ND_INITIAL;
4546 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4547 /*
4548 * NORESOLVER entries are always created in the REACHABLE
4549 * state.
4550 */
4551 state = ND_REACHABLE;
4552 if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4553 ill->ill_mactype != DL_IPV4 &&
4554 ill->ill_mactype != DL_6TO4) {
4555 /*
4556 * We create a nce_res_mp with the IP nexthop address
4557 * as the destination address if the physical length
4558 * is exactly 4 bytes for point-to-multipoint links
4559 * that do their own resolution from IP to link-layer
4560 * address (e.g. IP over X.25).
4561 */
4562 bcopy((uchar_t *)addr,
4563 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4564 }
4565 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4566 ill->ill_mactype != DL_IPV6) {
4567 /*
4568 * We create a nce_res_mp with the IP nexthop address
4569 * as the destination address if the physical legnth
4570 * is exactly 16 bytes for point-to-multipoint links
4571 * that do their own resolution from IP to link-layer
4572 * address.
4573 */
4574 bcopy((uchar_t *)addr,
4575 ncec->ncec_lladdr, ill->ill_phys_addr_length);
4576 }
4577 /*
4578 * Since NUD is not part of the base IPv4 protocol definition,
4579 * IPv4 neighbor entries on NORESOLVER interfaces will never
4580 * age, and are marked NCE_F_NONUD.
4581 */
4582 if (!ill->ill_isv6)
4583 ncec->ncec_flags |= NCE_F_NONUD;
4584 } else if (ill->ill_net_type == IRE_LOOPBACK) {
4585 state = ND_REACHABLE;
4586 }
4587
4588 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4589 /*
4590 * We are adding an ncec with a deterministic hw_addr,
4591 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4592 *
4593 * if we are adding a unicast ncec for the local address
4594 * it would be REACHABLE; we would be adding a ND_STALE entry
4595 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4596 * addresses are added in PROBE to trigger DAD.
4597 */
4598 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4599 ill->ill_net_type == IRE_IF_NORESOLVER)
4600 state = ND_REACHABLE;
4601 else if (!NCE_PUBLISH(ncec))
4602 state = ND_STALE;
4603 else
4604 state = ND_PROBE;
4605 if (hw_addr != NULL)
4606 nce_set_ll(ncec, hw_addr);
4607 }
4608 /* caller overrides internally computed state */
4609 if (nce_state != ND_UNCHANGED)
4610 state = nce_state;
4611
4612 if (state == ND_PROBE)
4613 ncec->ncec_flags |= NCE_F_UNVERIFIED;
4614
4615 ncec->ncec_state = state;
4616
4617 if (state == ND_REACHABLE) {
4618 ncec->ncec_last = ncec->ncec_init_time =
4619 TICK_TO_MSEC(ddi_get_lbolt64());
4620 } else {
4621 ncec->ncec_last = 0;
4622 if (state == ND_INITIAL)
4623 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4624 }
4625 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4626 offsetof(ncec_cb_t, ncec_cb_node));
4627 /*
4628 * have all the memory allocations out of the way before taking locks
4629 * and adding the nce.
4630 */
4631 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4632 if (nce == NULL) {
4633 err = ENOMEM;
4634 goto err_ret;
4635 }
4636 if (ncec->ncec_lladdr != NULL ||
4637 ill->ill_net_type == IRE_IF_NORESOLVER) {
4638 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4639 ill->ill_phys_addr_length, ill->ill_sap,
4640 ill->ill_sap_length);
4641 if (dlur_mp == NULL) {
4642 err = ENOMEM;
4643 goto err_ret;
4644 }
4645 }
4646
4647 /*
4648 * Atomically ensure that the ill is not CONDEMNED, before
4649 * adding the NCE.
4650 */
4651 mutex_enter(&ill->ill_lock);
4652 if (ill->ill_state_flags & ILL_CONDEMNED) {
4653 mutex_exit(&ill->ill_lock);
4654 err = EINVAL;
4655 goto err_ret;
4656 }
4657 if (!NCE_MYADDR(ncec) &&
4658 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4659 mutex_exit(&ill->ill_lock);
4660 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4661 err = EINVAL;
4662 goto err_ret;
4663 }
4664 /*
4665 * Acquire the ncec_lock even before adding the ncec to the list
4666 * so that it cannot get deleted after the ncec is added, but
4667 * before we add the nce.
4668 */
4669 mutex_enter(&ncec->ncec_lock);
4670 if ((ncec->ncec_next = *ncep) != NULL)
4671 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4672 *ncep = ncec;
4673 ncec->ncec_ptpn = ncep;
4674
4675 /* Bump up the number of ncec's referencing this ill */
4676 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4677 (char *), "ncec", (void *), ncec);
4678 ill->ill_ncec_cnt++;
4679 /*
4680 * Since we hold the ncec_lock at this time, the ncec cannot be
4681 * condemned, and we can safely add the nce.
4682 */
4683 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4684 mutex_exit(&ncec->ncec_lock);
4685 mutex_exit(&ill->ill_lock);
4686
4687 /* caller must trigger fastpath on *retnce */
4688 return (0);
4689
4690 err_ret:
4691 if (ncec != NULL)
4692 kmem_cache_free(ncec_cache, ncec);
4693 if (nce != NULL)
4694 kmem_cache_free(nce_cache, nce);
4695 freemsg(dlur_mp);
4696 if (template != NULL)
4697 kmem_free(template, ill->ill_phys_addr_length);
4698 return (err);
4699 }
4700
4701 /*
4702 * take a ref on the nce
4703 */
4704 void
nce_refhold(nce_t * nce)4705 nce_refhold(nce_t *nce)
4706 {
4707 mutex_enter(&nce->nce_lock);
4708 nce->nce_refcnt++;
4709 ASSERT((nce)->nce_refcnt != 0);
4710 mutex_exit(&nce->nce_lock);
4711 }
4712
4713 /*
4714 * release a ref on the nce; In general, this
4715 * cannot be called with locks held because nce_inactive
4716 * may result in nce_inactive which will take the ill_lock,
4717 * do ipif_ill_refrele_tail etc. Thus the one exception
4718 * where this can be called with locks held is when the caller
4719 * is certain that the nce_refcnt is sufficient to prevent
4720 * the invocation of nce_inactive.
4721 */
4722 void
nce_refrele(nce_t * nce)4723 nce_refrele(nce_t *nce)
4724 {
4725 ASSERT((nce)->nce_refcnt != 0);
4726 mutex_enter(&nce->nce_lock);
4727 if (--nce->nce_refcnt == 0)
4728 nce_inactive(nce); /* destroys the mutex */
4729 else
4730 mutex_exit(&nce->nce_lock);
4731 }
4732
4733 /*
4734 * free the nce after all refs have gone away.
4735 */
4736 static void
nce_inactive(nce_t * nce)4737 nce_inactive(nce_t *nce)
4738 {
4739 ill_t *ill = nce->nce_ill;
4740
4741 ASSERT(nce->nce_refcnt == 0);
4742
4743 ncec_refrele_notr(nce->nce_common);
4744 nce->nce_common = NULL;
4745 freemsg(nce->nce_fp_mp);
4746 freemsg(nce->nce_dlur_mp);
4747
4748 mutex_enter(&ill->ill_lock);
4749 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4750 (char *), "nce", (void *), nce);
4751 ill->ill_nce_cnt--;
4752 nce->nce_ill = NULL;
4753 /*
4754 * If the number of ncec's associated with this ill have dropped
4755 * to zero, check whether we need to restart any operation that
4756 * is waiting for this to happen.
4757 */
4758 if (ILL_DOWN_OK(ill)) {
4759 /* ipif_ill_refrele_tail drops the ill_lock */
4760 ipif_ill_refrele_tail(ill);
4761 } else {
4762 mutex_exit(&ill->ill_lock);
4763 }
4764
4765 mutex_destroy(&nce->nce_lock);
4766 kmem_cache_free(nce_cache, nce);
4767 }
4768
4769 /*
4770 * Add an nce to the ill_nce list.
4771 */
4772 static nce_t *
nce_add_impl(ill_t * ill,ncec_t * ncec,nce_t * nce,mblk_t * dlur_mp)4773 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4774 {
4775 bzero(nce, sizeof (*nce));
4776 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4777 nce->nce_common = ncec;
4778 nce->nce_addr = ncec->ncec_addr;
4779 nce->nce_ill = ill;
4780 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4781 (char *), "nce", (void *), nce);
4782 ill->ill_nce_cnt++;
4783
4784 nce->nce_refcnt = 1; /* for the thread */
4785 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4786 nce->nce_dlur_mp = dlur_mp;
4787
4788 /* add nce to the ill's fastpath list. */
4789 nce->nce_refcnt++; /* for the list */
4790 list_insert_head(&ill->ill_nce, nce);
4791 return (nce);
4792 }
4793
4794 static nce_t *
nce_add(ill_t * ill,ncec_t * ncec)4795 nce_add(ill_t *ill, ncec_t *ncec)
4796 {
4797 nce_t *nce;
4798 mblk_t *dlur_mp = NULL;
4799
4800 ASSERT(MUTEX_HELD(&ill->ill_lock));
4801 ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4802
4803 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4804 if (nce == NULL)
4805 return (NULL);
4806 if (ncec->ncec_lladdr != NULL ||
4807 ill->ill_net_type == IRE_IF_NORESOLVER) {
4808 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4809 ill->ill_phys_addr_length, ill->ill_sap,
4810 ill->ill_sap_length);
4811 if (dlur_mp == NULL) {
4812 kmem_cache_free(nce_cache, nce);
4813 return (NULL);
4814 }
4815 }
4816 return (nce_add_impl(ill, ncec, nce, dlur_mp));
4817 }
4818
4819 /*
4820 * remove the nce from the ill_faspath list
4821 */
4822 void
nce_delete(nce_t * nce)4823 nce_delete(nce_t *nce)
4824 {
4825 ill_t *ill = nce->nce_ill;
4826
4827 ASSERT(MUTEX_HELD(&ill->ill_lock));
4828
4829 mutex_enter(&nce->nce_lock);
4830 if (nce->nce_is_condemned) {
4831 /*
4832 * some other thread has removed this nce from the ill_nce list
4833 */
4834 mutex_exit(&nce->nce_lock);
4835 return;
4836 }
4837 nce->nce_is_condemned = B_TRUE;
4838 mutex_exit(&nce->nce_lock);
4839
4840 list_remove(&ill->ill_nce, nce);
4841 /*
4842 * even though we are holding the ill_lock, it is ok to
4843 * call nce_refrele here because we know that we should have
4844 * at least 2 refs on the nce: one for the thread, and one
4845 * for the list. The refrele below will release the one for
4846 * the list.
4847 */
4848 nce_refrele(nce);
4849 }
4850
4851 nce_t *
nce_lookup(ill_t * ill,const in6_addr_t * addr)4852 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4853 {
4854 nce_t *nce = NULL;
4855
4856 ASSERT(ill != NULL);
4857 ASSERT(MUTEX_HELD(&ill->ill_lock));
4858
4859 for (nce = list_head(&ill->ill_nce); nce != NULL;
4860 nce = list_next(&ill->ill_nce, nce)) {
4861 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4862 break;
4863 }
4864
4865 /*
4866 * if we found the nce on the ill_nce list while holding
4867 * the ill_lock, then it cannot be condemned yet.
4868 */
4869 if (nce != NULL) {
4870 ASSERT(!nce->nce_is_condemned);
4871 nce_refhold(nce);
4872 }
4873 return (nce);
4874 }
4875
4876 /*
4877 * Walk the ill_nce list on ill. The callback function func() cannot perform
4878 * any destructive actions.
4879 */
4880 static void
nce_walk_common(ill_t * ill,pfi_t func,void * arg)4881 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4882 {
4883 nce_t *nce = NULL, *nce_next;
4884
4885 ASSERT(MUTEX_HELD(&ill->ill_lock));
4886 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4887 nce_next = list_next(&ill->ill_nce, nce);
4888 if (func(ill, nce, arg) != 0)
4889 break;
4890 nce = nce_next;
4891 }
4892 }
4893
4894 void
nce_walk(ill_t * ill,pfi_t func,void * arg)4895 nce_walk(ill_t *ill, pfi_t func, void *arg)
4896 {
4897 mutex_enter(&ill->ill_lock);
4898 nce_walk_common(ill, func, arg);
4899 mutex_exit(&ill->ill_lock);
4900 }
4901
4902 void
nce_flush(ill_t * ill,boolean_t flushall)4903 nce_flush(ill_t *ill, boolean_t flushall)
4904 {
4905 nce_t *nce, *nce_next;
4906 list_t dead;
4907
4908 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4909 mutex_enter(&ill->ill_lock);
4910 for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4911 nce_next = list_next(&ill->ill_nce, nce);
4912 if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4913 nce = nce_next;
4914 continue;
4915 }
4916 /*
4917 * nce_delete requires that the caller should either not
4918 * be holding locks, or should hold a ref to ensure that
4919 * we wont hit ncec_inactive. So take a ref and clean up
4920 * after the list is flushed.
4921 */
4922 nce_refhold(nce);
4923 nce_delete(nce);
4924 list_insert_tail(&dead, nce);
4925 nce = nce_next;
4926 }
4927 mutex_exit(&ill->ill_lock);
4928 while ((nce = list_head(&dead)) != NULL) {
4929 list_remove(&dead, nce);
4930 nce_refrele(nce);
4931 }
4932 ASSERT(list_is_empty(&dead));
4933 list_destroy(&dead);
4934 }
4935
4936 /* Return an interval that is anywhere in the [1 .. intv] range */
4937 static clock_t
nce_fuzz_interval(clock_t intv,boolean_t initial_time)4938 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4939 {
4940 clock_t rnd, frac;
4941
4942 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4943 /* Note that clock_t is signed; must chop off bits */
4944 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4945 if (initial_time) {
4946 if (intv <= 0)
4947 intv = 1;
4948 else
4949 intv = (rnd % intv) + 1;
4950 } else {
4951 /* Compute 'frac' as 20% of the configured interval */
4952 if ((frac = intv / 5) <= 1)
4953 frac = 2;
4954 /* Set intv randomly in the range [intv-frac .. intv+frac] */
4955 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4956 intv = 1;
4957 }
4958 return (intv);
4959 }
4960
4961 void
nce_resolv_ipmp_ok(ncec_t * ncec)4962 nce_resolv_ipmp_ok(ncec_t *ncec)
4963 {
4964 mblk_t *mp;
4965 uint_t pkt_len;
4966 iaflags_t ixaflags = IXAF_NO_TRACE;
4967 nce_t *under_nce;
4968 ill_t *ill = ncec->ncec_ill;
4969 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4970 ipif_t *src_ipif = NULL;
4971 ip_stack_t *ipst = ill->ill_ipst;
4972 ill_t *send_ill;
4973 uint_t nprobes;
4974
4975 ASSERT(IS_IPMP(ill));
4976
4977 mutex_enter(&ncec->ncec_lock);
4978 nprobes = ncec->ncec_nprobes;
4979 mp = ncec->ncec_qd_mp;
4980 ncec->ncec_qd_mp = NULL;
4981 ncec->ncec_nprobes = 0;
4982 mutex_exit(&ncec->ncec_lock);
4983
4984 while (mp != NULL) {
4985 mblk_t *nxt_mp;
4986
4987 nxt_mp = mp->b_next;
4988 mp->b_next = NULL;
4989 if (isv6) {
4990 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4991
4992 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4993 src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4994 ill, ALL_ZONES, ipst);
4995 } else {
4996 ipha_t *ipha = (ipha_t *)mp->b_rptr;
4997
4998 ixaflags |= IXAF_IS_IPV4;
4999 pkt_len = ntohs(ipha->ipha_length);
5000 src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5001 ill, ALL_ZONES, ipst);
5002 }
5003
5004 /*
5005 * find a new nce based on an under_ill. The first IPMP probe
5006 * packet gets queued, so we could still find a src_ipif that
5007 * matches an IPMP test address.
5008 */
5009 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5010 /*
5011 * if src_ipif is null, this could be either a
5012 * forwarded packet or a probe whose src got deleted.
5013 * We identify the former case by looking for the
5014 * ncec_nprobes: the first ncec_nprobes packets are
5015 * probes;
5016 */
5017 if (src_ipif == NULL && nprobes > 0)
5018 goto drop_pkt;
5019
5020 /*
5021 * For forwarded packets, we use the ipmp rotor
5022 * to find send_ill.
5023 */
5024 send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5025 B_TRUE);
5026 } else {
5027 send_ill = src_ipif->ipif_ill;
5028 ill_refhold(send_ill);
5029 }
5030
5031 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5032 (ncec_t *), ncec, (ipif_t *),
5033 src_ipif, (ill_t *), send_ill);
5034
5035 if (send_ill == NULL) {
5036 if (src_ipif != NULL)
5037 ipif_refrele(src_ipif);
5038 goto drop_pkt;
5039 }
5040 /* create an under_nce on send_ill */
5041 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5042 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5043 under_nce = nce_fastpath_create(send_ill, ncec);
5044 else
5045 under_nce = NULL;
5046 rw_exit(&ipst->ips_ill_g_lock);
5047 if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5048 nce_fastpath_trigger(under_nce);
5049
5050 ill_refrele(send_ill);
5051 if (src_ipif != NULL)
5052 ipif_refrele(src_ipif);
5053
5054 if (under_nce != NULL) {
5055 (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5056 ALL_ZONES, 0, NULL);
5057 nce_refrele(under_nce);
5058 if (nprobes > 0)
5059 nprobes--;
5060 mp = nxt_mp;
5061 continue;
5062 }
5063 drop_pkt:
5064 if (isv6) {
5065 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5066 } else {
5067 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5068 }
5069 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5070 freemsg(mp);
5071 if (nprobes > 0)
5072 nprobes--;
5073 mp = nxt_mp;
5074 }
5075 ncec_cb_dispatch(ncec); /* complete callbacks */
5076 }
5077