1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2019, Joyent, Inc. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/stropts.h> 32 #include <sys/strsun.h> 33 #include <sys/sysmacros.h> 34 #include <sys/errno.h> 35 #include <sys/dlpi.h> 36 #include <sys/socket.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/cmn_err.h> 40 #include <sys/debug.h> 41 #include <sys/vtrace.h> 42 #include <sys/kmem.h> 43 #include <sys/zone.h> 44 #include <sys/ethernet.h> 45 #include <sys/sdt.h> 46 #include <sys/mac.h> 47 48 #include <net/if.h> 49 #include <net/if_types.h> 50 #include <net/if_dl.h> 51 #include <net/route.h> 52 #include <netinet/in.h> 53 #include <netinet/ip6.h> 54 #include <netinet/icmp6.h> 55 56 #include <inet/common.h> 57 #include <inet/mi.h> 58 #include <inet/mib2.h> 59 #include <inet/nd.h> 60 #include <inet/ip.h> 61 #include <inet/ip_impl.h> 62 #include <inet/ipclassifier.h> 63 #include <inet/ip_if.h> 64 #include <inet/ip_ire.h> 65 #include <inet/ip_rts.h> 66 #include <inet/ip6.h> 67 #include <inet/ip_ndp.h> 68 
#include <inet/sctp_ip.h> 69 #include <inet/ip_arp.h> 70 #include <inet/ip2mac_impl.h> 71 72 #define ANNOUNCE_INTERVAL(isv6) \ 73 (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \ 74 ipst->ips_ip_arp_publish_interval) 75 76 #define DEFENSE_INTERVAL(isv6) \ 77 (isv6 ? ipst->ips_ndp_defend_interval : \ 78 ipst->ips_arp_defend_interval) 79 80 /* Non-tunable probe interval, based on link capabilities */ 81 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) 82 83 /* 84 * The IPv4 Link Local address space is special; we do extra duplicate checking 85 * there, as the entire assignment mechanism rests on random numbers. 86 */ 87 #define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ 88 ((uchar_t *)ptr)[1] == 254) 89 90 /* 91 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed 92 * in to the ncec*add* functions. 93 * 94 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that 95 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means 96 * that we will respond to requests for the protocol address. 97 */ 98 #define NCE_EXTERNAL_FLAGS_MASK \ 99 (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \ 100 NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \ 101 NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC) 102 103 /* 104 * Lock ordering: 105 * 106 * ndp_g_lock -> ill_lock -> ncec_lock 107 * 108 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and 109 * ncec_next. ncec_lock protects the contents of the NCE (particularly 110 * ncec_refcnt). 
111 */ 112 113 static void nce_cleanup_list(ncec_t *ncec); 114 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr); 115 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *, 116 ncec_t *); 117 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *); 118 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr, 119 uint16_t ncec_flags, nce_t **newnce); 120 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 121 uint16_t ncec_flags, nce_t **newnce); 122 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation, 123 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender, 124 const in6_addr_t *target, int flag); 125 static void ncec_refhold_locked(ncec_t *); 126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); 127 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); 128 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 129 uint16_t, uint16_t, nce_t **); 130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *); 131 static nce_t *nce_add(ill_t *, ncec_t *, list_t *); 132 static void nce_inactive(nce_t *); 133 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); 134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); 135 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 136 uint16_t, uint16_t, nce_t **); 137 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *, 138 uint16_t, uint16_t, nce_t **); 139 static int nce_add_v6_postprocess(nce_t *); 140 static int nce_add_v4_postprocess(nce_t *); 141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *); 142 static clock_t nce_fuzz_interval(clock_t, boolean_t); 143 static void nce_resolv_ipmp_ok(ncec_t *); 144 static void nce_walk_common(ill_t *, pfi_t, void *); 145 static void nce_start_timer(ncec_t *, uint_t); 146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *); 147 static void nce_fastpath_trigger(nce_t *); 148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *); 
#ifdef DEBUG
static void	ncec_trace_cleanup(const ncec_t *);
#endif

/* Hash-bucket selectors into the per-stack v4/v6 neighbor-cache tables. */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
	    NCE_TABLE_SIZE)]))

extern kmem_cache_t *ncec_cache;
extern kmem_cache_t *nce_cache;

/*
 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 * If src_ill is not null, the ncec_addr is bound to src_ill. The
 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 * IPMP cast_ill (in the IPMP case).
 *
 * Note that the probe interval is based on the src_ill for IPv6, and
 * the ncec_xmit_interval for IPv4.
 */
static void
nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
{
	boolean_t dropped;
	uint32_t probe_interval;

	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
	if (ncec->ncec_ipversion == IPV6_VERSION) {
		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
		probe_interval = ILL_PROBE_INTERVAL(src_ill);
	} else {
		/* IPv4 DAD delay the initial probe. */
		if (send_probe)
			dropped = arp_probe(ncec);
		else
			dropped = B_TRUE;
		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
		    !send_probe);
	}
	if (!dropped) {
		/* A probe actually went out; charge the probe count. */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_pcnt--;
		mutex_exit(&ncec->ncec_lock);
	}
	nce_restart_timer(ncec, probe_interval);
}

/*
 * Compute default flags to use for an advertisement of this ncec's address.
 */
static int
nce_advert_flags(const ncec_t *ncec)
{
	int flag = 0;

	if (ncec->ncec_flags & NCE_F_ISROUTER)
		flag |= NDP_ISROUTER;
	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
		flag |= NDP_ORIDE;

	return (flag);
}

/*
 * NDP Cache Entry creation routine.
 * This routine must always be called with ndp6->ndp_g_lock held.
 */
int
nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err;
	nce_t		*nce;

	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
	ASSERT(ill != NULL && ill->ill_isv6);

	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
	    &nce);
	if (err != 0)
		return (err);
	ASSERT(newnce != NULL);
	*newnce = nce;
	return (err);
}

/*
 * Post-processing routine to be executed after nce_add_v6(). This function
 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 * and must be called without any locks held.
 */
int
nce_add_v6_postprocess(nce_t *nce)
{
	ncec_t		*ncec = nce->nce_common;
	boolean_t	dropped = B_FALSE;
	uchar_t		*hw_addr = ncec->ncec_lladdr;
	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
	ill_t		*ill = ncec->ncec_ill;
	int		err = 0;
	uint16_t	flags = ncec->ncec_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	trigger_fastpath = B_TRUE;

	/*
	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
	 * We call nce_fastpath from nce_update if the link layer address of
	 * the peer changes from nce_update
	 */
	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
		trigger_fastpath = B_FALSE;

	if (trigger_fastpath)
		nce_fastpath_trigger(nce);
	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
		ill_t *hwaddr_ill;
		/*
		 * Unicast entry that needs DAD.
		 */
		if (IS_IPMP(ill)) {
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    hw_addr, hw_addr_len);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		err = EINPROGRESS;
	} else if (flags & NCE_F_UNSOL_ADV) {
		/*
		 * We account for the transmit below by assigning one
		 * less than the ndd variable. Subsequent decrements
		 * are done in nce_timer.
		 */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_unsolicit_count =
		    ipst->ips_ip_ndp_unsolicit_count - 1;
		mutex_exit(&ncec->ncec_lock);
		dropped = ndp_xmit(ill,
		    ND_NEIGHBOR_ADVERT,
		    hw_addr,
		    hw_addr_len,
		    &ncec->ncec_addr,	/* Source and target of the adv */
		    &ipv6_all_hosts_mcast, /* Destination of the packet */
		    nce_advert_flags(ncec));
		mutex_enter(&ncec->ncec_lock);
		if (dropped)
			ncec->ncec_unsolicit_count++;
		else
			ncec->ncec_last_time_defended = ddi_get_lbolt();
		if (ncec->ncec_unsolicit_count != 0) {
			nce_start_timer(ncec,
			    ipst->ips_ip_ndp_unsolicit_interval);
		}
		mutex_exit(&ncec->ncec_lock);
	}
	return (err);
}

/*
 * Atomically lookup and add (if needed) Neighbor Cache information for
 * an address.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t.
 The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * Local addresses are always created on the ill passed to nce_add_v6.
 */
int
nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	nce_t		*nce, *upper_nce = NULL;
	ill_t		*in_ill = ill;
	boolean_t	need_ill_refrele = B_FALSE;

	if (flags & NCE_F_MCAST) {
		/*
		 * hw_addr will be figured out in nce_set_multicast_v6;
		 * caller has to select the cast_ill
		 */
		ASSERT(hw_addr == NULL);
		ASSERT(!IS_IPMP(ill));
		err = nce_set_multicast_v6(ill, addr, flags, newnce);
		return (err);
	}
	ASSERT(ill->ill_isv6);
	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
		/* Non-local address: switch to (and refhold) the ipmp_ill. */
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill == NULL)
			return (ENXIO);
		need_ill_refrele = B_TRUE;
	}

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr);
	if (nce == NULL) {
		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
		    &nce);
	} else {
		err = EEXIST;
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	if (err == 0)
		err = nce_add_v6_postprocess(nce);
	if (in_ill != ill && nce != NULL) {
		nce_t *under_nce = NULL;

		/*
		 * in_ill was the under_ill. Try to create the under_nce.
		 * Hold the ill_g_lock to prevent changes to group membership
		 * until we are done.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
			    ill_t *, ill);
			rw_exit(&ipst->ips_ill_g_lock);
			err = ENXIO;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
		if (under_nce == NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			err = EINVAL;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		upper_nce = nce;
		nce = under_nce;	/* will be returned to caller */
		if (NCE_ISREACHABLE(nce->nce_common))
			nce_fastpath_trigger(under_nce);
	}
	/* nce_refrele is deferred until the lock is dropped */
	if (nce != NULL) {
		if (newnce != NULL)
			*newnce = nce;
		else
			nce_refrele(nce);
	}
bail:
	if (upper_nce != NULL)
		nce_refrele(upper_nce);
	if (need_ill_refrele)
		ill_refrele(ill);
	return (err);
}

/*
 * Remove all the CONDEMNED nces from the appropriate hash table.
 * We create a private list of NCEs, these may have ires pointing
 * to them, so the list will be passed through to clean up dependent
 * ires and only then we can do ncec_refrele() which can make NCE inactive.
 */
static void
nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
{
	ncec_t *ncec1;
	ncec_t **ptpn;

	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
	ASSERT(ndp->ndp_g_walker == 0);
	for (; ncec; ncec = ncec1) {
		ncec1 = ncec->ncec_next;
		mutex_enter(&ncec->ncec_lock);
		if (NCE_ISCONDEMNED(ncec)) {
			/* Unlink from the hash chain... */
			ptpn = ncec->ncec_ptpn;
			ncec1 = ncec->ncec_next;
			if (ncec1 != NULL)
				ncec1->ncec_ptpn = ptpn;
			*ptpn = ncec1;
			ncec->ncec_ptpn = NULL;
			ncec->ncec_next = NULL;
			/* ...and head-insert onto the private free list. */
			ncec->ncec_next = *free_nce_list;
			*free_nce_list = ncec;
		}
		mutex_exit(&ncec->ncec_lock);
	}
}

/*
 * 1. Mark the entry CONDEMNED.
 This ensures that no new nce_lookup()
 *    will return this NCE. Also no new timeouts will
 *    be started (See nce_restart_timer).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 */
void
ncec_delete(ncec_t *ncec)
{
	ncec_t	**ptpn;
	ncec_t	*ncec1;
	int	ipversion = ncec->ncec_ipversion;
	ndp_g_t	*ndp;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&ncec->ncec_lock);
	if (NCE_ISCONDEMNED(ncec)) {
		/* Some other thread is doing the delete */
		mutex_exit(&ncec->ncec_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	ncec->ncec_flags |= NCE_F_CONDEMNED;
	mutex_exit(&ncec->ncec_lock);

	/* Count how many condemned ires for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_nce_condemned);
	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

	/* Complete any waiting callbacks */
	ncec_cb_dispatch(ncec);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (ncec->ncec_timeout_id != 0) {
		(void) untimeout(ncec->ncec_timeout_id);
		ncec->ncec_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (ncec->ncec_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this ncec from
		 * the list after we marked the ncec CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Now remove the ncec from the list. nce_restart_timer won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = ncec->ncec_ptpn;
	ncec1 = ncec->ncec_next;
	if (ncec1 != NULL)
		ncec1->ncec_ptpn = ptpn;
	*ptpn = ncec1;
	ncec->ncec_ptpn = NULL;
	ncec->ncec_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/* Removed from ncec_ptpn/ncec_next list */
	ncec_refrele_notr(ncec);
}

/*
 * Final teardown of an ncec whose refcnt has reached zero (asserted below):
 * free any queued packets and the link-layer address, drop the ill's ncec
 * count, and return the ncec to its kmem cache.
 */
void
ncec_inactive(ncec_t *ncec)
{
	mblk_t **mpp;
	ill_t *ill = ncec->ncec_ill;
	ip_stack_t *ipst = ncec->ncec_ipst;

	ASSERT(ncec->ncec_refcnt == 0);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Count how many condemned nces for kmem_cache callback */
	if (NCE_ISCONDEMNED(ncec))
		atomic_add_32(&ipst->ips_num_nce_condemned, -1);

	/* Free all allocated messages */
	mpp = &ncec->ncec_qd_mp;
	while (*mpp != NULL) {
		mblk_t	*mp;

		mp = *mpp;
		*mpp = mp->b_next;

		inet_freemsg(mp);
	}
	/*
	 * must have been cleaned up in ncec_delete
	 */
	ASSERT(list_is_empty(&ncec->ncec_cb));
	list_destroy(&ncec->ncec_cb);
	/*
	 * free the ncec_lladdr if one was allocated in nce_add_common()
	 */
	if (ncec->ncec_lladdr_length > 0)
		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);

#ifdef DEBUG
	ncec_trace_cleanup(ncec);
#endif

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "ncec", (void *), ncec);
	ill->ill_ncec_cnt--;
	ncec->ncec_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&ncec->ncec_lock);
	kmem_cache_free(ncec_cache, ncec);
}

/*
 * ncec_walk routine. Delete the ncec if it is associated with the ill
 * that is going away. Always called as a writer.
 */
void
ncec_delete_per_ill(ncec_t *ncec, void *arg)
{
	if ((ncec != NULL) && ncec->ncec_ill == arg) {
		ncec_delete(ncec);
	}
}

/*
 * Neighbor Cache cleanup logic for a list of ncec_t entries.
 */
static void
nce_cleanup_list(ncec_t *ncec)
{
	ncec_t *ncec_next;

	ASSERT(ncec != NULL);
	while (ncec != NULL) {
		ncec_next = ncec->ncec_next;
		ncec->ncec_next = NULL;

		/*
		 * It is possible for the last ndp walker (this thread)
		 * to come here after ncec_delete has marked the ncec CONDEMNED
		 * and before it has removed the ncec from the fastpath list
		 * or called untimeout. So we need to do it here. It is safe
		 * for both ncec_delete and this thread to do it twice or
		 * even simultaneously since each of the threads has a
		 * reference on the ncec.
		 */
		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
		/*
		 * Cancel any running timer. Timeout can't be restarted
		 * since CONDEMNED is set. The ncec_lock can't be
		 * held across untimeout though passing invalid timeout
		 * id is fine.
		 */
		if (ncec->ncec_timeout_id != 0) {
			(void) untimeout(ncec->ncec_timeout_id);
			ncec->ncec_timeout_id = 0;
		}
		/* Removed from ncec_ptpn/ncec_next list */
		ncec_refrele_notr(ncec);
		ncec = ncec_next;
	}
}

/*
 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
 */
boolean_t
nce_restart_dad(ncec_t *ncec)
{
	boolean_t started;
	ill_t *ill, *hwaddr_ill;

	if (ncec == NULL)
		return (B_FALSE);
	ill = ncec->ncec_ill;
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_PROBE) {
		/* DAD is already in progress; nothing more to start. */
		mutex_exit(&ncec->ncec_lock);
		started = B_TRUE;
	} else if (ncec->ncec_state == ND_REACHABLE) {
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		/*
		 * Slight cheat here: we don't use the initial probe delay
		 * for IPv4 in this obscure case.
		 */
		mutex_exit(&ncec->ncec_lock);
		if (IS_IPMP(ill)) {
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		started = B_TRUE;
	} else {
		mutex_exit(&ncec->ncec_lock);
		started = B_FALSE;
	}
	return (started);
}

/*
 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
 * If one is found, the refcnt on the ncec will be incremented.
 */
ncec_t *
ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
{
	ncec_t *ncec;
	ip_stack_t *ipst = ill->ill_ipst;

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);

	/* Get head of v6 hash table */
	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
	ncec = ncec_lookup_illgrp(ill, addr, ncec);
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	rw_exit(&ipst->ips_ill_g_lock);
	return (ncec);
}
/*
 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
 * If one is found, the refcnt on the ncec will be incremented.
 */
ncec_t *
ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
{
	ncec_t	*ncec = NULL;
	in6_addr_t addr6;
	ip_stack_t *ipst = ill->ill_ipst;

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);

	/* Get head of v4 hash table */
	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
	/* v4 addresses are stored in the cache as IPv4-mapped IPv6. */
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	rw_exit(&ipst->ips_ill_g_lock);
	return (ncec);
}

/*
 * Cache entry lookup. Try to find an ncec matching the parameters passed.
 * If an ncec is found, increment the hold count on that ncec.
 * The caller passes in the start of the appropriate hash table, and must
 * be holding the appropriate global lock (ndp_g_lock). In addition, since
 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
 * must be held as reader.
 *
 * This function always matches across the ipmp group.
 */
ncec_t *
ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
{
	ndp_g_t		*ndp;
	ip_stack_t	*ipst = ill->ill_ipst;

	if (ill->ill_isv6)
		ndp = ipst->ips_ndp6;
	else
		ndp = ipst->ips_ndp4;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
	if (IN6_IS_ADDR_UNSPECIFIED(addr))
		return (NULL);
	for (; ncec != NULL; ncec = ncec->ncec_next) {
		if (ncec->ncec_ill == ill ||
		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
				/* CONDEMNED entries are invisible to lookup */
				mutex_enter(&ncec->ncec_lock);
				if (!NCE_ISCONDEMNED(ncec)) {
					ncec_refhold_locked(ncec);
					mutex_exit(&ncec->ncec_lock);
					break;
				}
				mutex_exit(&ncec->ncec_lock);
			}
		}
	}
	return (ncec);
}

/*
 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 * entries for ill only, i.e., when ill is part of an ipmp group,
 * nce_lookup_v4 will never try to match across the group.
 */
nce_t *
nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
{
	nce_t *nce;
	in6_addr_t addr6;
	ip_stack_t *ipst = ill->ill_ipst;

	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	nce = nce_lookup_addr(ill, &addr6);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	return (nce);
}

/*
 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 * entries for ill only, i.e., when ill is part of an ipmp group,
 * nce_lookup_v6 will never try to match across the group.
 */
nce_t *
nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
{
	nce_t *nce;
	ip_stack_t *ipst = ill->ill_ipst;

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr6);
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	return (nce);
}

/*
 * Common helper for nce_lookup_v[46]: per-ill lookup of addr, done under
 * ill_lock.  The caller must hold the appropriate ndp_g_lock (asserted
 * under DEBUG).
 */
static nce_t *
nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
{
	nce_t *nce;

	ASSERT(ill != NULL);
#ifdef DEBUG
	if (ill->ill_isv6)
		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
	else
		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
#endif
	mutex_enter(&ill->ill_lock);
	nce = nce_lookup(ill, addr);
	mutex_exit(&ill->ill_lock);
	return (nce);
}


/*
 * Router turned to host. We need to make sure that cached copies of the ncec
 * are not used for forwarding packets if they were derived from the default
 * route, and that the default route itself is removed, as required by
 * section 7.2.5 of RFC 2461.
 *
 * Note that the ncec itself probably has valid link-layer information for the
 * nexthop, so that there is no reason to delete the ncec, as long as the
 * ISROUTER flag is turned off.
 */
static void
ncec_router_to_host(ncec_t *ncec)
{
	ire_t	*ire;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_flags &= ~NCE_F_ISROUTER;
	mutex_exit(&ncec->ncec_lock);

	/* Remove the default route that pointed at this (former) router. */
	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
	if (ire != NULL) {
		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
		ire_delete(ire);
		ire_refrele(ire);
	}
}

/*
 * Process passed in parameters either from an incoming packet or via
 * user ioctl.
 */
void
nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
	ill_t	*ill = ncec->ncec_ill;
	uint32_t hw_addr_len = ill->ill_phys_addr_length;
	boolean_t ll_updated = B_FALSE;
	boolean_t ll_changed;
	nce_t	*nce;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	/*
	 * No updates of link layer address or the neighbor state is
	 * allowed, when the cache is in NONUD state. This still
	 * allows for responding to reachability solicitation.
	 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_INCOMPLETE) {
		if (hw_addr == NULL) {
			mutex_exit(&ncec->ncec_lock);
			return;
		}
		nce_set_ll(ncec, hw_addr);
		/*
		 * Update ncec state and send the queued packets
		 * back to ip this time ire will be added.
		 */
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		nce = nce_fastpath(ncec, B_TRUE, NULL);
		nce_resolv_ok(ncec);
		if (nce != NULL)
			nce_refrele(nce);
		return;
	}
	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
	if (!is_adv) {
		/* If this is a SOLICITATION request only */
		if (ll_changed)
			nce_update(ncec, ND_STALE, hw_addr);
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	}
	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
		/* If in any other state than REACHABLE, ignore */
		if (ncec->ncec_state == ND_REACHABLE) {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	} else {
		if (ll_changed) {
			nce_update(ncec, ND_UNCHANGED, hw_addr);
			ll_updated = B_TRUE;
		}
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			if (ll_updated) {
				nce_update(ncec, ND_STALE, NULL);
			}
		}
		mutex_exit(&ncec->ncec_lock);
		/* Advert without the Router flag: demote router to host. */
		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
		    NCE_F_ISROUTER)) {
			ncec_router_to_host(ncec);
		} else {
			ncec_cb_dispatch(ncec);
		}
	}
}

/*
 * Pass arg1 to the cbf supplied, along with each ncec in existence.
 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 * walking the hash list.
 */
void
ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
    void *arg1, boolean_t trace)
{
	ncec_t	*ncec;
	ncec_t	*ncec1;
	ncec_t	**ncep;
	ncec_t	*free_nce_list = NULL;

	mutex_enter(&ndp->ndp_g_lock);
	/* Prevent ncec_delete from unlink and free of NCE */
	ndp->ndp_g_walker++;
	mutex_exit(&ndp->ndp_g_lock);
	for (ncep = ndp->nce_hash_tbl;
	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
			ncec1 = ncec->ncec_next;
			if (ill == NULL || ncec->ncec_ill == ill) {
				if (trace) {
					ncec_refhold(ncec);
					(*cbf)(ncec, arg1);
					ncec_refrele(ncec);
				} else {
					ncec_refhold_notr(ncec);
					(*cbf)(ncec, arg1);
					ncec_refrele_notr(ncec);
				}
			}
		}
	}
	mutex_enter(&ndp->ndp_g_lock);
	ndp->ndp_g_walker--;
	/* Last walker out handles any deletes deferred during the walk. */
	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
		/* Time to delete condemned entries */
		for (ncep = ndp->nce_hash_tbl;
		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
			ncec = *ncep;
			if (ncec != NULL) {
				nce_remove(ndp, ncec, &free_nce_list);
			}
		}
		ndp->ndp_g_walker_cleanup = B_FALSE;
	}

	mutex_exit(&ndp->ndp_g_lock);

	if (free_nce_list != NULL) {
		nce_cleanup_list(free_nce_list);
	}
}

/*
 * Walk everything.
 * Note that ill can be NULL hence can't derive the ipst from it.
 */
void
ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
{
	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
}

/*
 * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast
 * NCEs, and the number to reclaim if we hit the limit.  Used by
 * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
 * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
 */

/* Maximum number of multicast NCEs on an ill. */
uint_t ip_max_ill_mcast_nces = 16384;
/*
 * Number of NCEs to delete if we hit the maximum above.  0 means *don't* and
 * return an error. Non-zero means delete so many, and if the number is >=
 * the max above, that means delete them all.
 */
uint_t ip_ill_mcast_reclaim = 256;

/*
 * Encapsulate multicast ill capping in a function, for easier DTrace
 * detections. Return a list of refheld NCEs to destroy-via-refrele. That
 * list can be NULL, but can only be non-NULL if we successfully reclaimed.
 *
 * NOTE: This function must be called while holding the ill_lock AND
 * JUST PRIOR to making the insertion into the ill_nce list.
 *
 * We can't release the ones we delete ourselves because the ill_lock is held
 * by the caller. They are, instead, passed back in a list_t for deletion
 * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
 *
 * While this covers nce_t, ncec_t gets done even further down the road. See
 * nce_graveyard_free() for why.
 *
 * Returns B_TRUE when the caller must fail the insertion (we are at the
 * cap and could reclaim nothing); B_FALSE when the insertion may proceed.
 */
static boolean_t
nce_too_many_mcast(ill_t *ill, list_t *graveyard)
{
	uint_t reclaim_count, max_count, reclaimed = 0;
	boolean_t too_many;
	nce_t *nce, *deadman;

	ASSERT(graveyard != NULL);
	ASSERT(list_is_empty(graveyard));
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	/*
	 * NOTE: Some grinning weirdo may have lowered the global max beyond
	 * what this ill currently has. The behavior in this case will be
	 * trim-back just by the reclaim amount for any new ones.
	 */
	max_count = ip_max_ill_mcast_nces;
	reclaim_count = min(ip_ill_mcast_reclaim, max_count);

	/* All good? */
	if (ill->ill_mcast_nces < max_count)
		return (B_FALSE);	/* Yes, all good. */

	if (reclaim_count == 0)
		return (B_TRUE);	/* Don't bother - we're stuck. */

	/* We need to reclaim now. Exploit our held ill_lock. */

	/*
	 * Start at the tail and work backwards, new nces are head-inserted,
	 * so we'll be reaping the oldest entries.
	 */
	nce = list_tail(&ill->ill_nce);
	while (reclaimed < reclaim_count) {
		/* Skip ahead to a multicast NCE. */
		while (nce != NULL &&
		    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
			nce = list_prev(&ill->ill_nce, nce);
		}
		if (nce == NULL)
			break;

		/*
		 * NOTE: For now, we just delete the first one(s) we find.
		 * This is not optimal, and may require some inspection of nce
		 * & its ncec to be better.
		 */
		deadman = nce;
		nce = list_prev(&ill->ill_nce, nce);

		/* nce_delete() requires caller holds... */
		nce_refhold(deadman);
		nce_delete(deadman);	/* Bumps down ill_mcast_nces. */

		/* Link the dead ones singly, still refheld... */
		list_insert_tail(graveyard, deadman);
		reclaimed++;
	}

	if (reclaimed != reclaim_count) {
		/* We didn't have enough to reach reclaim_count. Why?!? */
		DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
		    uint_t, reclaimed, uint_t, reclaim_count);

		/* In case for some REALLY weird reason we found none! */
		too_many = (reclaimed == 0);
	} else {
		too_many = B_FALSE;
	}

	return (too_many);
}

/*
 * ncec_walk() callback.  Reap a multicast ncec on the ill passed in `arg'
 * when its only remaining references are the hash table's and the walk
 * infrastructure's, i.e. when no nce_t points at it any more.
 */
static void
ncec_mcast_reap_one(ncec_t *ncec, void *arg)
{
	boolean_t reapit;
	ill_t *ill = (ill_t *)arg;

	/* Obvious no-lock-needed checks... */
	if (ncec == NULL || ncec->ncec_ill != ill ||
	    (ncec->ncec_flags & NCE_F_MCAST) == 0)
		return;

	mutex_enter(&ncec->ncec_lock);
	/*
	 * It's refheld by the walk infrastructure. It has one reference for
	 * being in the ndp_g_hash, and if an nce_t exists, that's one more.
	 * We want ones without an nce_t, so 2 is the magic number. If it's
	 * LESS than 2, we have much bigger problems anyway.
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	reapit = (ncec->ncec_refcnt == 2);
	mutex_exit(&ncec->ncec_lock);

	if (reapit) {
		IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
		ncec_delete(ncec);
	}
}

/*
 * Attempt to reap stray multicast ncec_t structures left in the wake of
 * nce_graveyard_free(). This is a taskq servicing routine, as it's well
 * outside any netstack-global locks being held - ndp_g_lock in this case. We
 * have a reference hold on the ill, which will prevent any unplumbing races.
 */
static void
ncec_mcast_reap(void *arg)
{
	ill_t *ill = (ill_t *)arg;

	IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
	ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
	mutex_enter(&ill->ill_lock);
	/* Allow a new cleanup to be scheduled by nce_graveyard_free(). */
	ill->ill_mcast_ncec_cleanup = B_FALSE;
	/*
	 * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
	 * below for why.
	 */
	ill->ill_refcnt--;
	if (ill->ill_refcnt == 0)
		ipif_ill_refrele_tail(ill);	/* Drops ill_lock. */
	else
		mutex_exit(&ill->ill_lock);
}

/*
 * Free a list (including handling an empty list or NULL list) of
 * reference-held NCEs that were reaped from a nce_too_many_mcast()
 * call. Separate because the caller must have dropped ndp_g_lock first.
 *
 * This also schedules a taskq task to unlink underlying NCECs from the
 * ndp_g_hash, which are protected by ndp_g_lock.
 */
static void
nce_graveyard_free(list_t *graveyard)
{
	nce_t *deadman, *current;
	ill_t *ill;
	boolean_t doit;

	if (graveyard == NULL)
		return;

	current = list_head(graveyard);
	if (current == NULL) {
		list_destroy(graveyard);
		return;
	}

	/* All graveyard entries share one ill (asserted below). */
	ill = current->nce_ill;
	/*
	 * Normally one should ill_refhold(ill) here. There's no _notr()
	 * variant like there is for ire_t, dce_t, or even ncec_t, but this is
	 * the ONLY case that'll break the mh_trace that IP debugging uses for
	 * reference counts (i.e. they assume same thread releases as
	 * holds). Instead, we inline ill_refhold() here. We must do the same
	 * in the release done by the ncec_mcast_reap() above.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_refcnt++;
	mutex_exit(&ill->ill_lock);

	/* Drop the references taken in nce_too_many_mcast(). */
	while (current != NULL) {
		ASSERT3P(ill, ==, current->nce_ill);
		deadman = current;
		current = list_next(graveyard, deadman);
		list_remove(graveyard, deadman);
		ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
		    0);
		nce_refrele(deadman);
	}
	list_destroy(graveyard);

	/* Schedule at most one ncec_mcast_reap() per ill at a time. */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_mcast_ncec_cleanup)
		doit = B_FALSE;
	else {
		ill->ill_mcast_ncec_cleanup = B_TRUE;
		doit = B_TRUE;
	}
	mutex_exit(&ill->ill_lock);
	if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
	    ill, TQ_NOSLEEP) == TASKQID_INVALID) {
		/* Not dispatching: undo our state and inlined refhold. */
		mutex_enter(&ill->ill_lock);
		if (doit) {
			IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
			ill->ill_mcast_ncec_cleanup = B_FALSE;
		}
		/* There's no _notr() for ill_refrele(), so inline it here. */
		ill->ill_refcnt--;
		if (ill->ill_refcnt == 0)
			ipif_ill_refrele_tail(ill);	/* Drops ill_lock */
		else
			mutex_exit(&ill->ill_lock);
	}
}

/*
 * For each interface an entry is added for the unspecified multicast group.
 * Here that mapping is used to form the multicast cache entry for a particular
 * multicast destination.
 */
static int
nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
	uchar_t		*hw_addr;
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	nce_t		*nce;

	ASSERT(ill != NULL);
	ASSERT(ill->ill_isv6);
	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	/* An existing entry is returned refheld; just hand it back. */
	nce = nce_lookup_addr(ill, dst);
	if (nce != NULL) {
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		goto done;
	}
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated.
		 */
		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			return (ENOMEM);
		}
		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
	} else {
		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
		hw_addr = NULL;
	}
	ASSERT((flags & NCE_F_MCAST) != 0);
	ASSERT((flags & NCE_F_NONUD) != 0);
	/* nce_state will be computed by nce_add_common() */
	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
	    ND_UNCHANGED, &nce);
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	/* Postprocessing (timers etc.) must run without ndp_g_lock held. */
	if (err == 0)
		err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_nd_lla_len);
	if (err != 0) {
		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
		return (err);
	}
done:
	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
	/* Transfer our reference to the caller, or drop it. */
	if (newnce != NULL)
		*newnce = nce;
	else
		nce_refrele(nce);
	return (0);
}

/*
 * Return the link layer address, and any flags of a ncec.
 */
int
ndp_query(ill_t *ill, struct lif_nd_req *lnr)
{
	ncec_t		*ncec;
	in6_addr_t	*addr;
	sin6_t		*sin6;

	ASSERT(ill != NULL && ill->ill_isv6);
	sin6 = (sin6_t *)&lnr->lnr_addr;
	addr = &sin6->sin6_addr;

	/*
	 * NOTE: if the ill is an IPMP interface, then match against the whole
	 * illgrp. This e.g. allows in.ndpd to retrieve the link layer
	 * addresses for the data addresses on an IPMP interface even though
	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
	 */
	ncec = ncec_lookup_illgrp_v6(ill, addr);
	if (ncec == NULL)
		return (ESRCH);
	/* If no link layer address is available yet, return ESRCH */
	if (!NCE_ISREACHABLE(ncec)) {
		ncec_refrele(ncec);
		return (ESRCH);
	}
	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
	    lnr->lnr_hdw_len);
	/*
	 * NOTE(review): lnr_flags is only overwritten outright when
	 * NCE_F_ISROUTER is set; otherwise the remaining flags are OR'd
	 * into whatever the caller passed in.  Presumably callers zero
	 * the request structure first -- confirm against the ioctl path.
	 */
	if (ncec->ncec_flags & NCE_F_ISROUTER)
		lnr->lnr_flags = NDF_ISROUTER_ON;
	if (ncec->ncec_flags & NCE_F_ANYCAST)
		lnr->lnr_flags |= NDF_ANYCAST_ON;
	if (ncec->ncec_flags & NCE_F_STATIC)
		lnr->lnr_flags |= NDF_STATIC;
	ncec_refrele(ncec);
	return (0);
}

/*
 * Finish setting up the Enable/Disable multicast for the driver.
1342 */ 1343 mblk_t * 1344 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1345 uint32_t hw_addr_offset, mblk_t *mp) 1346 { 1347 uchar_t *hw_addr; 1348 ipaddr_t v4group; 1349 uchar_t *addr; 1350 1351 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1352 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1353 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1354 1355 ASSERT(CLASSD(v4group)); 1356 ASSERT(!(ill->ill_isv6)); 1357 1358 addr = (uchar_t *)&v4group; 1359 } else { 1360 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1361 ASSERT(ill->ill_isv6); 1362 1363 addr = (uchar_t *)v6group; 1364 } 1365 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1366 if (hw_addr == NULL) { 1367 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1368 freemsg(mp); 1369 return (NULL); 1370 } 1371 1372 ip_mcast_mapping(ill, addr, hw_addr); 1373 return (mp); 1374 } 1375 1376 void 1377 ip_ndp_resolve(ncec_t *ncec) 1378 { 1379 in_addr_t sender4 = INADDR_ANY; 1380 in6_addr_t sender6 = ipv6_all_zeros; 1381 ill_t *src_ill; 1382 uint32_t ms; 1383 1384 src_ill = nce_resolve_src(ncec, &sender6); 1385 if (src_ill == NULL) { 1386 /* Make sure we try again later */ 1387 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1388 nce_restart_timer(ncec, (clock_t)ms); 1389 return; 1390 } 1391 if (ncec->ncec_ipversion == IPV4_VERSION) 1392 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1393 mutex_enter(&ncec->ncec_lock); 1394 if (ncec->ncec_ipversion == IPV6_VERSION) 1395 ms = ndp_solicit(ncec, sender6, src_ill); 1396 else 1397 ms = arp_request(ncec, sender4, src_ill); 1398 mutex_exit(&ncec->ncec_lock); 1399 if (ms == 0) { 1400 if (ncec->ncec_state != ND_REACHABLE) { 1401 if (ncec->ncec_ipversion == IPV6_VERSION) 1402 ndp_resolv_failed(ncec); 1403 else 1404 arp_resolv_failed(ncec); 1405 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1406 nce_make_unreachable(ncec); 1407 ncec_delete(ncec); 1408 } 1409 } else { 1410 nce_restart_timer(ncec, (clock_t)ms); 1411 } 1412 done: 1413 ill_refrele(src_ill); 1414 } 1415 
/*
 * Send an IPv6 neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
 * The optional source address is used as a hint to ndp_solicit for
 * which source to use in the packet.
 *
 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
 * the packet.
 */
uint32_t
ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
{
	in6_addr_t	dst;
	boolean_t	dropped = B_FALSE;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* No retransmit credits left: tell the caller to abort. */
	if (ncec->ncec_rcnt == 0)
		return (0);

	dst = ncec->ncec_addr;
	ncec->ncec_rcnt--;
	/* Drop the lock across the (potentially blocking) transmit. */
	mutex_exit(&ncec->ncec_lock);
	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
	    ill->ill_phys_addr_length, &src, &dst, 0);
	mutex_enter(&ncec->ncec_lock);
	/* The packet never went out; restore the retransmit credit. */
	if (dropped)
		ncec->ncec_rcnt++;
	return (ncec->ncec_ill->ill_reachable_retrans_time);
}

/*
 * Attempt to recover an address on an interface that's been marked as a
 * duplicate. Because NCEs are destroyed when the interface goes down, there's
 * no easy way to just probe the address and have the right thing happen if
 * it's no longer in use. Instead, we just bring it up normally and allow the
 * regular interface start-up logic to probe for a remaining duplicate and take
 * us back down if necessary.
 * Neither DHCP nor temporary addresses arrive here; they're excluded by
 * ip_ndp_excl.
 */
/* ARGSUSED */
void
ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	ipif_t	*ipif;
	/* The message body carries the duplicate address (v4 or v6). */
	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
	boolean_t addr_equal;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		/*
		 * We do not support recovery of proxy ARP'd interfaces,
		 * because the system lacks a complete proxy ARP mechanism.
		 */
		if (ill->ill_isv6) {
			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    addr6);
		} else {
			addr_equal = (ipif->ipif_lcl_addr == *addr4);
		}

		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
			continue;

		/*
		 * If we have already recovered or if the interface is going
		 * away, then ignore.
		 */
		mutex_enter(&ill->ill_lock);
		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}

		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ill->ill_ipif_dup_count--;
		mutex_exit(&ill->ill_lock);
		/* Remember we were a duplicate so a relapse isn't noisy. */
		ipif->ipif_was_dup = B_TRUE;

		/* Bring the ipif up; start-up DAD will re-detect conflicts. */
		if (ill->ill_isv6) {
			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
			(void) ipif_up_done_v6(ipif);
		} else {
			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
			    EINPROGRESS);
			(void) ipif_up_done(ipif);
		}
	}
	freeb(mp);
}

/*
 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
 * As long as someone else holds the address, the interface will stay down.
 * When that conflict goes away, the interface is brought back up. This is
 * done so that accidental shutdowns of addresses aren't made permanent. Your
 * server will recover from a failure.
 *
 * For DHCP and temporary addresses, recovery is not done in the kernel.
 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
 *
 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
 */
void
ipif_dup_recovery(void *arg)
{
	ipif_t *ipif = arg;

	/* The timer has fired; a new one may now be scheduled. */
	ipif->ipif_recovery_id = 0;
	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
		return;

	/*
	 * No lock, because this is just an optimization.
	 */
	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
		return;

	/* If the link is down, we'll retry this later */
	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
		return;

	ipif_do_recovery(ipif);
}

/*
 * Perform interface recovery by forcing the duplicate interfaces up and
 * allowing the system to determine which ones should stay up.
 *
 * Called both by recovery timer expiry and link-up notification.
 */
void
ipif_do_recovery(ipif_t *ipif)
{
	ill_t		*ill = ipif->ipif_ill;
	mblk_t		*mp;
	ip_stack_t	*ipst = ill->ill_ipst;
	size_t		mp_size;

	if (ipif->ipif_isv6)
		mp_size = sizeof (ipif->ipif_v6lcl_addr);
	else
		mp_size = sizeof (ipif->ipif_lcl_addr);
	mp = allocb(mp_size, BPRI_MED);
	if (mp == NULL) {
		/* Allocation failed: re-arm the timer and try again later. */
		mutex_enter(&ill->ill_lock);
		if (ipst->ips_ip_dup_recovery > 0 &&
		    ipif->ipif_recovery_id == 0 &&
		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
		}
		mutex_exit(&ill->ill_lock);
	} else {
		/*
		 * A recovery timer may still be running if we got here from
		 * ill_restart_dad(); cancel that timer.
		 */
		if (ipif->ipif_recovery_id != 0)
			(void) untimeout(ipif->ipif_recovery_id);
		ipif->ipif_recovery_id = 0;

		/* Ship the duplicate address to ip_addr_recover() as writer. */
		if (ipif->ipif_isv6) {
			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
			    sizeof (ipif->ipif_v6lcl_addr));
		} else {
			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
			    sizeof (ipif->ipif_lcl_addr));
		}
		ill_refhold(ill);
		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
		    B_FALSE);
	}
}

/*
 * Find the MAC and IP addresses in an NA/NS message.
 */
static void
ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
    in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
{
	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
	uchar_t *addr;
	int alen;

	/* icmp_inbound_v6 ensures this */
	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);

	/* The link-layer source was stashed by the receive path. */
	addr = ira->ira_l2src;
	alen = ill->ill_phys_addr_length;
	if (alen > 0) {
		*haddr = addr;
		*haddrlenp = alen;
	} else {
		*haddr = NULL;
		*haddrlenp = 0;
	}

	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
	*targp = ns->nd_ns_target;
}

/*
 * This is for exclusive changes due to NDP duplicate address detection
 * failure.
1629 */ 1630 /* ARGSUSED */ 1631 static void 1632 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1633 { 1634 ill_t *ill = rq->q_ptr; 1635 ipif_t *ipif; 1636 uchar_t *haddr; 1637 uint_t haddrlen; 1638 ip_stack_t *ipst = ill->ill_ipst; 1639 in6_addr_t targ; 1640 ip_recv_attr_t iras; 1641 mblk_t *attrmp; 1642 1643 attrmp = mp; 1644 mp = mp->b_cont; 1645 attrmp->b_cont = NULL; 1646 if (!ip_recv_attr_from_mblk(attrmp, &iras)) { 1647 /* The ill or ip_stack_t disappeared on us */ 1648 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1649 ip_drop_input("ip_recv_attr_from_mblk", mp, ill); 1650 freemsg(mp); 1651 ira_cleanup(&iras, B_TRUE); 1652 return; 1653 } 1654 1655 ASSERT(ill == iras.ira_rill); 1656 1657 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); 1658 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { 1659 /* 1660 * Ignore conflicts generated by misbehaving switches that 1661 * just reflect our own messages back to us. For IPMP, we may 1662 * see reflections across any ill in the illgrp. 1663 * 1664 * RFC2462 and revisions tried to detect both the case 1665 * when a statically configured IPv6 address is a duplicate, 1666 * and the case when the L2 address itself is a duplicate. The 1667 * later is important because, with stateles address autoconf, 1668 * if the L2 address is a duplicate, the resulting IPv6 1669 * address(es) would also be duplicates. We rely on DAD of the 1670 * IPv6 address itself to detect the latter case. 1671 */ 1672 /* For an under ill_grp can change under lock */ 1673 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1674 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || 1675 IS_UNDER_IPMP(ill) && 1676 ipmp_illgrp_find_ill(ill->ill_grp, haddr, 1677 haddrlen) != NULL) { 1678 rw_exit(&ipst->ips_ill_g_lock); 1679 goto ignore_conflict; 1680 } 1681 rw_exit(&ipst->ips_ill_g_lock); 1682 } 1683 1684 /* 1685 * Look up the appropriate ipif. 
1686 */ 1687 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); 1688 if (ipif == NULL) 1689 goto ignore_conflict; 1690 1691 /* Reload the ill to match the ipif */ 1692 ill = ipif->ipif_ill; 1693 1694 /* If it's already duplicate or ineligible, then don't do anything. */ 1695 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { 1696 ipif_refrele(ipif); 1697 goto ignore_conflict; 1698 } 1699 1700 /* 1701 * If this is a failure during duplicate recovery, then don't 1702 * complain. It may take a long time to recover. 1703 */ 1704 if (!ipif->ipif_was_dup) { 1705 char ibuf[LIFNAMSIZ]; 1706 char hbuf[MAC_STR_LEN]; 1707 char sbuf[INET6_ADDRSTRLEN]; 1708 1709 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 1710 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" 1711 " disabled", ibuf, 1712 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), 1713 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); 1714 } 1715 mutex_enter(&ill->ill_lock); 1716 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1717 ipif->ipif_flags |= IPIF_DUPLICATE; 1718 ill->ill_ipif_dup_count++; 1719 mutex_exit(&ill->ill_lock); 1720 (void) ipif_down(ipif, NULL, NULL); 1721 (void) ipif_down_tail(ipif); 1722 mutex_enter(&ill->ill_lock); 1723 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1724 ill->ill_net_type == IRE_IF_RESOLVER && 1725 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 1726 ipst->ips_ip_dup_recovery > 0) { 1727 ASSERT(ipif->ipif_recovery_id == 0); 1728 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1729 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1730 } 1731 mutex_exit(&ill->ill_lock); 1732 ipif_refrele(ipif); 1733 1734 ignore_conflict: 1735 freemsg(mp); 1736 ira_cleanup(&iras, B_TRUE); 1737 } 1738 1739 /* 1740 * Handle failure by tearing down the ipifs with the specified address. Note 1741 * that tearing down the ipif also means deleting the ncec through ipif_down, so 1742 * it's not possible to do recovery by just restarting the ncec timer. 
Instead, 1743 * we start a timer on the ipif. 1744 * Caller has to free mp; 1745 */ 1746 static void 1747 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) 1748 { 1749 const uchar_t *haddr; 1750 ill_t *ill = ira->ira_rill; 1751 1752 /* 1753 * Ignore conflicts generated by misbehaving switches that just 1754 * reflect our own messages back to us. 1755 */ 1756 1757 /* icmp_inbound_v6 ensures this */ 1758 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1759 haddr = ira->ira_l2src; 1760 if (haddr != NULL && 1761 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { 1762 return; 1763 } 1764 1765 if ((mp = copymsg(mp)) != NULL) { 1766 mblk_t *attrmp; 1767 1768 attrmp = ip_recv_attr_to_mblk(ira); 1769 if (attrmp == NULL) { 1770 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1771 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1772 freemsg(mp); 1773 } else { 1774 ASSERT(attrmp->b_cont == NULL); 1775 attrmp->b_cont = mp; 1776 mp = attrmp; 1777 ill_refhold(ill); 1778 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, 1779 B_FALSE); 1780 } 1781 } 1782 } 1783 1784 /* 1785 * Handle a discovered conflict: some other system is advertising that it owns 1786 * one of our IP addresses. We need to defend ourselves, or just shut down the 1787 * interface. 1788 * 1789 * Handles both IPv4 and IPv6 1790 */ 1791 boolean_t 1792 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) 1793 { 1794 ipif_t *ipif; 1795 clock_t now; 1796 uint_t maxdefense; 1797 uint_t defs; 1798 ill_t *ill = ira->ira_ill; 1799 ip_stack_t *ipst = ill->ill_ipst; 1800 uint32_t elapsed; 1801 boolean_t isv6 = ill->ill_isv6; 1802 ipaddr_t ncec_addr; 1803 1804 if (isv6) { 1805 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, 1806 ipst); 1807 } else { 1808 if (arp_no_defense) { 1809 /* 1810 * Yes, there is a conflict, but no, we do not 1811 * defend ourself. 
1812 */ 1813 return (B_TRUE); 1814 } 1815 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 1816 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, 1817 ipst); 1818 } 1819 if (ipif == NULL) 1820 return (B_FALSE); 1821 1822 /* 1823 * First, figure out if this address is disposable. 1824 */ 1825 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) 1826 maxdefense = ipst->ips_ip_max_temp_defend; 1827 else 1828 maxdefense = ipst->ips_ip_max_defend; 1829 1830 /* 1831 * Now figure out how many times we've defended ourselves. Ignore 1832 * defenses that happened long in the past. 1833 */ 1834 now = ddi_get_lbolt(); 1835 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; 1836 mutex_enter(&ncec->ncec_lock); 1837 if ((defs = ncec->ncec_defense_count) > 0 && 1838 elapsed > ipst->ips_ip_defend_interval) { 1839 /* 1840 * ip_defend_interval has elapsed. 1841 * reset the defense count. 1842 */ 1843 ncec->ncec_defense_count = defs = 0; 1844 } 1845 ncec->ncec_defense_count++; 1846 ncec->ncec_last_time_defended = now; 1847 mutex_exit(&ncec->ncec_lock); 1848 ipif_refrele(ipif); 1849 1850 /* 1851 * If we've defended ourselves too many times already, then give up and 1852 * tear down the interface(s) using this address. 1853 * Otherwise, caller has to defend by sending out an announce. 1854 */ 1855 if (defs >= maxdefense) { 1856 if (isv6) 1857 ndp_failure(mp, ira); 1858 else 1859 arp_failure(mp, ira); 1860 } else { 1861 return (B_TRUE); /* caller must defend this address */ 1862 } 1863 return (B_FALSE); 1864 } 1865 1866 /* 1867 * Handle reception of Neighbor Solicitation messages. 
1868 */ 1869 static void 1870 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) 1871 { 1872 ill_t *ill = ira->ira_ill, *under_ill; 1873 nd_neighbor_solicit_t *ns; 1874 uint32_t hlen = ill->ill_phys_addr_length; 1875 uchar_t *haddr = NULL; 1876 icmp6_t *icmp_nd; 1877 ip6_t *ip6h; 1878 ncec_t *our_ncec = NULL; 1879 in6_addr_t target; 1880 in6_addr_t src; 1881 int len; 1882 int flag = 0; 1883 nd_opt_hdr_t *opt = NULL; 1884 boolean_t bad_solicit = B_FALSE; 1885 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1886 boolean_t need_ill_refrele = B_FALSE; 1887 1888 ip6h = (ip6_t *)mp->b_rptr; 1889 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1890 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1891 src = ip6h->ip6_src; 1892 ns = (nd_neighbor_solicit_t *)icmp_nd; 1893 target = ns->nd_ns_target; 1894 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 1895 IN6_IS_ADDR_LOOPBACK(&target)) { 1896 if (ip_debug > 2) { 1897 /* ip1dbg */ 1898 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 1899 AF_INET6, &target); 1900 } 1901 bad_solicit = B_TRUE; 1902 goto done; 1903 } 1904 if (len > sizeof (nd_neighbor_solicit_t)) { 1905 /* Options present */ 1906 opt = (nd_opt_hdr_t *)&ns[1]; 1907 len -= sizeof (nd_neighbor_solicit_t); 1908 if (!ndp_verify_optlen(opt, len)) { 1909 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1910 bad_solicit = B_TRUE; 1911 goto done; 1912 } 1913 } 1914 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1915 /* Check to see if this is a valid DAD solicitation */ 1916 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1917 if (ip_debug > 2) { 1918 /* ip1dbg */ 1919 pr_addr_dbg("ndp_input_solicit: IPv6 " 1920 "Destination is not solicited node " 1921 "multicast %s\n", AF_INET6, 1922 &ip6h->ip6_dst); 1923 } 1924 bad_solicit = B_TRUE; 1925 goto done; 1926 } 1927 } 1928 1929 /* 1930 * NOTE: with IPMP, it's possible the nominated multicast ill (which 1931 * received this packet if it's multicast) is not the ill tied to 1932 * e.g. 
the IPMP ill's data link-local. So we match across the illgrp 1933 * to ensure we find the associated NCE. 1934 */ 1935 our_ncec = ncec_lookup_illgrp_v6(ill, &target); 1936 /* 1937 * If this is a valid Solicitation for an address we are publishing, 1938 * then a PUBLISH entry should exist in the cache 1939 */ 1940 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { 1941 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1942 "ifname=%s ", ill->ill_name)); 1943 if (ip_debug > 2) { 1944 /* ip1dbg */ 1945 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1946 } 1947 if (our_ncec == NULL) 1948 bad_solicit = B_TRUE; 1949 goto done; 1950 } 1951 1952 /* At this point we should have a verified NS per spec */ 1953 if (opt != NULL) { 1954 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1955 if (opt != NULL) { 1956 haddr = (uchar_t *)&opt[1]; 1957 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1958 hlen == 0) { 1959 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1960 bad_solicit = B_TRUE; 1961 goto done; 1962 } 1963 } 1964 } 1965 1966 /* If sending directly to peer, set the unicast flag */ 1967 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1968 flag |= NDP_UNICAST; 1969 1970 /* 1971 * Create/update the entry for the soliciting node on the ipmp_ill. 1972 * or respond to outstanding queries, don't if 1973 * the source is unspecified address. 1974 */ 1975 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1976 int err; 1977 nce_t *nnce; 1978 1979 ASSERT(ill->ill_isv6); 1980 /* 1981 * Regular solicitations *must* include the Source Link-Layer 1982 * Address option. Ignore messages that do not. 1983 */ 1984 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1985 ip1dbg(("ndp_input_solicit: source link-layer address " 1986 "option missing with a specified source.\n")); 1987 bad_solicit = B_TRUE; 1988 goto done; 1989 } 1990 1991 /* 1992 * This is a regular solicitation. 
If we're still in the 1993 * process of verifying the address, then don't respond at all 1994 * and don't keep track of the sender. 1995 */ 1996 if (our_ncec->ncec_state == ND_PROBE) 1997 goto done; 1998 1999 /* 2000 * If the solicitation doesn't have sender hardware address 2001 * (legal for unicast solicitation), then process without 2002 * installing the return NCE. Either we already know it, or 2003 * we'll be forced to look it up when (and if) we reply to the 2004 * packet. 2005 */ 2006 if (haddr == NULL) 2007 goto no_source; 2008 2009 under_ill = ill; 2010 if (IS_UNDER_IPMP(under_ill)) { 2011 ill = ipmp_ill_hold_ipmp_ill(under_ill); 2012 if (ill == NULL) 2013 ill = under_ill; 2014 else 2015 need_ill_refrele = B_TRUE; 2016 } 2017 err = nce_lookup_then_add_v6(ill, 2018 haddr, hlen, 2019 &src, /* Soliciting nodes address */ 2020 0, 2021 ND_STALE, 2022 &nnce); 2023 2024 if (need_ill_refrele) { 2025 ill_refrele(ill); 2026 ill = under_ill; 2027 need_ill_refrele = B_FALSE; 2028 } 2029 switch (err) { 2030 case 0: 2031 /* done with this entry */ 2032 nce_refrele(nnce); 2033 break; 2034 case EEXIST: 2035 /* 2036 * B_FALSE indicates this is not an an advertisement. 2037 */ 2038 nce_process(nnce->nce_common, haddr, 0, B_FALSE); 2039 nce_refrele(nnce); 2040 break; 2041 default: 2042 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 2043 err)); 2044 goto done; 2045 } 2046 no_source: 2047 flag |= NDP_SOLICITED; 2048 } else { 2049 /* 2050 * No source link layer address option should be present in a 2051 * valid DAD request. 2052 */ 2053 if (haddr != NULL) { 2054 ip1dbg(("ndp_input_solicit: source link-layer address " 2055 "option present with an unspecified source.\n")); 2056 bad_solicit = B_TRUE; 2057 goto done; 2058 } 2059 if (our_ncec->ncec_state == ND_PROBE) { 2060 /* 2061 * Internally looped-back probes will have 2062 * IRAF_L2SRC_LOOPBACK set so we can ignore our own 2063 * transmissions. 
	 */
			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
				/*
				 * If someone else is probing our address,
				 * then we've crossed wires.  Declare failure.
				 */
				ndp_failure(mp, ira);
			}
			goto done;
		}
		/*
		 * This is a DAD probe. Multicast the advertisement to the
		 * all-nodes address.
		 */
		src = ipv6_all_hosts_mcast;
	}
	flag |= nce_advert_flags(our_ncec);
	(void) ndp_xmit(ill,
	    ND_NEIGHBOR_ADVERT,
	    our_ncec->ncec_lladdr,
	    our_ncec->ncec_lladdr_length,
	    &target,	/* Source and target of the advertisement pkt */
	    &src,	/* IP Destination (source of original pkt) */
	    flag);
done:
	if (bad_solicit)
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
	if (our_ncec != NULL)
		ncec_refrele(our_ncec);
}

/*
 * Handle reception of Neighbor Advertisement messages.
 *
 * Validates the NA per RFC 4861 (multicast destination must not carry the
 * SOLICITED flag; the target must not be multicast, v4-mapped, or loopback;
 * any Target Link-Layer Address option must have a sane length), then either
 * detects an address conflict against an address we publish or feeds the
 * advertisement into the neighbor cache via nce_process().
 */
void
ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	nd_neighbor_advert_t *na;
	uint32_t	hlen = ill->ill_phys_addr_length;
	uchar_t		*haddr = NULL;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	ncec_t		*dst_ncec = NULL;
	in6_addr_t	target;
	nd_opt_hdr_t	*opt = NULL;
	int		len;
	ip_stack_t	*ipst = ill->ill_ipst;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;

	ip6h = (ip6_t *)mp->b_rptr;
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	na = (nd_neighbor_advert_t *)icmp_nd;

	/*
	 * An NA sent to a multicast destination must not claim to be
	 * solicited (RFC 4861 section 7.1.2).
	 */
	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
		ip1dbg(("ndp_input_advert: Target is multicast but the "
		    "solicited flag is not zero\n"));
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
		return;
	}
	target = na->nd_na_target;
	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
	    IN6_IS_ADDR_LOOPBACK(&target)) {
		if (ip_debug > 2) {
			/*
			 * ip1dbg
			 * NOTE(review): the message text below says
			 * "ndp_input_solicit" but this is the advertisement
			 * path — looks like a stale copy/paste; string left
			 * unchanged here.
			 */
			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
			    AF_INET6, &target);
		}
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
		return;
	}
	if (len > sizeof (nd_neighbor_advert_t)) {
		/* Options follow the fixed NA header. */
		opt = (nd_opt_hdr_t *)&na[1];
		if (!ndp_verify_optlen(opt,
		    len - sizeof (nd_neighbor_advert_t))) {
			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
			return;
		}
		/* At this point we have a verified NA per spec */
		len -= sizeof (nd_neighbor_advert_t);
		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
		if (opt != NULL) {
			haddr = (uchar_t *)&opt[1];
			/*
			 * The option payload (nd_opt_len is in 8-byte units)
			 * must be able to hold our link-layer address.
			 */
			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
			    hlen == 0) {
				ip1dbg(("ndp_input_advert: bad SLLA\n"));
				BUMP_MIB(mib,
				    ipv6IfIcmpInBadNeighborAdvertisements);
				return;
			}
		}
	}

	/*
	 * NOTE: we match across the illgrp since we need to do DAD for all of
	 * our local addresses, and those are spread across all the active
	 * ills in the group.
	 */
	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
		return;

	if (NCE_PUBLISH(dst_ncec)) {
		/*
		 * Someone just advertised an address that we publish. First,
		 * check if it was us -- if so, we can safely ignore it.
		 * We don't get the haddr from the ira_l2src because, in the
		 * case that the packet originated from us, on an IPMP group,
		 * the ira_l2src would be the link-layer address of the
		 * cast_ill used to send the packet, which may not be the same
		 * as the dst_ncec->ncec_lladdr of the address.
		 */
		if (haddr != NULL) {
			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
				goto out;

			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
				goto out; /* from us -- no conflict */

			/*
			 * If we're in an IPMP group, check if this is an echo
			 * from another ill in the group. Use the double-
			 * checked locking pattern to avoid grabbing
			 * ill_g_lock in the non-IPMP case.
			 */
			if (IS_UNDER_IPMP(ill)) {
				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
				    ill->ill_grp, haddr, hlen) != NULL) {
					rw_exit(&ipst->ips_ill_g_lock);
					goto out;
				}
				rw_exit(&ipst->ips_ill_g_lock);
			}
		}

		/*
		 * This appears to be a real conflict.  If we're trying to
		 * configure this NCE (ND_PROBE), then shut it down.
		 * Otherwise, handle the discovered conflict.
		 */
		if (dst_ncec->ncec_state == ND_PROBE) {
			ndp_failure(mp, ira);
		} else {
			if (ip_nce_conflict(mp, ira, dst_ncec)) {
				char hbuf[MAC_STR_LEN];
				char sbuf[INET6_ADDRSTRLEN];

				cmn_err(CE_WARN,
				    "node '%s' is using %s on %s",
				    inet_ntop(AF_INET6, &target, sbuf,
				    sizeof (sbuf)),
				    haddr == NULL ? "<none>" :
				    mac_colon_addr(haddr, hlen, hbuf,
				    sizeof (hbuf)), ill->ill_name);
				/*
				 * RFC 4862, Section 5.4.4 does not mandate
				 * any specific behavior when an NA matches
				 * a non-tentative address assigned to the
				 * receiver. We make the choice of defending
				 * our address, based on the assumption that
				 * the sender has not detected the Duplicate.
				 *
				 * ncec_last_time_defended has been adjusted
				 * in ip_nce_conflict()
				 */
				(void) ndp_announce(dst_ncec);
			}
		}
	} else {
		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
			dst_ncec->ncec_flags |= NCE_F_ISROUTER;

		/* B_TRUE indicates this is an advertisement */
		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
	}
out:
	ncec_refrele(dst_ncec);
}

/*
 * Process NDP neighbor solicitation/advertisement messages.
 * The checksum has already been checked o.k. before reaching here.
 * Information about the datalink header is contained in ira_l2src, but
 * that should be ignored for loopback packets.
 *
 * This is the common entry point for inbound ND traffic: it validates the
 * IPv6/ICMPv6 envelope (hop limit, next header, code, minimum length) and
 * dispatches to ndp_input_solicit() or ndp_input_advert().  The message is
 * always consumed (freemsg) before returning.
 */
void
ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_rill;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	int		len;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
	ill_t		*orig_ill = NULL;

	/*
	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
	 * and make it be the IPMP upper so avoid being confused by a packet
	 * addressed to a unicast address on a different ill.
	 */
	if (IS_UNDER_IPMP(ill)) {
		orig_ill = ill;
		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
		if (ill == NULL) {
			/* No IPMP upper ill could be held; drop the packet. */
			ill = orig_ill;
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
			    mp, ill);
			freemsg(mp);
			return;
		}
		ASSERT(ill != orig_ill);
		/*
		 * Temporarily swap ira_ill to the IPMP upper; restored at
		 * "done" below before we return.
		 */
		orig_ill = ira->ira_ill;
		ira->ira_ill = ill;
		mib = ill->ill_icmp6_mib;
	}
	/* Flatten the message so the headers can be walked linearly. */
	if (!pullupmsg(mp, -1)) {
		ip1dbg(("ndp_input: pullupmsg failed\n"));
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
		goto done;
	}
	ip6h = (ip6_t *)mp->b_rptr;
	/*
	 * ND packets must arrive with the maximum hop limit (255); a lower
	 * value indicates the packet was forwarded and is therefore not
	 * from an on-link neighbor (per RFC 4861 validation rules).
	 */
	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
		goto done;
	}
	/*
	 * NDP does not accept any extension headers between the
	 * IP header and the ICMP header since e.g. a routing
	 * header could be dangerous.
	 * This assumes that any AH or ESP headers are removed
	 * by ip prior to passing the packet to ndp_input.
	 */
	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
		    ip6h->ip6_nxt));
		ip_drop_input("Wrong next header", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
	if (icmp_nd->icmp6_code != 0) {
		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
		ip_drop_input("code non-zero", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	/*
	 * Make sure packet length is large enough for either
	 * a NS or a NA icmp packet.
	 */
	if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
		ip1dbg(("ndp_input: packet too short\n"));
		ip_drop_input("packet too short", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
		ndp_input_solicit(mp, ira);
	} else {
		ndp_input_advert(mp, ira);
	}
done:
	freemsg(mp);
	if (orig_ill != NULL) {
		/* Drop the IPMP upper hold and restore the original ill. */
		ill_refrele(ill);
		ira->ira_ill = orig_ill;
	}
}

/*
 * ndp_xmit is called to form and transmit a ND solicitation or
 * advertisement ICMP packet.
 *
 * If the source address is unspecified and this isn't a probe (used for
 * duplicate address detection), an appropriate source address and link layer
 * address will be chosen here.  The link layer address option is included if
 * the source is specified (i.e., all non-probe packets), and omitted (per the
 * specification) otherwise.
 *
 * It returns B_FALSE only if it does a successful put() to the
 * corresponding ill's ill_wq otherwise returns B_TRUE.
 */
static boolean_t
ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
	uint32_t	len;
	icmp6_t		*icmp6;
	mblk_t		*mp;
	ip6_t		*ip6h;
	nd_opt_hdr_t	*opt;
	uint_t		plen;
	zoneid_t	zoneid = GLOBAL_ZONEID;
	ill_t		*hwaddr_ill = ill;
	ip_xmit_attr_t	ixas;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	need_refrele = B_FALSE;
	boolean_t	probe = B_FALSE;

	if (IS_UNDER_IPMP(ill)) {
		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
		/*
		 * We send non-probe packets on the upper IPMP interface.
		 * ip_output_simple() will use cast_ill for sending any
		 * multicast packets. Note that we can't follow the same
		 * logic for probe packets because all interfaces in the ipmp
		 * group may have failed, so that we really want to only try
		 * to send the ND packet on the ill corresponding to the src
		 * address.
		 */
		if (!probe) {
			ill = ipmp_ill_hold_ipmp_ill(ill);
			if (ill != NULL)
				need_refrele = B_TRUE;
			else
				ill = hwaddr_ill;
		}
	}

	/*
	 * If we have a unspecified source(sender) address, select a
	 * proper source address for the solicitation here itself so
	 * that we can initialize the h/w address correctly.
	 *
	 * If the sender is specified then we use this address in order
	 * to lookup the zoneid before calling ip_output_v6(). This is to
	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
	 * by IP (we cannot guarantee that the global zone has an interface
	 * route to the destination).
	 *
	 * Note that the NA never comes here with the unspecified source
	 * address.
	 */

	/*
	 * Probes will have unspec src at this point.
	 */
	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
		/*
		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
		 * ALL_ZONES if it cannot find a matching ipif for the address
		 * we are trying to use. In this case we err on the side of
		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = GLOBAL_ZONEID;
	}

	/* Option length is in units of 8 octets, rounded up. */
	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
	/*
	 * Size using the NA header; NS and NA fixed headers are presumably
	 * the same size (both are an ICMPv6 header plus the target address),
	 * so this covers either operation.
	 */
	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
	mp = allocb(len, BPRI_LO);
	if (mp == NULL) {
		if (need_refrele)
			ill_refrele(ill);
		return (B_TRUE);
	}

	bzero((char *)mp->b_rptr, len);
	mp->b_wptr = mp->b_rptr + len;

	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;

	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
	ixas.ixa_ipst = ipst;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = NULL;
	ixas.ixa_zoneid = zoneid;

	ip6h = (ip6_t *)mp->b_rptr;
	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
	ip6h->ip6_nxt = IPPROTO_ICMPV6;
	/* ND packets must go out with the maximum hop limit. */
	ip6h->ip6_hops = IPV6_MAX_HOPS;
	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
	ip6h->ip6_dst = *target;
	icmp6 = (icmp6_t *)&ip6h[1];

	if (hw_addr_len != 0) {
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
		    sizeof (nd_neighbor_advert_t));
	} else {
		opt = NULL;
	}
	if (operation == ND_NEIGHBOR_SOLICIT) {
		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;

		if (opt != NULL && !(flag & NDP_PROBE)) {
			/*
			 * Note that we don't send out SLLA for ND probes
			 * per RFC 4862, even though we do send out the src
			 * haddr for IPv4 DAD probes, even though both IPv4
			 * and IPv6 go out with the unspecified/INADDR_ANY
			 * src IP addr.
			 */
			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
		}
		ip6h->ip6_src = *sender;
		ns->nd_ns_target = *target;
		if (!(flag & NDP_UNICAST)) {
			/* Form multicast address of the target */
			ip6h->ip6_dst = ipv6_solicited_node_mcast;
			ip6h->ip6_dst.s6_addr32[3] |=
			    ns->nd_ns_target.s6_addr32[3];
		}
	} else {
		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;

		ASSERT(!(flag & NDP_PROBE));
		if (opt != NULL)
			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		ip6h->ip6_src = *sender;
		na->nd_na_target = *sender;
		if (flag & NDP_ISROUTER)
			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
		if (flag & NDP_SOLICITED)
			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
		if (flag & NDP_ORIDE)
			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
	}

	if (!(flag & NDP_PROBE)) {
		if (hw_addr != NULL && opt != NULL) {
			/* Fill in link layer address and option len */
			opt->nd_opt_len = (uint8_t)plen;
			bcopy(hw_addr, &opt[1], hw_addr_len);
		}
	}
	if (opt != NULL && opt->nd_opt_type == 0) {
		/* If there's no link layer address option, then strip it. */
		len -= plen * 8;
		mp->b_wptr = mp->b_rptr + len;
		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
	}

	icmp6->icmp6_type = (uint8_t)operation;
	icmp6->icmp6_code = 0;
	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_output.c.
	 */
	icmp6->icmp6_cksum = ip6h->ip6_plen;

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
	if (need_refrele)
		ill_refrele(ill);
	return (B_FALSE);
}

/*
 * Used to set ND_UNREACHABLE before ncec_delete sets it NCE_F_CONDEMNED.
 * The datapath uses this as an indication that there
 * is a problem (as opposed to a NCE that was just
 * reclaimed due to lack of memory).
 * Note that static ARP entries never become unreachable.
 */
void
nce_make_unreachable(ncec_t *ncec)
{
	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_state = ND_UNREACHABLE;
	mutex_exit(&ncec->ncec_lock);
}

/*
 * NCE retransmit timer. Common to IPv4 and IPv6.
 * This timer goes off when:
 * a. It is time to retransmit a resolution for resolver.
 * b. It is time to send reachability probes.
 */
void
nce_timer(void *arg)
{
	ncec_t		*ncec = arg;
	ill_t		*ill = ncec->ncec_ill, *src_ill;
	char		addrbuf[INET6_ADDRSTRLEN];
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	in_addr_t	sender4 = INADDR_ANY;
	in6_addr_t	sender6 = ipv6_all_zeros;

	/*
	 * The timer has to be cancelled by ncec_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the ncec
	 */
	ASSERT(ncec != NULL);
	mutex_enter(&ncec->ncec_lock);
	ncec_refhold_locked(ncec);
	ncec->ncec_timeout_id = 0;
	mutex_exit(&ncec->ncec_lock);

	src_ill = nce_resolve_src(ncec, &sender6);
	/* if we could not find a sender address, return */
	if (src_ill == NULL) {
		if (!isv6) {
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
			    &sender4, addrbuf, sizeof (addrbuf))));
		} else {
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
		}
		/* Retry after one retransmit interval. */
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		ncec_refrele(ncec);
		return;
	}
	if (!isv6)
		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);

	mutex_enter(&ncec->ncec_lock);
	/*
	 * Check the reachability state.  Each case arm below is responsible
	 * for dropping ncec_lock before any transmit or timer call.
	 */
	switch (ncec->ncec_state) {
	case ND_DELAY:
		/* First unicast reachability probe; move DELAY -> PROBE. */
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		if (isv6) {
			mutex_exit(&ncec->ncec_lock);
			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
			    src_ill->ill_phys_addr,
			    src_ill->ill_phys_addr_length,
			    &sender6, &ncec->ncec_addr,
			    NDP_UNICAST);
		} else {
			dropped = (arp_request(ncec, sender4, src_ill) == 0);
			mutex_exit(&ncec->ncec_lock);
		}
		if (!dropped) {
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
		}
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("nce_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
		}
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		break;
	case ND_PROBE:
		/* must be retransmit timer */
		ASSERT(ncec->ncec_pcnt >= -1);
		if (ncec->ncec_pcnt > 0) {
			/*
			 * As per RFC2461, the ncec gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt,
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			if (NCE_PUBLISH(ncec)) {
				mutex_exit(&ncec->ncec_lock);
				/*
				 * send out a probe; note that src_ill
				 * is ignored by nce_dad() for all
				 * DAD message types other than IPv6
				 * unicast probes
				 */
				nce_dad(ncec, src_ill, B_TRUE);
			} else {
				ASSERT(src_ill != NULL);
				if (isv6) {
					mutex_exit(&ncec->ncec_lock);
					dropped = ndp_xmit(src_ill,
					    ND_NEIGHBOR_SOLICIT,
					    src_ill->ill_phys_addr,
					    src_ill->ill_phys_addr_length,
					    &sender6, &ncec->ncec_addr,
					    NDP_UNICAST);
				} else {
					/*
					 * since the nce is REACHABLE,
					 * the ARP request will be sent out
					 * as a link-layer unicast.
					 */
					dropped = (arp_request(ncec, sender4,
					    src_ill) == 0);
					mutex_exit(&ncec->ncec_lock);
				}
				if (!dropped) {
					mutex_enter(&ncec->ncec_lock);
					ncec->ncec_pcnt--;
					mutex_exit(&ncec->ncec_lock);
				}
				nce_restart_timer(ncec,
				    ill->ill_reachable_retrans_time);
			}
		} else if (ncec->ncec_pcnt < 0) {
			/* No hope, delete the ncec */
			/* Tell datapath it went bad */
			ncec->ncec_state = ND_UNREACHABLE;
			mutex_exit(&ncec->ncec_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("nce_timer: Delete NCE for"
				    " dst %s\n", (isv6 ? AF_INET6 : AF_INET),
				    &ncec->ncec_addr);
			}
			/* if static ARP can't delete. */
			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
				ncec_delete(ncec);

		} else if (!NCE_PUBLISH(ncec)) {
			/*
			 * Probe count is 0 for a dynamic entry (one that we
			 * ourselves are not publishing). We should never get
			 * here if NONUD was requested, hence the ASSERT below.
			 */
			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			/* Take pcnt negative so the next firing deletes it. */
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
			/* Wait one interval before killing */
			nce_restart_timer(ncec,
			    ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;
			ipaddr_t ncec_addr;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable.  Let IP know that it's ok to
			 * use.
			 */
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
			mutex_exit(&ncec->ncec_lock);
			if (isv6) {
				ipif = ipif_lookup_addr_exact_v6(
				    &ncec->ncec_addr, ill, ipst);
			} else {
				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
				    ncec_addr);
				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
				    ipst);
			}
			if (ipif != NULL) {
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ];
					char sbuf[INET6_ADDRSTRLEN];

					ipif->ipif_was_dup = B_FALSE;
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					ipif_get_name(ipif, ibuf,
					    sizeof (ibuf));
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready)
					ipif_up_notify(ipif);
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			if (!isv6 && arp_no_defense)
				break;
			/* Begin defending our new address */
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				if (isv6) {
					dropped = ndp_announce(ncec);
				} else {
					dropped = arp_announce(ncec);
				}

				if (dropped)
					ncec->ncec_unsolicit_count++;
				else
					ncec->ncec_last_time_defended =
					    ddi_get_lbolt();
			}
			if (ncec->ncec_unsolicit_count > 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else if (DEFENSE_INTERVAL(isv6) != 0) {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down.  Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			ncec->ncec_state = ND_REACHABLE;
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	case ND_INCOMPLETE: {
		mblk_t	*mp, *nextmp;
		mblk_t	**prevmpp;

		/*
		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
		 * for any IPMP probe packets, and toss them.  IPMP probe
		 * packets will always be at the head of ncec_qd_mp, so that
		 * we can stop at the first queued ND packet that is
		 * not a probe packet.
		 */
		prevmpp = &ncec->ncec_qd_mp;
		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
			nextmp = mp->b_next;

			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
				inet_freemsg(mp);
				ncec->ncec_nprobes--;
				*prevmpp = nextmp;
			} else {
				prevmpp = &mp->b_next;
			}
		}

		/*
		 * Must be resolver's retransmit timer.
		 */
		mutex_exit(&ncec->ncec_lock);
		ip_ndp_resolve(ncec);
		break;
	}
	case ND_REACHABLE:
		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
		    ncec->ncec_unsolicit_count != 0) ||
		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				mutex_exit(&ncec->ncec_lock);
				/*
				 * When we get to zero announcements left,
				 * switch to address defense
				 */
			} else {
				boolean_t rate_limit;

				mutex_exit(&ncec->ncec_lock);
				rate_limit = ill_defend_rate_limit(ill, ncec);
				if (rate_limit) {
					nce_restart_timer(ncec,
					    DEFENSE_INTERVAL(isv6));
					break;
				}
			}
			if (isv6) {
				dropped = ndp_announce(ncec);
			} else {
				dropped = arp_announce(ncec);
			}
			mutex_enter(&ncec->ncec_lock);
			if (dropped) {
				/* Transmit failed; retry this announcement. */
				ncec->ncec_unsolicit_count++;
			} else {
				ncec->ncec_last_time_defended =
				    ddi_get_lbolt();
			}
			mutex_exit(&ncec->ncec_lock);
			if (ncec->ncec_unsolicit_count != 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	default:
		mutex_exit(&ncec->ncec_lock);
		break;
	}
done:
	ncec_refrele(ncec);
	ill_refrele(src_ill);
}

/*
 * Set a link layer address from the ll_addr passed in.
 * Copy SAP from ill.
 */
static void
nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
{
	ill_t	*ill = ncec->ncec_ill;

	ASSERT(ll_addr != NULL);
	if (ill->ill_phys_addr_length > 0) {
		/*
		 * The bcopy() below used to be called for the physical address
		 * length rather than the link layer address length. For
		 * ethernet and many other media, the phys_addr and lla are
		 * identical.
		 *
		 * The phys_addr and lla may not be the same for devices that
		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
		 * no known instances of these.
		 *
		 * For PPP or other interfaces with a zero length
		 * physical address, don't do anything here.
		 * The bcopy() with a zero phys_addr length was previously
		 * a no-op for interfaces with a zero-length physical address.
		 * Using the lla for them would change the way they operate.
		 * Doing nothing in such cases preserves expected behavior.
		 */
		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
	}
}

/*
 * Compare ll_addr against the cached link-layer address of the ncec.
 * Returns B_TRUE if the addresses DIFFER (or if ll_addr is NULL),
 * B_FALSE if they match -- i.e. this is a "mismatch" predicate.
 */
boolean_t
nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
    uint32_t ll_addr_len)
{
	ASSERT(ncec->ncec_lladdr != NULL);
	if (ll_addr == NULL)
		return (B_FALSE);
	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
		return (B_TRUE);
	return (B_FALSE);
}

/*
 * Updates the link layer address or the reachability state of
 * a cache entry.  Reset probe counter if needed.
 * Called (and returns) with ncec_lock held; the lock is dropped and
 * reacquired internally around untimeout() and the fastpath update.
 */
void
nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
{
	ill_t		*ill = ncec->ncec_ill;
	boolean_t	need_stop_timer = B_FALSE;
	boolean_t	need_fastpath_update = B_FALSE;
	nce_t		*nce = NULL;
	timeout_id_t	tid;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	/*
	 * If this interface does not do NUD, there is no point
	 * in allowing an update to the cache entry.  Although
	 * we will respond to NS.
	 * The only time we accept an update for a resolver when
	 * NUD is turned off is when it has just been created.
	 * Non-Resolvers will always be created as REACHABLE.
	 */
	if (new_state != ND_UNCHANGED) {
		if ((ncec->ncec_flags & NCE_F_NONUD) &&
		    (ncec->ncec_state != ND_INCOMPLETE))
			return;
		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
		need_stop_timer = B_TRUE;
		if (new_state == ND_REACHABLE)
			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
		else {
			/* We force NUD in this case */
			ncec->ncec_last = 0;
		}
		ncec->ncec_state = new_state;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
		    new_state == ND_INCOMPLETE);
	}

	/* Capture the pending timeout id under the lock; cancel it below. */
	tid = 0;
	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
		tid = ncec->ncec_timeout_id;
		ncec->ncec_timeout_id = 0;
	}
	/*
	 * Re-trigger fastpath probe and
	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
	 * whatever packets that happens to be transmitting at the time.
	 */
	if (new_ll_addr != NULL) {
		bcopy(new_ll_addr, ncec->ncec_lladdr,
		    ill->ill_phys_addr_length);
		need_fastpath_update = B_TRUE;
	}
	mutex_exit(&ncec->ncec_lock);
	/* untimeout() may block, so it must be called without ncec_lock. */
	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
		if (tid != 0)
			(void) untimeout(tid);
	}
	if (need_fastpath_update) {
		/*
		 * Delete any existing dlur_mp and fp_mp information.
		 * For IPMP interfaces, all underlying ill's must be checked
		 * and purged.
		 */
		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
		/*
		 * add the new dlur_mp and fp_mp
		 */
		nce = nce_fastpath(ncec, B_TRUE, NULL);
		if (nce != NULL)
			nce_refrele(nce);
	}
	mutex_enter(&ncec->ncec_lock);
}

/*
 * Append (or head-insert) mp onto ncec_qd_mp, discarding the oldest
 * queued message whenever the queue exceeds ill_max_buf entries.
 */
static void
nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
	uint_t	count = 0;
	mblk_t	**mpp, *tmp;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Walk to the tail, trimming from the head if over the limit. */
	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
		if (++count > ncec->ncec_ill->ill_max_buf) {
			tmp = ncec->ncec_qd_mp->b_next;
			ncec->ncec_qd_mp->b_next = NULL;
			/*
			 * if we never create data addrs on the under_ill
			 * does this matter?
			 */
			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
			    ncec->ncec_ill);
			freemsg(ncec->ncec_qd_mp);
			ncec->ncec_qd_mp = tmp;
		}
	}

	if (head_insert) {
		/* IPMP probe packets jump the queue; see nce_queue_mp(). */
		ncec->ncec_nprobes++;
		mp->b_next = ncec->ncec_qd_mp;
		ncec->ncec_qd_mp = mp;
	} else {
		*mpp = mp;
	}
}

/*
 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
 * queued at the head or tail of the queue based on the input argument
 * 'head_insert'. The caller should specify this argument as B_TRUE if this
 * packet is an IPMP probe packet, in which case the following happens:
 *
 *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
 *	(non-ipmp_probe) load-spreading case where the source address of the ND
 *	packet is not tied to ncec_ill. If the ill bound to the source address
 *	cannot receive, the response to the ND packet will not be received.
 *	However, if ND packets for ncec_ill's probes are queued behind that ND
 *	packet, those probes will also fail to be sent, and thus in.mpathd will
 *	erroneously conclude that ncec_ill has also failed.
 *
 *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
 *	the first attempt.  This ensures that ND problems do not manifest as
 *	probe RTT spikes.
 *
 * We achieve this by inserting ipmp_probe() packets at the head of the
 * nce_queue.
 *
 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
 */
void
nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	nce_queue_mp_common(ncec, mp, head_insert);
}

/*
 * Called when address resolution failed due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 */
void
ndp_resolv_failed(ncec_t *ncec)
{
	mblk_t	*mp, *nxt_mp;
	char	buf[INET6_ADDRSTRLEN];
	ill_t	*ill = ncec->ncec_ill;
	ip_recv_attr_t	iras;

	bzero(&iras, sizeof (iras));
	iras.ira_flags = 0;
	/*
	 * we are setting the ira_rill to the ipmp_ill (instead of
	 * the actual ill on which the packet was received), but this
	 * is ok because we don't actually need the real ira_rill
	 * to send the icmp unreachable to the sender.
	 */
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	ip1dbg(("ndp_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
	/* Detach the whole pending queue under the lock, then drain it. */
	mutex_enter(&ncec->ncec_lock);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);
	while (mp != NULL) {
		nxt_mp = mp->b_next;
		mp->b_next = NULL;

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
		    mp, ill);
		icmp_unreachable_v6(mp,
		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* finish off waiting callbacks */
}

/*
 * Handle the completion of NDP and ARP resolution.
 * Drains the packets queued on the ncec while resolution was pending and
 * transmits each one via ip_xmit().
 */
void
nce_resolv_ok(ncec_t *ncec)
{
	mblk_t		*mp;
	uint_t		pkt_len;
	iaflags_t	ixaflags = IXAF_NO_TRACE;
	nce_t		*nce;
	ill_t		*ill = ncec->ncec_ill;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ip_stack_t	*ipst = ill->ill_ipst;

	if (IS_IPMP(ncec->ncec_ill)) {
		nce_resolv_ipmp_ok(ncec);
		return;
	}
	/* non IPMP case */

	/* Detach the pending queue under the lock, then drain it unlocked. */
	mutex_enter(&ncec->ncec_lock);
	ASSERT(ncec->ncec_nprobes == 0);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		/* Recover the packet length from the IP header. */
		if (ill->ill_isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
		}
		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		/*
		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
		 * longer available, but it's ok to drop this flag because TCP
		 * has its own flow-control in effect, so TCP packets
		 * are not likely to get here when flow-control is in effect.
		 */
		mutex_enter(&ill->ill_lock);
		nce = nce_lookup(ill, &ncec->ncec_addr);
		mutex_exit(&ill->ill_lock);

		if (nce == NULL) {
			/* The nce vanished underneath us; drop the packet. */
			if (isv6) {
				BUMP_MIB(&ipst->ips_ip6_mib,
				    ipIfStatsOutDiscards);
			} else {
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsOutDiscards);
			}
			ip_drop_output("ipIfStatsOutDiscards - no nce",
			    mp, NULL);
			freemsg(mp);
		} else {
			/*
			 * We don't know the zoneid, but
			 * ip_xmit does not care since IXAF_NO_TRACE
			 * is set. (We traced the packet the first
			 * time through ip_xmit.)
			 */
			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(nce);
		}
		mp = nxt_mp;
	}

	ncec_cb_dispatch(ncec);	/* complete callbacks */
}

/*
 * Called by SIOCSNDP* ioctl to add/change an ncec entry
 * and the corresponding attributes.
 * Disallow states other than ND_REACHABLE or ND_STALE.
 */
int
ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
{
	sin6_t		*sin6;
	in6_addr_t	*addr;
	ncec_t		*ncec;
	nce_t		*nce;
	int		err = 0;
	uint16_t	new_flags = 0;
	uint16_t	old_flags = 0;
	int		inflags = lnr->lnr_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	do_postprocess = B_FALSE;

	ASSERT(ill->ill_isv6);
	if ((lnr->lnr_state_create != ND_REACHABLE) &&
	    (lnr->lnr_state_create != ND_STALE))
		return (EINVAL);

	sin6 = (sin6_t *)&lnr->lnr_addr;
	addr = &sin6->sin6_addr;

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	ASSERT(!IS_UNDER_IPMP(ill));
	nce = nce_lookup_addr(ill, addr);
	if (nce != NULL)
		new_flags = nce->nce_common->ncec_flags;

	/* ISROUTER_ON and ISROUTER_OFF are mutually exclusive */
	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
	case NDF_ISROUTER_ON:
		new_flags |= NCE_F_ISROUTER;
		break;
	case NDF_ISROUTER_OFF:
		new_flags &= ~NCE_F_ISROUTER;
		break;
	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (nce != NULL)
			nce_refrele(nce);
		return (EINVAL);
	}
	if (inflags & NDF_STATIC)
		new_flags |= NCE_F_STATIC;

	/* likewise, ANYCAST_ON and ANYCAST_OFF are mutually exclusive */
	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
	case NDF_ANYCAST_ON:
		new_flags |= NCE_F_ANYCAST;
		break;
	case NDF_ANYCAST_OFF:
		new_flags &= ~NCE_F_ANYCAST;
		break;
	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (nce != NULL)
			nce_refrele(nce);
		return (EINVAL);
	}

	if (nce == NULL) {
		err = nce_add_v6(ill,
		    (uchar_t *)lnr->lnr_hdw_addr,
		    ill->ill_phys_addr_length,
		    addr,
		    new_flags,
		    lnr->lnr_state_create,
		    &nce);
		if (err != 0) {
			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
			return (err);
		} else {
			do_postprocess = B_TRUE;
		}
	}
	ncec = nce->nce_common;
	old_flags = ncec->ncec_flags;
	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
		/* the entry is being demoted from router to host */
		ncec_router_to_host(ncec);
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (do_postprocess)
			err = nce_add_v6_postprocess(nce);
		nce_refrele(nce);
		return (0);
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);

	if (do_postprocess)
		err = nce_add_v6_postprocess(nce);
	/*
	 * err cannot be anything other than 0 because we don't support
	 * proxy arp of static addresses.
	 */
	ASSERT(err == 0);

	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_flags = new_flags;
	mutex_exit(&ncec->ncec_lock);
	/*
	 * Note that we ignore the state at this point, which
	 * should be either STALE or REACHABLE. Instead we let
	 * the link layer address passed in to determine the state
	 * much like incoming packets.
	 */
	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
	nce_refrele(nce);
	return (0);
}

/*
 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
 * be held to ensure that they are in the same group.
 */
static nce_t *
nce_fastpath_create(ill_t *ill, ncec_t *ncec)
{

	nce_t *nce;

	nce = nce_ill_lookup_then_add(ill, ncec);

	/* no fastpath for loopback or vni interfaces */
	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
		return (nce);

	/*
	 * hold the ncec_lock to synchronize with nce_update() so that,
	 * at the end of this function, the contents of nce_dlur_mp are
	 * consistent with ncec->ncec_lladdr, even though some intermediate
	 * packet may have been sent out with a mangled address, which would
	 * only be a transient condition.
	 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_lladdr != NULL) {
		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
	} else {
		/* no lla yet; generate a zero-length DL_UNITDATA_REQ */
		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
		    ill->ill_sap_length);
	}
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}

/*
 * we make nce_fp_mp to have an M_DATA prepend.
 * The caller ensures there is hold on ncec for this function.
 * Note that since ill_fastpath_probe() copies the mblk there is
 * no need to hold the nce or ncec beyond this function.
 *
 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
 * and will be returned back by this function, so that no extra nce_refrele
 * is required for the caller. The calls from nce_add_common() use this
 * method. All other callers (that pass in NULL ncec_nce) will have to do a
 * nce_refrele of the returned nce (when it is non-null).
 */
static nce_t *
nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
	nce_t *nce;
	ill_t *ill = ncec->ncec_ill;

	ASSERT(ill != NULL);

	/* for IPMP, refresh the nce on the underlying ills instead */
	if (IS_IPMP(ill) && trigger_fp_req) {
		trigger_fp_req = B_FALSE;
		ipmp_ncec_refresh_nce(ncec);
	}

	/*
	 * If the caller already has the nce corresponding to the ill, use
	 * that one. Otherwise we have to lookup/add the nce. Calls from
	 * nce_add_common() fall in the former category, and have just done
	 * the nce lookup/add that can be reused.
	 */
	if (ncec_nce == NULL)
		nce = nce_fastpath_create(ill, ncec);
	else
		nce = ncec_nce;

	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
		return (nce);

	if (trigger_fp_req)
		nce_fastpath_trigger(nce);
	return (nce);
}

/*
 * Trigger fastpath on nce. No locks may be held.
 */
static void
nce_fastpath_trigger(nce_t *nce)
{
	int res;
	ill_t *ill = nce->nce_ill;
	ncec_t *ncec = nce->nce_common;

	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
	/*
	 * EAGAIN is an indication of a transient error
	 * i.e. allocation failure etc. leave the ncec in the list it
	 * will be updated when another probe happens for another ire
	 * if not it will be taken out of the list when the ire is
	 * deleted.
	 */
	if (res != 0 && res != EAGAIN && res != ENOTSUP)
		nce_fastpath_list_delete(ill, ncec, NULL);
}

/*
 * Add ncec to the nce fastpath list on ill.
 * Caller holds ill_lock; condemned nces (if any) are placed on 'graveyard'
 * for the caller to free after all locks are dropped.
 */
static nce_t *
nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
{
	nce_t *nce = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	/*
	 * Atomically ensure that the ill is not CONDEMNED and is not going
	 * down, before adding the NCE.
	 */
	if (ill->ill_state_flags & ILL_CONDEMNED)
		return (NULL);
	mutex_enter(&ncec->ncec_lock);
	/*
	 * if ncec has not been deleted and
	 * is not already in the list add it.
	 */
	if (!NCE_ISCONDEMNED(ncec)) {
		nce = nce_lookup(ill, &ncec->ncec_addr);
		if (nce != NULL)
			goto done;
		nce = nce_add(ill, ncec, graveyard);
	}
done:
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}

/*
 * Lookup (or add) the nce for ncec on ill, taking and dropping the ill_lock
 * around the locked variant, and freeing any graveyarded entries afterwards.
 */
static nce_t *
nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
{
	nce_t	*nce;
	list_t	graveyard;

	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
	mutex_exit(&ill->ill_lock);
	nce_graveyard_free(&graveyard);
	return (nce);
}


/*
 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
 * nce is added to the 'dead' list, and the caller must nce_refrele() the
 * entry after all locks have been dropped.
 */
void
nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
{
	nce_t *nce;

	ASSERT(ill != NULL);

	/* delete any nces referencing the ncec from underlying ills */
	if (IS_IPMP(ill))
		ipmp_ncec_delete_nce(ncec);

	/* now the ill itself */
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (nce->nce_common == ncec) {
			nce_refhold(nce);
			nce_delete(nce);
			break;
		}
	}
	mutex_exit(&ill->ill_lock);
	if (nce != NULL) {
		if (dead == NULL)
			nce_refrele(nce);
		else
			list_insert_tail(dead, nce);
	}
}

/*
 * when the fastpath response does not fit in the datab
 * associated with the existing nce_fp_mp, we delete and
 * add the nce to retrigger fastpath based on the information
 * in the ncec_t.
3475 */ 3476 static nce_t * 3477 nce_delete_then_add(nce_t *nce) 3478 { 3479 ill_t *ill = nce->nce_ill; 3480 nce_t *newnce = NULL; 3481 list_t graveyard; 3482 3483 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node)); 3484 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3485 (void *)nce, ill->ill_name)); 3486 mutex_enter(&ill->ill_lock); 3487 mutex_enter(&nce->nce_common->ncec_lock); 3488 nce_delete(nce); 3489 /* 3490 * Make sure that ncec is not condemned before adding. We hold the 3491 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3492 * ipmp_ncec_delete_nce() 3493 */ 3494 if (!NCE_ISCONDEMNED(nce->nce_common)) 3495 newnce = nce_add(ill, nce->nce_common, &graveyard); 3496 mutex_exit(&nce->nce_common->ncec_lock); 3497 mutex_exit(&ill->ill_lock); 3498 nce_graveyard_free(&graveyard); 3499 nce_refrele(nce); 3500 return (newnce); /* could be null if nomem */ 3501 } 3502 3503 typedef struct nce_fp_match_s { 3504 nce_t *nce_fp_match_res; 3505 mblk_t *nce_fp_match_ack_mp; 3506 } nce_fp_match_t; 3507 3508 /* ARGSUSED */ 3509 static int 3510 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3511 { 3512 nce_fp_match_t *nce_fp_marg = arg; 3513 ncec_t *ncec = nce->nce_common; 3514 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3515 uchar_t *mp_rptr, *ud_mp_rptr; 3516 mblk_t *ud_mp = nce->nce_dlur_mp; 3517 ptrdiff_t cmplen; 3518 3519 /* 3520 * mp is the mp associated with the fastpath ack. 3521 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3522 * under consideration. If the contents match, then the 3523 * fastpath ack is used to update the nce. 
3524 */ 3525 if (ud_mp == NULL) 3526 return (0); 3527 mp_rptr = mp->b_rptr; 3528 cmplen = mp->b_wptr - mp_rptr; 3529 ASSERT(cmplen >= 0); 3530 3531 ud_mp_rptr = ud_mp->b_rptr; 3532 /* 3533 * The ncec is locked here to prevent any other threads from accessing 3534 * and changing nce_dlur_mp when the address becomes resolved to an 3535 * lla while we're in the middle of looking at and comparing the 3536 * hardware address (lla). It is also locked to prevent multiple 3537 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3538 * time. 3539 */ 3540 mutex_enter(&ncec->ncec_lock); 3541 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3542 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3543 nce_fp_marg->nce_fp_match_res = nce; 3544 mutex_exit(&ncec->ncec_lock); 3545 nce_refhold(nce); 3546 return (1); 3547 } 3548 mutex_exit(&ncec->ncec_lock); 3549 return (0); 3550 } 3551 3552 /* 3553 * Update all NCE's that are not in fastpath mode and 3554 * have an nce_fp_mp that matches mp. mp->b_cont contains 3555 * the fastpath header. 3556 * 3557 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 
 */
void
nce_fastpath_update(ill_t *ill, mblk_t *mp)
{
	nce_fp_match_t nce_fp_marg;
	nce_t *nce;
	mblk_t *nce_fp_mp, *fp_mp;

	nce_fp_marg.nce_fp_match_res = NULL;
	nce_fp_marg.nce_fp_match_ack_mp = mp;

	/* find the nce whose DL_UNITDATA_REQ matches this fastpath ack */
	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);

	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
		return;

	mutex_enter(&nce->nce_lock);
	nce_fp_mp = nce->nce_fp_mp;

	if (nce_fp_mp != NULL) {
		fp_mp = mp->b_cont;
		/*
		 * If the new fastpath header doesn't fit in the existing
		 * datab, delete and re-add the nce (which drops and retakes
		 * nce_lock, so re-read nce_fp_mp afterwards).
		 */
		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
		    nce_fp_mp->b_datap->db_lim) {
			mutex_exit(&nce->nce_lock);
			nce = nce_delete_then_add(nce);
			if (nce == NULL) {
				return;
			}
			mutex_enter(&nce->nce_lock);
			nce_fp_mp = nce->nce_fp_mp;
		}
	}

	/* Matched - install mp as the fastpath mp */
	if (nce_fp_mp == NULL) {
		fp_mp = dupb(mp->b_cont);
		nce->nce_fp_mp = fp_mp;
	} else {
		fp_mp = mp->b_cont;
		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
		    + MBLKL(fp_mp);
	}
	mutex_exit(&nce->nce_lock);
	nce_refrele(nce);
}

/*
 * Return a pointer to a given option in the packet.
 * Assumes that option part of the packet have already been validated
 * (e.g. by ndp_verify_optlen(); a zero nd_opt_len here would loop forever).
 */
nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
{
	while (optlen > 0) {
		if (opt->nd_opt_type == opt_type)
			return (opt);
		/* nd_opt_len is in units of 8 octets */
		optlen -= 8 * opt->nd_opt_len;
		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
	}
	return (NULL);
}

/*
 * Verify all option lengths present are > 0, also check to see
 * if the option lengths and packet length are consistent.
 */
boolean_t
ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
{
	ASSERT(opt != NULL);
	while (optlen > 0) {
		/* a zero option length would make option walks loop forever */
		if (opt->nd_opt_len == 0)
			return (B_FALSE);
		optlen -= 8 * opt->nd_opt_len;
		/* option claims to extend past the end of the packet */
		if (optlen < 0)
			return (B_FALSE);
		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
	}
	return (B_TRUE);
}

/*
 * ncec_walk function.
 * Free a fraction of the NCE cache entries.
 *
 * A possible optimization here would be to use ncec_last where possible, and
 * delete the least-frequently used entry, which would require more complex
 * computation as we walk through the ncec's (e.g., track ncec entries by
 * order of ncec_last and/or maintain state)
 */
static void
ncec_cache_reclaim(ncec_t *ncec, void *arg)
{
	ip_stack_t	*ipst = ncec->ncec_ipst;
	uint_t		fraction = *(uint_t *)arg;
	uint_t		rand;

	/* never reclaim local, static or broadcast entries */
	if ((ncec->ncec_flags &
	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
		return;
	}

	/*
	 * Pseudo-randomly delete roughly 1 in `fraction' entries, seeded
	 * by lbolt and the address hash.
	 */
	rand = (uint_t)ddi_get_lbolt() +
	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
	if ((rand/fraction)*fraction == rand) {
		IP_STAT(ipst, ip_nce_reclaim_deleted);
		ncec_delete(ncec);
	}
}

/*
 * kmem_cache callback to free up memory.
 *
 * For now we just delete a fixed fraction.
 */
static void
ip_nce_reclaim_stack(ip_stack_t *ipst)
{
	uint_t	fraction = ipst->ips_ip_nce_reclaim_fraction;

	IP_STAT(ipst, ip_nce_reclaim_calls);

	ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);

	/*
	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_nce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	/* reclaim NCEs in every netstack on the system */
	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		ip_nce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

#ifdef DEBUG
/* record a refhold trace entry; disables tracing on allocation failure */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;

	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
		ncec->ncec_trace_disable = B_TRUE;
		ncec_trace_cleanup(ncec);
	}
}

/* record a refrele trace entry, unless tracing has been disabled */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (!ncec->ncec_trace_disable)
		th_trace_unref(ncec);
}

static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif

/*
 * Called when address resolution fails due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 */
void
arp_resolv_failed(ncec_t *ncec)
{
	mblk_t	*mp, *nxt_mp;
	char	buf[INET6_ADDRSTRLEN];
	struct in_addr	ipv4addr;
	ill_t	*ill = ncec->ncec_ill;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	ip_recv_attr_t	iras;

	bzero(&iras, sizeof (iras));
	iras.ira_flags = IRAF_IS_IPV4;
	/*
	 * we are setting the ira_rill to the ipmp_ill (instead of
	 * the actual ill on which the packet was received), but this
	 * is ok because we don't actually need the real ira_rill
	 * to send the icmp unreachable to the sender.
	 */
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
	ip3dbg(("arp_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
	/* detach the queue of packets that were awaiting resolution */
	mutex_enter(&ncec->ncec_lock);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);
	while (mp != NULL) {
		nxt_mp = mp->b_next;
		mp->b_next = NULL;

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
		    mp, ill);
		/* ICMP errors for failed ARP are administratively tunable */
		if (ipst->ips_ip_arp_icmp_error) {
			ip3dbg(("arp_resolv_failed: "
			    "Calling icmp_unreachable\n"));
			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
		} else {
			freemsg(mp);
		}
		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* finish off waiting callbacks */
}

/*
 * if ill is an under_ill, translate it to the ipmp_ill and add the
 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
 * one on the underlying in_ill) will be created for the
 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3809 */ 3810 int 3811 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3812 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3813 { 3814 int err; 3815 in6_addr_t addr6; 3816 ip_stack_t *ipst = ill->ill_ipst; 3817 nce_t *nce, *upper_nce = NULL; 3818 ill_t *in_ill = ill, *under = NULL; 3819 boolean_t need_ill_refrele = B_FALSE; 3820 3821 if (flags & NCE_F_MCAST) { 3822 /* 3823 * hw_addr will be figured out in nce_set_multicast_v4; 3824 * caller needs to pass in the cast_ill for ipmp 3825 */ 3826 ASSERT(hw_addr == NULL); 3827 ASSERT(!IS_IPMP(ill)); 3828 err = nce_set_multicast_v4(ill, addr, flags, newnce); 3829 return (err); 3830 } 3831 3832 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 3833 ill = ipmp_ill_hold_ipmp_ill(ill); 3834 if (ill == NULL) 3835 return (ENXIO); 3836 need_ill_refrele = B_TRUE; 3837 } 3838 if ((flags & NCE_F_BCAST) != 0) { 3839 /* 3840 * IPv4 broadcast ncec: compute the hwaddr. 3841 */ 3842 if (IS_IPMP(ill)) { 3843 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE); 3844 if (under == NULL) { 3845 if (need_ill_refrele) 3846 ill_refrele(ill); 3847 return (ENETDOWN); 3848 } 3849 hw_addr = under->ill_bcast_mp->b_rptr + 3850 NCE_LL_ADDR_OFFSET(under); 3851 hw_addr_len = under->ill_phys_addr_length; 3852 } else { 3853 hw_addr = ill->ill_bcast_mp->b_rptr + 3854 NCE_LL_ADDR_OFFSET(ill), 3855 hw_addr_len = ill->ill_phys_addr_length; 3856 } 3857 } 3858 3859 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3860 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3861 nce = nce_lookup_addr(ill, &addr6); 3862 if (nce == NULL) { 3863 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, 3864 state, &nce); 3865 } else { 3866 err = EEXIST; 3867 } 3868 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3869 if (err == 0) 3870 err = nce_add_v4_postprocess(nce); 3871 3872 if (in_ill != ill && nce != NULL) { 3873 nce_t *under_nce = NULL; 3874 3875 /* 3876 * in_ill was the under_ill. Try to create the under_nce. 
3877 * Hold the ill_g_lock to prevent changes to group membership 3878 * until we are done. 3879 */ 3880 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3881 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 3882 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 3883 ill_t *, ill); 3884 rw_exit(&ipst->ips_ill_g_lock); 3885 err = ENXIO; 3886 nce_refrele(nce); 3887 nce = NULL; 3888 goto bail; 3889 } 3890 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 3891 if (under_nce == NULL) { 3892 rw_exit(&ipst->ips_ill_g_lock); 3893 err = EINVAL; 3894 nce_refrele(nce); 3895 nce = NULL; 3896 goto bail; 3897 } 3898 rw_exit(&ipst->ips_ill_g_lock); 3899 upper_nce = nce; 3900 nce = under_nce; /* will be returned to caller */ 3901 if (NCE_ISREACHABLE(nce->nce_common)) 3902 nce_fastpath_trigger(under_nce); 3903 } 3904 if (nce != NULL) { 3905 if (newnce != NULL) 3906 *newnce = nce; 3907 else 3908 nce_refrele(nce); 3909 } 3910 bail: 3911 if (under != NULL) 3912 ill_refrele(under); 3913 if (upper_nce != NULL) 3914 nce_refrele(upper_nce); 3915 if (need_ill_refrele) 3916 ill_refrele(ill); 3917 3918 return (err); 3919 } 3920 3921 /* 3922 * NDP Cache Entry creation routine for IPv4. 3923 * This routine must always be called with ndp4->ndp_g_lock held. 3924 * Prior to return, ncec_refcnt is incremented. 3925 * 3926 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 3927 * are always added pointing at the ipmp_ill. Thus, when the ill passed 3928 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 3929 * entries will be created, both pointing at the same ncec_t. The nce_t 3930 * entries will have their nce_ill set to the ipmp_ill and the under_ill 3931 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 3932 * Local addresses are always created on the ill passed to nce_add_v4. 
3933 */ 3934 int 3935 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3936 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3937 { 3938 int err; 3939 boolean_t is_multicast = (flags & NCE_F_MCAST); 3940 struct in6_addr addr6; 3941 nce_t *nce; 3942 3943 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 3944 ASSERT(!ill->ill_isv6); 3945 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); 3946 3947 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3948 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, 3949 &nce); 3950 ASSERT(newnce != NULL); 3951 *newnce = nce; 3952 return (err); 3953 } 3954 3955 /* 3956 * Post-processing routine to be executed after nce_add_v4(). This function 3957 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 3958 * and must be called without any locks held. 3959 * 3960 * Always returns 0, but we return an int to keep this symmetric with the 3961 * IPv6 counter-part. 3962 */ 3963 int 3964 nce_add_v4_postprocess(nce_t *nce) 3965 { 3966 ncec_t *ncec = nce->nce_common; 3967 uint16_t flags = ncec->ncec_flags; 3968 boolean_t ndp_need_dad = B_FALSE; 3969 boolean_t dropped; 3970 clock_t delay; 3971 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; 3972 uchar_t *hw_addr = ncec->ncec_lladdr; 3973 boolean_t trigger_fastpath = B_TRUE; 3974 3975 /* 3976 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 3977 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
3978 * We call nce_fastpath from nce_update if the link layer address of 3979 * the peer changes from nce_update 3980 */ 3981 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && 3982 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) 3983 trigger_fastpath = B_FALSE; 3984 3985 if (trigger_fastpath) 3986 nce_fastpath_trigger(nce); 3987 3988 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 3989 /* 3990 * Either the caller (by passing in ND_PROBE) 3991 * or nce_add_common() (by the internally computed state 3992 * based on ncec_addr and ill_net_type) has determined 3993 * that this unicast entry needs DAD. Trigger DAD. 3994 */ 3995 ndp_need_dad = B_TRUE; 3996 } else if (flags & NCE_F_UNSOL_ADV) { 3997 /* 3998 * We account for the transmit below by assigning one 3999 * less than the ndd variable. Subsequent decrements 4000 * are done in nce_timer. 4001 */ 4002 mutex_enter(&ncec->ncec_lock); 4003 ncec->ncec_unsolicit_count = 4004 ipst->ips_ip_arp_publish_count - 1; 4005 mutex_exit(&ncec->ncec_lock); 4006 dropped = arp_announce(ncec); 4007 mutex_enter(&ncec->ncec_lock); 4008 if (dropped) 4009 ncec->ncec_unsolicit_count++; 4010 else 4011 ncec->ncec_last_time_defended = ddi_get_lbolt(); 4012 if (ncec->ncec_unsolicit_count != 0) { 4013 nce_start_timer(ncec, 4014 ipst->ips_ip_arp_publish_interval); 4015 } 4016 mutex_exit(&ncec->ncec_lock); 4017 } 4018 4019 /* 4020 * If ncec_xmit_interval is 0, user has configured us to send the first 4021 * probe right away. Do so, and set up for the subsequent probes. 4022 */ 4023 if (ndp_need_dad) { 4024 mutex_enter(&ncec->ncec_lock); 4025 if (ncec->ncec_pcnt == 0) { 4026 /* 4027 * DAD probes and announce can be 4028 * administratively disabled by setting the 4029 * probe_count to zero. Restart the timer in 4030 * this case to mark the ipif as ready. 
4031 */ 4032 ncec->ncec_unsolicit_count = 0; 4033 mutex_exit(&ncec->ncec_lock); 4034 nce_restart_timer(ncec, 0); 4035 } else { 4036 mutex_exit(&ncec->ncec_lock); 4037 delay = ((ncec->ncec_flags & NCE_F_FAST) ? 4038 ipst->ips_arp_probe_delay : 4039 ipst->ips_arp_fastprobe_delay); 4040 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); 4041 } 4042 } 4043 return (0); 4044 } 4045 4046 /* 4047 * ncec_walk routine to update all entries that have a given destination or 4048 * gateway address and cached link layer (MAC) address. This is used when ARP 4049 * informs us that a network-to-link-layer mapping may have changed. 4050 */ 4051 void 4052 nce_update_hw_changed(ncec_t *ncec, void *arg) 4053 { 4054 nce_hw_map_t *hwm = arg; 4055 ipaddr_t ncec_addr; 4056 4057 if (ncec->ncec_state != ND_REACHABLE) 4058 return; 4059 4060 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 4061 if (ncec_addr != hwm->hwm_addr) 4062 return; 4063 4064 mutex_enter(&ncec->ncec_lock); 4065 if (hwm->hwm_flags != 0) 4066 ncec->ncec_flags = hwm->hwm_flags; 4067 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); 4068 mutex_exit(&ncec->ncec_lock); 4069 } 4070 4071 void 4072 ncec_refhold(ncec_t *ncec) 4073 { 4074 mutex_enter(&(ncec)->ncec_lock); 4075 (ncec)->ncec_refcnt++; 4076 ASSERT((ncec)->ncec_refcnt != 0); 4077 #ifdef DEBUG 4078 ncec_trace_ref(ncec); 4079 #endif 4080 mutex_exit(&(ncec)->ncec_lock); 4081 } 4082 4083 void 4084 ncec_refhold_notr(ncec_t *ncec) 4085 { 4086 mutex_enter(&(ncec)->ncec_lock); 4087 (ncec)->ncec_refcnt++; 4088 ASSERT((ncec)->ncec_refcnt != 0); 4089 mutex_exit(&(ncec)->ncec_lock); 4090 } 4091 4092 static void 4093 ncec_refhold_locked(ncec_t *ncec) 4094 { 4095 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); 4096 (ncec)->ncec_refcnt++; 4097 #ifdef DEBUG 4098 ncec_trace_ref(ncec); 4099 #endif 4100 } 4101 4102 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */ 4103 void 4104 ncec_refrele(ncec_t *ncec) 4105 { 4106 mutex_enter(&(ncec)->ncec_lock); 4107 #ifdef DEBUG 4108 
	ncec_untrace_ref(ncec);
#endif
	ASSERT((ncec)->ncec_refcnt != 0);
	if (--(ncec)->ncec_refcnt == 0) {
		ncec_inactive(ncec);
	} else {
		mutex_exit(&(ncec)->ncec_lock);
	}
}

/* untraced counterpart of ncec_refrele() */
void
ncec_refrele_notr(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
	ASSERT((ncec)->ncec_refcnt != 0);
	if (--(ncec)->ncec_refcnt == 0) {
		ncec_inactive(ncec);
	} else {
		mutex_exit(&(ncec)->ncec_lock);
	}
}

/*
 * Common to IPv4 and IPv6.
 * Cancel any pending ncec timer and start a new one of `ms' milliseconds.
 * Must be called without ncec_lock held.
 */
void
nce_restart_timer(ncec_t *ncec, uint_t ms)
{
	timeout_id_t tid;

	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));

	/* First cancel any running timer */
	mutex_enter(&ncec->ncec_lock);
	tid = ncec->ncec_timeout_id;
	ncec->ncec_timeout_id = 0;
	if (tid != 0) {
		/* drop the lock across untimeout() to avoid deadlock */
		mutex_exit(&ncec->ncec_lock);
		(void) untimeout(tid);
		mutex_enter(&ncec->ncec_lock);
	}

	/* Restart timer */
	nce_start_timer(ncec, ms);
	mutex_exit(&ncec->ncec_lock);
}

static void
nce_start_timer(ncec_t *ncec, uint_t ms)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	/*
	 * Don't start the timer if the ncec has been deleted, or if the timer
	 * is already running
	 */
	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
		/* round a sub-tick interval up to one tick */
		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
	}
}

/*
 * Create (or return the existing) ncec for the IPv4 multicast address *dst
 * on ill, computing the link-layer multicast mapping when the ill is an
 * IRE_IF_RESOLVER.
 */
int
nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
	uchar_t		*hw_addr;
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	in6_addr_t	dst6;
	nce_t		*nce;

	ASSERT(!ill->ill_isv6);

	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
		goto done;
	}
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated, for IRE_IF_NORESOLVER, resolution cookie
		 * in the ill is copied in nce_add_v4().
		 */
		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
			return (ENOMEM);
		}
		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
	} else {
		/*
		 * IRE_IF_NORESOLVER type simply copies the resolution
		 * cookie passed in. So no hw_addr is needed.
		 */
		hw_addr = NULL;
	}
	ASSERT(flags & NCE_F_MCAST);
	ASSERT(flags & NCE_F_NONUD);
	/* nce_state will be computed by nce_add_common() */
	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
	    ND_UNCHANGED, &nce);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	if (err == 0)
		err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_phys_addr_length);
	if (err != 0) {
		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
		return (err);
	}
done:
	if (newnce != NULL)
		*newnce = nce;
	else
		nce_refrele(nce);
	return (0);
}

/*
 * This is used when scanning for "old" (least recently broadcast) NCEs. We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
#define	NCE_RESCHED_LIST_LEN	8

/* Scratch state carried through an nce_walk when gathering reschedule work */
typedef struct {
	ill_t	*ncert_ill;			/* ill being scanned */
	uint_t	ncert_num;			/* entries currently in list */
	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN]; /* oldest candidates */
} nce_resched_t;

/*
 * Pick the longest waiting NCEs for defense.
 *
 * nce_walk_common() callback: accumulate up to NCE_RESCHED_LIST_LEN of the
 * entries whose last defense is oldest.  Once the list is full, each new
 * candidate is "bubbled" through the list, swapping out whichever entry is
 * the most recently defended, so the list always holds the oldest ones seen.
 * Each entry stored in ncert_nces[] is refheld; the loser of each swap is
 * refreled.  Returns 0 so the walk always continues.
 */
/* ARGSUSED */
static int
ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
{
	nce_resched_t	*ncert = arg;
	ncec_t		**ncecs;
	ncec_t		**ncec_max;
	ncec_t		*ncec_temp;
	ncec_t		*ncec = nce->nce_common;

	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
	/*
	 * Only reachable entries that are ready for announcement are eligible.
	 */
	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
		return (0);
	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
		/* Room left: just take a hold and append. */
		ncec_refhold(ncec);
		ncert->ncert_nces[ncert->ncert_num++] = ncec;
	} else {
		ncecs = ncert->ncert_nces;
		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
		ncec_refhold(ncec);
		/*
		 * Swap the candidate into any slot holding a newer
		 * (more recently defended) entry; whatever is left in
		 * `ncec' at the end is the newest, and is released.
		 */
		for (; ncecs < ncec_max; ncecs++) {
			ASSERT(ncec != NULL);
			if ((*ncecs)->ncec_last_time_defended >
			    ncec->ncec_last_time_defended) {
				ncec_temp = *ncecs;
				*ncecs = ncec;
				ncec = ncec_temp;
			}
		}
		ncec_refrele(ncec);
	}
	return (0);
}

/*
 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
 * doesn't happen very often (if at all), and thus it needn't be highly
 * optimized. (Note, though, that it's actually O(N) complexity, because the
 * outer loop is bounded by a constant rather than by the length of the list.)
 */
static void
nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
{
	ncec_t		*ncec;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint_t		i, defend_rate;

	/* Consume the accumulated defend count; we recompute it below. */
	i = ill->ill_defend_count;
	ill->ill_defend_count = 0;
	if (ill->ill_isv6)
		defend_rate = ipst->ips_ndp_defend_rate;
	else
		defend_rate = ipst->ips_arp_defend_rate;
	/* If none could be sitting around, then don't reschedule */
	if (i < defend_rate) {
		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
		return;
	}
	ncert->ncert_ill = ill;
	/*
	 * Mark the oldest candidates NCE_F_DELAYED until we've accounted
	 * for defend_rate of them; ill_defend_count is bumped here, in
	 * anticipation, for each one scheduled.
	 */
	while (ill->ill_defend_count < defend_rate) {
		nce_walk_common(ill, ncec_reschedule, ncert);
		for (i = 0; i < ncert->ncert_num; i++) {

			ncec = ncert->ncert_nces[i];
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_flags |= NCE_F_DELAYED;
			mutex_exit(&ncec->ncec_lock);
			/*
			 * we plan to schedule this ncec, so incr the
			 * defend_count in anticipation.
			 */
			if (++ill->ill_defend_count >= defend_rate)
				break;
		}
		/* A short batch means the walk is exhausted; stop looping. */
		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
			break;
	}
}

/*
 * Check if the current rate-limiting parameters permit the sending
 * of another address defense announcement for both IPv4 and IPv6.
 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
 * determines how many address defense announcements are permitted
 * in any `defend_period' interval.
 */
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
	clock_t		now = ddi_get_lbolt();
	ip_stack_t	*ipst = ill->ill_ipst;
	clock_t		start = ill->ill_defend_start;
	uint32_t	elapsed, defend_period, defend_rate;
	nce_resched_t	ncert;
	boolean_t	ret;
	int		i;

	if (ill->ill_isv6) {
		defend_period = ipst->ips_ndp_defend_period;
		defend_rate = ipst->ips_ndp_defend_rate;
	} else {
		defend_period = ipst->ips_arp_defend_period;
		defend_rate = ipst->ips_arp_defend_rate;
	}
	/* A rate of zero means defense announcements are disabled. */
	if (defend_rate == 0)
		return (B_TRUE);
	bzero(&ncert, sizeof (ncert));
	mutex_enter(&ill->ill_lock);
	if (start > 0) {
		elapsed = now - start;
		if (elapsed > SEC_TO_TICK(defend_period)) {
			/* New period: reset the window start time. */
			ill->ill_defend_start = now;
			/*
			 * nce_ill_reschedule will attempt to
			 * prevent starvation by rescheduling the
			 * oldest entries, which are marked with
			 * the NCE_F_DELAYED flag.
			 */
			nce_ill_reschedule(ill, &ncert);
		}
	} else {
		ill->ill_defend_start = now;
	}
	ASSERT(ill->ill_defend_count <= defend_rate);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_flags & NCE_F_DELAYED) {
		/*
		 * This ncec was rescheduled as one of the really old
		 * entries needing on-going defense. The
		 * ill_defend_count was already incremented in
		 * nce_ill_reschedule. Go ahead and send the announce.
		 */
		ncec->ncec_flags &= ~NCE_F_DELAYED;
		mutex_exit(&ncec->ncec_lock);
		ret = B_FALSE;
		goto done;
	}
	mutex_exit(&ncec->ncec_lock);
	if (ill->ill_defend_count < defend_rate)
		ill->ill_defend_count++;
	if (ill->ill_defend_count == defend_rate) {
		/*
		 * we are no longer allowed to send unbidden defense
		 * messages. Wait for rescheduling.
		 */
		ret = B_TRUE;
	} else {
		ret = B_FALSE;
	}
done:
	mutex_exit(&ill->ill_lock);
	/*
	 * After all the locks have been dropped we can restart nce timer,
	 * and refrele the delayed ncecs
	 */
	for (i = 0; i < ncert.ncert_num; i++) {
		clock_t	xmit_interval;
		ncec_t	*tmp;

		tmp = ncert.ncert_nces[i];
		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
		    B_FALSE);
		nce_restart_timer(tmp, xmit_interval);
		ncec_refrele(tmp);
	}
	return (ret);
}

/*
 * Transmit an unsolicited Neighbor Advertisement for `ncec' to the all-hosts
 * multicast group.  Returns the result of ndp_xmit().
 */
boolean_t
ndp_announce(ncec_t *ncec)
{
	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
	    nce_advert_flags(ncec)));
}

/*
 * Determine the source address (returned in *src, which must be passed in
 * as the unspecified address) and the ill to transmit from for a
 * solicitation/announcement on behalf of `ncec'.  Returns the transmit ill
 * held, or NULL if no usable source could be found (or if the candidate
 * ipif has not yet completed DAD).
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t		*mp;
	in6_addr_t	src6;
	ipaddr_t	src4;
	ill_t		*ill = ncec->ncec_ill;
	ill_t		*src_ill = NULL;
	ipif_t		*ipif = NULL;
	boolean_t	is_myaddr = NCE_MYADDR(ncec);
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src4 = 0;
	src6 = *src;
	if (is_myaddr) {
		/* Defending/probing our own address: use it as the source. */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t  *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses. Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6] below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface. We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/* For IPMP, retry source selection on a cast ill. */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}

/*
 * Update the ncec for `addr' with the given hardware address and flags.
 * When `ipif' is non-NULL only the matching ill is updated; with a NULL
 * ipif (wildcard) all ncec's matching on addr are cleaned up via
 * ncec_walk_common()/nce_update_hw_changed().
 */
void
ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
    uchar_t *hwaddr, int hwaddr_len, int flags)
{
	ill_t	*ill;
	ncec_t	*ncec;
	nce_t	*nce;
	uint16_t new_state;

	ill = (ipif ? ipif->ipif_ill : NULL);
	if (ill != NULL) {
		/*
		 * only one ncec is possible
		 */
		nce = nce_lookup_v4(ill, addr);
		if (nce != NULL) {
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			/* Already-reachable entries keep their state. */
			if (NCE_ISREACHABLE(ncec))
				new_state = ND_UNCHANGED;
			else
				new_state = ND_STALE;
			ncec->ncec_flags = flags;
			nce_update(ncec, new_state, hwaddr);
			mutex_exit(&ncec->ncec_lock);
			nce_refrele(nce);
			return;
		}
	} else {
		/*
		 * ill is wildcard; clean up all ncec's and ire's
		 * that match on addr.
		 */
		nce_hw_map_t hwm;

		hwm.hwm_addr = *addr;
		hwm.hwm_hwlen = hwaddr_len;
		hwm.hwm_hwaddr = hwaddr;
		hwm.hwm_flags = flags;

		ncec_walk_common(ipst->ips_ndp4, NULL,
		    nce_update_hw_changed, &hwm, B_TRUE);
	}
}

/*
 * Common function to add ncec entries.
 * we always add the ncec with ncec_ill == ill, and always create
 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
 * ncec is !reachable.
 *
 * When the caller passes in an nce_state of ND_UNCHANGED,
 * nce_add_common() will determine the state of the created nce based
 * on the ill_net_type and nce_flags used. Otherwise, the nce will
 * be created with state set to the passed in nce_state.
 */
static int
nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
{
	static	ncec_t		nce_nil;
	uchar_t			*template = NULL;
	int			err;
	ncec_t			*ncec;
	ncec_t			**ncep;
	ip_stack_t		*ipst = ill->ill_ipst;
	uint16_t		state;
	boolean_t		fastprobe = B_FALSE;
	struct ndp_g_s		*ndp;
	nce_t			*nce = NULL;
	list_t			graveyard;
	mblk_t			*dlur_mp = NULL;

	if (ill->ill_isv6)
		ndp = ill->ill_ipst->ips_ndp6;
	else
		ndp = ill->ill_ipst->ips_ndp4;

	*retnce = NULL;
	state = 0;

	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));

	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
		ip0dbg(("nce_add_common: no addr\n"));
		return (EINVAL);
	}
	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
		return (EINVAL);
	}

	if (ill->ill_isv6) {
		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
	} else {
		ipaddr_t v4addr;

		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
	}

	/*
	 * The caller has ensured that there is no nce on ill, but there could
	 * still be an nce_common_t for the address, so that we find existing
	 * ncec_t structures first, and atomically add a new nce_t if
	 * one is found. The ndp_g_lock ensures that we don't cross threads
	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
	 * compare for matches across the illgrp because this function is
	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
	 * appropriate.
	 */
	ncec = *ncep;
	for (; ncec != NULL; ncec = ncec->ncec_next) {
		if (ncec->ncec_ill == ill) {
			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
				/*
				 * We should never find *retnce to be
				 * MYADDR, since the caller may then
				 * incorrectly restart a DAD timer that's
				 * already running. However, if we are in
				 * forwarding mode, and the interface is
				 * moving in/out of groups, the data
				 * path ire lookup (e.g., ire_revalidate_nce)
				 * may have determined that some destination
				 * is offlink while the control path is adding
				 * that address as a local address.
				 * Recover from this case by failing the
				 * lookup
				 */
				if (NCE_MYADDR(ncec))
					return (ENXIO);
				*retnce = nce_ill_lookup_then_add(ill, ncec);
				if (*retnce != NULL)
					break;
			}
		}
	}
	if (*retnce != NULL) /* caller must trigger fastpath on nce */
		return (0);

	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
	if (ncec == NULL)
		return (ENOMEM);
	*ncec = nce_nil;
	ncec->ncec_ill = ill;
	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
	ncec->ncec_flags = flags;
	ncec->ncec_ipst = ipst;	/* No netstack_hold */

	if (!ill->ill_isv6) {
		ipaddr_t addr4;

		/*
		 * DAD probe interval and probe count are set based on
		 * fast/slow probe settings. If the underlying link doesn't
		 * have reliably up/down notifications or if we're working
		 * with IPv4 169.254.0.0/16 Link Local Address space, then
		 * don't use the fast timers.  Otherwise, use them.
		 */
		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
			fastprobe = B_TRUE;
		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
		    !IS_IPV4_LL_SPACE(&addr4)) {
			ill_t *hwaddr_ill;

			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
			    hw_addr_len);
			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
				fastprobe = B_TRUE;
		}
		if (fastprobe) {
			ncec->ncec_xmit_interval =
			    ipst->ips_arp_fastprobe_interval;
			ncec->ncec_pcnt =
			    ipst->ips_arp_fastprobe_count;
			ncec->ncec_flags |= NCE_F_FAST;
		} else {
			ncec->ncec_xmit_interval =
			    ipst->ips_arp_probe_interval;
			ncec->ncec_pcnt =
			    ipst->ips_arp_probe_count;
		}
		if (NCE_PUBLISH(ncec)) {
			ncec->ncec_unsolicit_count =
			    ipst->ips_ip_arp_publish_count;
		}
	} else {
		/*
		 * probe interval is constant: ILL_PROBE_INTERVAL
		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
		 */
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		if (NCE_PUBLISH(ncec)) {
			ncec->ncec_unsolicit_count =
			    ipst->ips_ip_ndp_unsolicit_count;
		}
	}
	ncec->ncec_rcnt = ill->ill_xmit_count;
	ncec->ncec_addr = *addr;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_refcnt = 1; /* for ncec getting created */
	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
	ncec->ncec_trace_disable = B_FALSE;

	/*
	 * ncec_lladdr holds link layer address
	 */
	if (hw_addr_len > 0) {
		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
		if (template == NULL) {
			err = ENOMEM;
			goto err_ret;
		}
		ncec->ncec_lladdr = template;
		ncec->ncec_lladdr_length = hw_addr_len;
		bzero(ncec->ncec_lladdr, hw_addr_len);
	}
	if ((flags & NCE_F_BCAST) != 0) {
		state = ND_REACHABLE;
		ASSERT(hw_addr_len > 0);
	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
		state = ND_INITIAL;
	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
		/*
		 * NORESOLVER entries are always created in the REACHABLE
		 * state.
		 */
		state = ND_REACHABLE;
		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
		    ill->ill_mactype != DL_IPV4 &&
		    ill->ill_mactype != DL_6TO4) {
			/*
			 * We create a nce_res_mp with the IP nexthop address
			 * as the destination address if the physical length
			 * is exactly 4 bytes for point-to-multipoint links
			 * that do their own resolution from IP to link-layer
			 * address (e.g. IP over X.25).
			 */
			bcopy((uchar_t *)addr,
			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
		}
		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
		    ill->ill_mactype != DL_IPV6) {
			/*
			 * We create a nce_res_mp with the IP nexthop address
			 * as the destination address if the physical length
			 * is exactly 16 bytes for point-to-multipoint links
			 * that do their own resolution from IP to link-layer
			 * address.
			 */
			bcopy((uchar_t *)addr,
			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
		}
		/*
		 * Since NUD is not part of the base IPv4 protocol definition,
		 * IPv4 neighbor entries on NORESOLVER interfaces will never
		 * age, and are marked NCE_F_NONUD.
		 */
		if (!ill->ill_isv6)
			ncec->ncec_flags |= NCE_F_NONUD;
	} else if (ill->ill_net_type == IRE_LOOPBACK) {
		state = ND_REACHABLE;
	}

	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
		/*
		 * We are adding an ncec with a deterministic hw_addr,
		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
		 *
		 * if we are adding a unicast ncec for the local address
		 * it would be REACHABLE; we would be adding a ND_STALE entry
		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
		 * addresses are added in PROBE to trigger DAD.
		 */
		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
		    ill->ill_net_type == IRE_IF_NORESOLVER)
			state = ND_REACHABLE;
		else if (!NCE_PUBLISH(ncec))
			state = ND_STALE;
		else
			state = ND_PROBE;
		if (hw_addr != NULL)
			nce_set_ll(ncec, hw_addr);
	}
	/* caller overrides internally computed state */
	if (nce_state != ND_UNCHANGED)
		state = nce_state;

	if (state == ND_PROBE)
		ncec->ncec_flags |= NCE_F_UNVERIFIED;

	ncec->ncec_state = state;

	if (state == ND_REACHABLE) {
		ncec->ncec_last = ncec->ncec_init_time =
		    TICK_TO_MSEC(ddi_get_lbolt64());
	} else {
		ncec->ncec_last = 0;
		if (state == ND_INITIAL)
			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
	}
	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
	    offsetof(ncec_cb_t, ncec_cb_node));
	/*
	 * have all the memory allocations out of the way before taking locks
	 * and adding the nce.
	 */
	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL) {
		err = ENOMEM;
		goto err_ret;
	}
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			err = ENOMEM;
			goto err_ret;
		}
	}

	/*
	 * Atomically ensure that the ill is not CONDEMNED, before
	 * adding the NCE.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		err = EINVAL;
		goto err_ret;
	}
	if (!NCE_MYADDR(ncec) &&
	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
		mutex_exit(&ill->ill_lock);
		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
		err = EINVAL;
		goto err_ret;
	}
	/*
	 * Acquire the ncec_lock even before adding the ncec to the list
	 * so that it cannot get deleted after the ncec is added, but
	 * before we add the nce.
	 */
	mutex_enter(&ncec->ncec_lock);
	if ((ncec->ncec_next = *ncep) != NULL)
		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
	*ncep = ncec;
	ncec->ncec_ptpn = ncep;

	/* Bump up the number of ncec's referencing this ill */
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "ncec", (void *), ncec);
	ill->ill_ncec_cnt++;
	/*
	 * Since we hold the ncec_lock at this time, the ncec cannot be
	 * condemned, and we can safely add the nce.
	 */
	list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
	mutex_exit(&ncec->ncec_lock);
	mutex_exit(&ill->ill_lock);
	nce_graveyard_free(&graveyard);

	/* caller must trigger fastpath on *retnce */
	return (0);

err_ret:
	/* Unwind: nothing was linked into global state yet. */
	if (ncec != NULL)
		kmem_cache_free(ncec_cache, ncec);
	if (nce != NULL)
		kmem_cache_free(nce_cache, nce);
	freemsg(dlur_mp);
	if (template != NULL)
		kmem_free(template, ill->ill_phys_addr_length);
	return (err);
}

/*
 * take a ref on the nce
 */
void
nce_refhold(nce_t *nce)
{
	mutex_enter(&nce->nce_lock);
	nce->nce_refcnt++;
	ASSERT((nce)->nce_refcnt != 0);
	mutex_exit(&nce->nce_lock);
}

/*
 * release a ref on the nce; In general, this
 * cannot be called with locks held because nce_inactive
 * may result in nce_inactive which will take the ill_lock,
 * do ipif_ill_refrele_tail etc. Thus the one exception
 * where this can be called with locks held is when the caller
 * is certain that the nce_refcnt is sufficient to prevent
 * the invocation of nce_inactive.
 */
void
nce_refrele(nce_t *nce)
{
	ASSERT((nce)->nce_refcnt != 0);
	mutex_enter(&nce->nce_lock);
	if (--nce->nce_refcnt == 0)
		nce_inactive(nce); /* destroys the mutex */
	else
		mutex_exit(&nce->nce_lock);
}

/*
 * free the nce after all refs have gone away.
 */
static void
nce_inactive(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;

	ASSERT(nce->nce_refcnt == 0);

	/* Drop our hold on the shared ncec and free queued messages. */
	ncec_refrele_notr(nce->nce_common);
	nce->nce_common = NULL;
	freemsg(nce->nce_fp_mp);
	freemsg(nce->nce_dlur_mp);

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt--;
	nce->nce_ill = NULL;
	/*
	 * If the ill's nce/ncec counts have dropped far enough
	 * (ILL_DOWN_OK), check whether we need to restart any operation
	 * that is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&nce->nce_lock);
	kmem_cache_free(nce_cache, nce);
}

/*
 * Add an nce to the ill_nce list.
 *
 * Adding multicast NCEs is subject to a per-ill limit. This function returns
 * NULL if that's the case, and it may reap a number of multicast nces.
 * Callers (and upstack) must be able to cope with NULL returns.
 */
static nce_t *
nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
    list_t *graveyard)
{
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
		/* Enforce the per-ill multicast nce limit. */
		if (nce_too_many_mcast(ill, graveyard)) {
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
		ill->ill_mcast_nces++;
	}

	bzero(nce, sizeof (*nce));
	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
	nce->nce_common = ncec;
	nce->nce_addr = ncec->ncec_addr;
	nce->nce_ill = ill;
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt++;

	nce->nce_refcnt = 1;	/* for the thread */
	ncec->ncec_refcnt++;	/* want ncec_refhold_locked_notr(ncec) */
	nce->nce_dlur_mp = dlur_mp;

	/* add nce to the ill's fastpath list. */
	nce->nce_refcnt++;	/* for the list */
	list_insert_head(&ill->ill_nce, nce);
	return (nce);
}

/*
 * Allocate an nce_t (and its dlur mblk, when needed) for `ncec' and link it
 * onto `ill' via nce_add_impl().  Returns NULL on allocation failure or when
 * the multicast limit is reached.
 */
static nce_t *
nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
{
	nce_t	*nce;
	mblk_t	*dlur_mp = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL)
		return (NULL);
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
	}
	/*
	 * If nce_add_impl() returns NULL due to multicast limiting, caller
	 * will (correctly) assume ENOMEM.
	 */
	return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
}

/*
 * remove the nce from the ill_fastpath list
 */
void
nce_delete(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	mutex_enter(&nce->nce_lock);
	if (nce->nce_is_condemned) {
		/*
		 * some other thread has removed this nce from the ill_nce list
		 */
		mutex_exit(&nce->nce_lock);
		return;
	}
	nce->nce_is_condemned = B_TRUE;
	mutex_exit(&nce->nce_lock);

	/* Update the count of multicast NCEs. */
	if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
		ill->ill_mcast_nces--;

	list_remove(&ill->ill_nce, nce);
	/*
	 * even though we are holding the ill_lock, it is ok to
	 * call nce_refrele here because we know that we should have
	 * at least 2 refs on the nce: one for the thread, and one
	 * for the list. The refrele below will release the one for
	 * the list.
	 */
	nce_refrele(nce);
}

/*
 * Look up the nce for `addr' on ill's fastpath (ill_nce) list; caller must
 * hold the ill_lock.  Returns the nce held, or NULL if not found.
 */
nce_t *
nce_lookup(ill_t *ill, const in6_addr_t *addr)
{
	nce_t *nce = NULL;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
			break;
	}

	/*
	 * if we found the nce on the ill_nce list while holding
	 * the ill_lock, then it cannot be condemned yet.
	 */
	if (nce != NULL) {
		ASSERT(!nce->nce_is_condemned);
		nce_refhold(nce);
	}
	return (nce);
}

/*
 * Walk the ill_nce list on ill. The callback function func() cannot perform
 * any destructive actions.
 */
static void
nce_walk_common(ill_t *ill, pfi_t func, void *arg)
{
	nce_t	*nce = NULL, *nce_next;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		/* Capture the successor first; func() must not delete. */
		nce_next = list_next(&ill->ill_nce, nce);
		if (func(ill, nce, arg) != 0)
			break;
		nce = nce_next;
	}
}

/*
 * Locked wrapper around nce_walk_common().
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}

/*
 * Delete nces from ill's fastpath list.  Published (NCE_PUBLISH) entries are
 * preserved unless `flushall' is set.  Deleted entries are collected on a
 * local list while holding the ill_lock and refreled only after the lock is
 * dropped, since the final refrele may call nce_inactive().
 */
void
nce_flush(ill_t *ill, boolean_t flushall)
{
	nce_t	*nce, *nce_next;
	list_t	dead;

	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		nce_next = list_next(&ill->ill_nce, nce);
		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
			nce = nce_next;
			continue;
		}
		/*
		 * nce_delete requires that the caller should either not
		 * be holding locks, or should hold a ref to ensure that
		 * we wont hit ncec_inactive. So take a ref and clean up
		 * after the list is flushed.
		 */
		nce_refhold(nce);
		nce_delete(nce);
		list_insert_tail(&dead, nce);
		nce = nce_next;
	}
	mutex_exit(&ill->ill_lock);
	while ((nce = list_head(&dead)) != NULL) {
		list_remove(&dead, nce);
		nce_refrele(nce);
	}
	ASSERT(list_is_empty(&dead));
	list_destroy(&dead);
}

/* Return an interval that is anywhere in the [1 ..
 * intv] range */
static clock_t
nce_fuzz_interval(clock_t intv, boolean_t initial_time)
{
	clock_t rnd, frac;

	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
	/* Note that clock_t is signed; must chop off bits */
	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
	if (initial_time) {
		/* First transmission: anywhere in [1 .. intv]. */
		if (intv <= 0)
			intv = 1;
		else
			intv = (rnd % intv) + 1;
	} else {
		/* Compute 'frac' as 20% of the configured interval */
		if ((frac = intv / 5) <= 1)
			frac = 2;
		/* Set intv randomly in the range [intv-frac .. intv+frac] */
		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
			intv = 1;
	}
	return (intv);
}

/*
 * Drain the packets queued on `ncec' (an IPMP meta-interface entry) now that
 * resolution has completed: for each queued packet, find a suitable under-ill
 * (test-address ipif's ill for probes, the ipmp rotor for forwarded traffic),
 * create an under_nce on it and transmit, or drop the packet if no under-ill
 * or under_nce can be obtained.  Finally dispatch completion callbacks.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t *mp;
	uint_t pkt_len;
	iaflags_t ixaflags = IXAF_NO_TRACE;
	nce_t *under_nce;
	ill_t *ill = ncec->ncec_ill;
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t *src_ipif = NULL;
	ip_stack_t *ipst = ill->ill_ipst;
	ill_t *send_ill;
	uint_t nprobes;

	ASSERT(IS_IPMP(ill));

	/* Detach the queued chain and probe count under the ncec_lock. */
	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We identify the former case by looking for the
			 * ncec_nprobes: the first ncec_nprobes packets are
			 * probes;
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec); /* complete callbacks */
}