1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2019, Joyent, Inc. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/stropts.h> 32 #include <sys/strsun.h> 33 #include <sys/sysmacros.h> 34 #include <sys/errno.h> 35 #include <sys/dlpi.h> 36 #include <sys/socket.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/cmn_err.h> 40 #include <sys/debug.h> 41 #include <sys/vtrace.h> 42 #include <sys/kmem.h> 43 #include <sys/zone.h> 44 #include <sys/ethernet.h> 45 #include <sys/sdt.h> 46 #include <sys/mac.h> 47 48 #include <net/if.h> 49 #include <net/if_types.h> 50 #include <net/if_dl.h> 51 #include <net/route.h> 52 #include <netinet/in.h> 53 #include <netinet/ip6.h> 54 #include <netinet/icmp6.h> 55 56 #include <inet/common.h> 57 #include <inet/mi.h> 58 #include <inet/mib2.h> 59 #include <inet/nd.h> 60 #include <inet/ip.h> 61 #include <inet/ip_impl.h> 62 #include <inet/ipclassifier.h> 63 #include <inet/ip_if.h> 64 #include <inet/ip_ire.h> 65 #include <inet/ip_rts.h> 66 #include <inet/ip6.h> 67 #include <inet/ip_ndp.h> 68 #include <inet/sctp_ip.h> 69 #include <inet/ip_arp.h> 70 #include <inet/ip2mac_impl.h> 71 72 #define ANNOUNCE_INTERVAL(isv6) \ 73 (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \ 74 ipst->ips_ip_arp_publish_interval) 75 76 #define DEFENSE_INTERVAL(isv6) \ 77 (isv6 ? ipst->ips_ndp_defend_interval : \ 78 ipst->ips_arp_defend_interval) 79 80 /* Non-tunable probe interval, based on link capabilities */ 81 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) 82 83 /* 84 * The IPv4 Link Local address space is special; we do extra duplicate checking 85 * there, as the entire assignment mechanism rests on random numbers. 86 */ 87 #define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ 88 ((uchar_t *)ptr)[1] == 254) 89 90 /* 91 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed 92 * in to the ncec*add* functions. 93 * 94 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that 95 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means 96 * that we will respond to requests for the protocol address. 97 */ 98 #define NCE_EXTERNAL_FLAGS_MASK \ 99 (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \ 100 NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \ 101 NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC) 102 103 /* 104 * Lock ordering: 105 * 106 * ndp_g_lock -> ill_lock -> ncec_lock 107 * 108 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and 109 * ncec_next. ncec_lock protects the contents of the NCE (particularly 110 * ncec_refcnt). 111 */ 112 113 static void nce_cleanup_list(ncec_t *ncec); 114 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr); 115 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *, 116 ncec_t *); 117 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *); 118 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr, 119 uint16_t ncec_flags, nce_t **newnce); 120 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 121 uint16_t ncec_flags, nce_t **newnce); 122 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation, 123 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender, 124 const in6_addr_t *target, int flag); 125 static void ncec_refhold_locked(ncec_t *); 126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); 127 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); 128 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 129 uint16_t, uint16_t, nce_t **); 130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *); 131 static nce_t *nce_add(ill_t *, ncec_t *, list_t *); 132 static void nce_inactive(nce_t *); 133 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); 134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); 135 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 136 uint16_t, uint16_t, nce_t **); 137 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *, 138 uint16_t, uint16_t, nce_t **); 139 static int nce_add_v6_postprocess(nce_t *); 140 static int nce_add_v4_postprocess(nce_t *); 141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *); 142 static clock_t nce_fuzz_interval(clock_t, boolean_t); 143 static void nce_resolv_ipmp_ok(ncec_t *); 144 static void nce_walk_common(ill_t *, pfi_t, void *); 145 static void nce_start_timer(ncec_t *, uint_t); 146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *); 147 static void nce_fastpath_trigger(nce_t *); 148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *); 149 150 #ifdef DEBUG 151 static void ncec_trace_cleanup(const ncec_t *); 152 #endif 153 154 #define NCE_HASH_PTR_V4(ipst, addr) \ 155 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])) 156 157 #define NCE_HASH_PTR_V6(ipst, addr) \ 158 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ 159 NCE_TABLE_SIZE)])) 160 161 extern kmem_cache_t *ncec_cache; 162 extern kmem_cache_t *nce_cache; 163 164 /* 165 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe 166 * If src_ill is not null, the ncec_addr is bound to src_ill. The 167 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where 168 * the probe is sent on the ncec_ill (in the non-IPMP case) or the 169 * IPMP cast_ill (in the IPMP case). 170 * 171 * Note that the probe interval is based on the src_ill for IPv6, and 172 * the ncec_xmit_interval for IPv4. 173 */ 174 static void 175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) 176 { 177 boolean_t dropped; 178 uint32_t probe_interval; 179 180 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST)); 181 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST)); 182 if (ncec->ncec_ipversion == IPV6_VERSION) { 183 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 184 ncec->ncec_lladdr, ncec->ncec_lladdr_length, 185 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); 186 probe_interval = ILL_PROBE_INTERVAL(src_ill); 187 } else { 188 /* IPv4 DAD delay the initial probe. */ 189 if (send_probe) 190 dropped = arp_probe(ncec); 191 else 192 dropped = B_TRUE; 193 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval, 194 !send_probe); 195 } 196 if (!dropped) { 197 mutex_enter(&ncec->ncec_lock); 198 ncec->ncec_pcnt--; 199 mutex_exit(&ncec->ncec_lock); 200 } 201 nce_restart_timer(ncec, probe_interval); 202 } 203 204 /* 205 * Compute default flags to use for an advertisement of this ncec's address. 206 */ 207 static int 208 nce_advert_flags(const ncec_t *ncec) 209 { 210 int flag = 0; 211 212 if (ncec->ncec_flags & NCE_F_ISROUTER) 213 flag |= NDP_ISROUTER; 214 if (!(ncec->ncec_flags & NCE_F_ANYCAST)) 215 flag |= NDP_ORIDE; 216 217 return (flag); 218 } 219 220 /* 221 * NDP Cache Entry creation routine. 222 * This routine must always be called with ndp6->ndp_g_lock held. 223 */ 224 int 225 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 226 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 227 { 228 int err; 229 nce_t *nce; 230 231 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 232 ASSERT(ill != NULL && ill->ill_isv6); 233 234 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state, 235 &nce); 236 if (err != 0) 237 return (err); 238 ASSERT(newnce != NULL); 239 *newnce = nce; 240 return (err); 241 } 242 243 /* 244 * Post-processing routine to be executed after nce_add_v6(). This function 245 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 246 * and must be called without any locks held. 247 */ 248 int 249 nce_add_v6_postprocess(nce_t *nce) 250 { 251 ncec_t *ncec = nce->nce_common; 252 boolean_t dropped = B_FALSE; 253 uchar_t *hw_addr = ncec->ncec_lladdr; 254 uint_t hw_addr_len = ncec->ncec_lladdr_length; 255 ill_t *ill = ncec->ncec_ill; 256 int err = 0; 257 uint16_t flags = ncec->ncec_flags; 258 ip_stack_t *ipst = ill->ill_ipst; 259 boolean_t trigger_fastpath = B_TRUE; 260 261 /* 262 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 263 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 264 * We call nce_fastpath from nce_update if the link layer address of 265 * the peer changes from nce_update 266 */ 267 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || 268 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER)) 269 trigger_fastpath = B_FALSE; 270 271 if (trigger_fastpath) 272 nce_fastpath_trigger(nce); 273 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 274 ill_t *hwaddr_ill; 275 /* 276 * Unicast entry that needs DAD. 277 */ 278 if (IS_IPMP(ill)) { 279 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 280 hw_addr, hw_addr_len); 281 } else { 282 hwaddr_ill = ill; 283 } 284 nce_dad(ncec, hwaddr_ill, B_TRUE); 285 err = EINPROGRESS; 286 } else if (flags & NCE_F_UNSOL_ADV) { 287 /* 288 * We account for the transmit below by assigning one 289 * less than the ndd variable. Subsequent decrements 290 * are done in nce_timer. 291 */ 292 mutex_enter(&ncec->ncec_lock); 293 ncec->ncec_unsolicit_count = 294 ipst->ips_ip_ndp_unsolicit_count - 1; 295 mutex_exit(&ncec->ncec_lock); 296 dropped = ndp_xmit(ill, 297 ND_NEIGHBOR_ADVERT, 298 hw_addr, 299 hw_addr_len, 300 &ncec->ncec_addr, /* Source and target of the adv */ 301 &ipv6_all_hosts_mcast, /* Destination of the packet */ 302 nce_advert_flags(ncec)); 303 mutex_enter(&ncec->ncec_lock); 304 if (dropped) 305 ncec->ncec_unsolicit_count++; 306 else 307 ncec->ncec_last_time_defended = ddi_get_lbolt(); 308 if (ncec->ncec_unsolicit_count != 0) { 309 nce_start_timer(ncec, 310 ipst->ips_ip_ndp_unsolicit_interval); 311 } 312 mutex_exit(&ncec->ncec_lock); 313 } 314 return (err); 315 } 316 317 /* 318 * Atomically lookup and add (if needed) Neighbor Cache information for 319 * an address. 320 * 321 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 322 * are always added pointing at the ipmp_ill. Thus, when the ill passed 323 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 324 * entries will be created, both pointing at the same ncec_t. The nce_t 325 * entries will have their nce_ill set to the ipmp_ill and the under_ill 326 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 327 * Local addresses are always created on the ill passed to nce_add_v6. 328 */ 329 int 330 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 331 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 332 { 333 int err = 0; 334 ip_stack_t *ipst = ill->ill_ipst; 335 nce_t *nce, *upper_nce = NULL; 336 ill_t *in_ill = ill; 337 boolean_t need_ill_refrele = B_FALSE; 338 339 if (flags & NCE_F_MCAST) { 340 /* 341 * hw_addr will be figured out in nce_set_multicast_v6; 342 * caller has to select the cast_ill 343 */ 344 ASSERT(hw_addr == NULL); 345 ASSERT(!IS_IPMP(ill)); 346 err = nce_set_multicast_v6(ill, addr, flags, newnce); 347 return (err); 348 } 349 ASSERT(ill->ill_isv6); 350 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 351 ill = ipmp_ill_hold_ipmp_ill(ill); 352 if (ill == NULL) 353 return (ENXIO); 354 need_ill_refrele = B_TRUE; 355 } 356 357 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 358 nce = nce_lookup_addr(ill, addr); 359 if (nce == NULL) { 360 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, 361 &nce); 362 } else { 363 err = EEXIST; 364 } 365 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 366 if (err == 0) 367 err = nce_add_v6_postprocess(nce); 368 if (in_ill != ill && nce != NULL) { 369 nce_t *under_nce = NULL; 370 371 /* 372 * in_ill was the under_ill. Try to create the under_nce. 373 * Hold the ill_g_lock to prevent changes to group membership 374 * until we are done. 375 */ 376 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 377 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 378 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 379 ill_t *, ill); 380 rw_exit(&ipst->ips_ill_g_lock); 381 err = ENXIO; 382 nce_refrele(nce); 383 nce = NULL; 384 goto bail; 385 } 386 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 387 if (under_nce == NULL) { 388 rw_exit(&ipst->ips_ill_g_lock); 389 err = EINVAL; 390 nce_refrele(nce); 391 nce = NULL; 392 goto bail; 393 } 394 rw_exit(&ipst->ips_ill_g_lock); 395 upper_nce = nce; 396 nce = under_nce; /* will be returned to caller */ 397 if (NCE_ISREACHABLE(nce->nce_common)) 398 nce_fastpath_trigger(under_nce); 399 } 400 /* nce_refrele is deferred until the lock is dropped */ 401 if (nce != NULL) { 402 if (newnce != NULL) 403 *newnce = nce; 404 else 405 nce_refrele(nce); 406 } 407 bail: 408 if (upper_nce != NULL) 409 nce_refrele(upper_nce); 410 if (need_ill_refrele) 411 ill_refrele(ill); 412 return (err); 413 } 414 415 /* 416 * Remove all the CONDEMNED nces from the appropriate hash table. 417 * We create a private list of NCEs, these may have ires pointing 418 * to them, so the list will be passed through to clean up dependent 419 * ires and only then we can do ncec_refrele() which can make NCE inactive. 420 */ 421 static void 422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) 423 { 424 ncec_t *ncec1; 425 ncec_t **ptpn; 426 427 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 428 ASSERT(ndp->ndp_g_walker == 0); 429 for (; ncec; ncec = ncec1) { 430 ncec1 = ncec->ncec_next; 431 mutex_enter(&ncec->ncec_lock); 432 if (NCE_ISCONDEMNED(ncec)) { 433 ptpn = ncec->ncec_ptpn; 434 ncec1 = ncec->ncec_next; 435 if (ncec1 != NULL) 436 ncec1->ncec_ptpn = ptpn; 437 *ptpn = ncec1; 438 ncec->ncec_ptpn = NULL; 439 ncec->ncec_next = NULL; 440 ncec->ncec_next = *free_nce_list; 441 *free_nce_list = ncec; 442 } 443 mutex_exit(&ncec->ncec_lock); 444 } 445 } 446 447 /* 448 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup() 449 * will return this NCE. Also no new timeouts will 450 * be started (See nce_restart_timer). 451 * 2. Cancel any currently running timeouts. 452 * 3. If there is an ndp walker, return. The walker will do the cleanup. 453 * This ensures that walkers see a consistent list of NCEs while walking. 454 * 4. Otherwise remove the NCE from the list of NCEs 455 */ 456 void 457 ncec_delete(ncec_t *ncec) 458 { 459 ncec_t **ptpn; 460 ncec_t *ncec1; 461 int ipversion = ncec->ncec_ipversion; 462 ndp_g_t *ndp; 463 ip_stack_t *ipst = ncec->ncec_ipst; 464 465 if (ipversion == IPV4_VERSION) 466 ndp = ipst->ips_ndp4; 467 else 468 ndp = ipst->ips_ndp6; 469 470 /* Serialize deletes */ 471 mutex_enter(&ncec->ncec_lock); 472 if (NCE_ISCONDEMNED(ncec)) { 473 /* Some other thread is doing the delete */ 474 mutex_exit(&ncec->ncec_lock); 475 return; 476 } 477 /* 478 * Caller has a refhold. Also 1 ref for being in the list. Thus 479 * refcnt has to be >= 2 480 */ 481 ASSERT(ncec->ncec_refcnt >= 2); 482 ncec->ncec_flags |= NCE_F_CONDEMNED; 483 mutex_exit(&ncec->ncec_lock); 484 485 /* Count how many condemned ires for kmem_cache callback */ 486 atomic_inc_32(&ipst->ips_num_nce_condemned); 487 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 488 489 /* Complete any waiting callbacks */ 490 ncec_cb_dispatch(ncec); 491 492 /* 493 * Cancel any running timer. Timeout can't be restarted 494 * since CONDEMNED is set. Can't hold ncec_lock across untimeout. 495 * Passing invalid timeout id is fine. 496 */ 497 if (ncec->ncec_timeout_id != 0) { 498 (void) untimeout(ncec->ncec_timeout_id); 499 ncec->ncec_timeout_id = 0; 500 } 501 502 mutex_enter(&ndp->ndp_g_lock); 503 if (ncec->ncec_ptpn == NULL) { 504 /* 505 * The last ndp walker has already removed this ncec from 506 * the list after we marked the ncec CONDEMNED and before 507 * we grabbed the global lock. 508 */ 509 mutex_exit(&ndp->ndp_g_lock); 510 return; 511 } 512 if (ndp->ndp_g_walker > 0) { 513 /* 514 * Can't unlink. The walker will clean up 515 */ 516 ndp->ndp_g_walker_cleanup = B_TRUE; 517 mutex_exit(&ndp->ndp_g_lock); 518 return; 519 } 520 521 /* 522 * Now remove the ncec from the list. nce_restart_timer won't restart 523 * the timer since it is marked CONDEMNED. 524 */ 525 ptpn = ncec->ncec_ptpn; 526 ncec1 = ncec->ncec_next; 527 if (ncec1 != NULL) 528 ncec1->ncec_ptpn = ptpn; 529 *ptpn = ncec1; 530 ncec->ncec_ptpn = NULL; 531 ncec->ncec_next = NULL; 532 mutex_exit(&ndp->ndp_g_lock); 533 534 /* Removed from ncec_ptpn/ncec_next list */ 535 ncec_refrele_notr(ncec); 536 } 537 538 void 539 ncec_inactive(ncec_t *ncec) 540 { 541 mblk_t **mpp; 542 ill_t *ill = ncec->ncec_ill; 543 ip_stack_t *ipst = ncec->ncec_ipst; 544 545 ASSERT(ncec->ncec_refcnt == 0); 546 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 547 548 /* Count how many condemned nces for kmem_cache callback */ 549 if (NCE_ISCONDEMNED(ncec)) 550 atomic_add_32(&ipst->ips_num_nce_condemned, -1); 551 552 /* Free all allocated messages */ 553 mpp = &ncec->ncec_qd_mp; 554 while (*mpp != NULL) { 555 mblk_t *mp; 556 557 mp = *mpp; 558 *mpp = mp->b_next; 559 560 inet_freemsg(mp); 561 } 562 /* 563 * must have been cleaned up in ncec_delete 564 */ 565 ASSERT(list_is_empty(&ncec->ncec_cb)); 566 list_destroy(&ncec->ncec_cb); 567 /* 568 * free the ncec_lladdr if one was allocated in nce_add_common() 569 */ 570 if (ncec->ncec_lladdr_length > 0) 571 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length); 572 573 #ifdef DEBUG 574 ncec_trace_cleanup(ncec); 575 #endif 576 577 mutex_enter(&ill->ill_lock); 578 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 579 (char *), "ncec", (void *), ncec); 580 ill->ill_ncec_cnt--; 581 ncec->ncec_ill = NULL; 582 /* 583 * If the number of ncec's associated with this ill have dropped 584 * to zero, check whether we need to restart any operation that 585 * is waiting for this to happen. 586 */ 587 if (ILL_DOWN_OK(ill)) { 588 /* ipif_ill_refrele_tail drops the ill_lock */ 589 ipif_ill_refrele_tail(ill); 590 } else { 591 mutex_exit(&ill->ill_lock); 592 } 593 594 mutex_destroy(&ncec->ncec_lock); 595 kmem_cache_free(ncec_cache, ncec); 596 } 597 598 /* 599 * ncec_walk routine. Delete the ncec if it is associated with the ill 600 * that is going away. Always called as a writer. 601 */ 602 void 603 ncec_delete_per_ill(ncec_t *ncec, void *arg) 604 { 605 if ((ncec != NULL) && ncec->ncec_ill == arg) { 606 ncec_delete(ncec); 607 } 608 } 609 610 /* 611 * Neighbor Cache cleanup logic for a list of ncec_t entries. 612 */ 613 static void 614 nce_cleanup_list(ncec_t *ncec) 615 { 616 ncec_t *ncec_next; 617 618 ASSERT(ncec != NULL); 619 while (ncec != NULL) { 620 ncec_next = ncec->ncec_next; 621 ncec->ncec_next = NULL; 622 623 /* 624 * It is possible for the last ndp walker (this thread) 625 * to come here after ncec_delete has marked the ncec CONDEMNED 626 * and before it has removed the ncec from the fastpath list 627 * or called untimeout. So we need to do it here. It is safe 628 * for both ncec_delete and this thread to do it twice or 629 * even simultaneously since each of the threads has a 630 * reference on the ncec. 631 */ 632 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 633 /* 634 * Cancel any running timer. Timeout can't be restarted 635 * since CONDEMNED is set. The ncec_lock can't be 636 * held across untimeout though passing invalid timeout 637 * id is fine. 638 */ 639 if (ncec->ncec_timeout_id != 0) { 640 (void) untimeout(ncec->ncec_timeout_id); 641 ncec->ncec_timeout_id = 0; 642 } 643 /* Removed from ncec_ptpn/ncec_next list */ 644 ncec_refrele_notr(ncec); 645 ncec = ncec_next; 646 } 647 } 648 649 /* 650 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 651 */ 652 boolean_t 653 nce_restart_dad(ncec_t *ncec) 654 { 655 boolean_t started; 656 ill_t *ill, *hwaddr_ill; 657 658 if (ncec == NULL) 659 return (B_FALSE); 660 ill = ncec->ncec_ill; 661 mutex_enter(&ncec->ncec_lock); 662 if (ncec->ncec_state == ND_PROBE) { 663 mutex_exit(&ncec->ncec_lock); 664 started = B_TRUE; 665 } else if (ncec->ncec_state == ND_REACHABLE) { 666 ASSERT(ncec->ncec_lladdr != NULL); 667 ncec->ncec_state = ND_PROBE; 668 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 669 /* 670 * Slight cheat here: we don't use the initial probe delay 671 * for IPv4 in this obscure case. 672 */ 673 mutex_exit(&ncec->ncec_lock); 674 if (IS_IPMP(ill)) { 675 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 676 ncec->ncec_lladdr, ncec->ncec_lladdr_length); 677 } else { 678 hwaddr_ill = ill; 679 } 680 nce_dad(ncec, hwaddr_ill, B_TRUE); 681 started = B_TRUE; 682 } else { 683 mutex_exit(&ncec->ncec_lock); 684 started = B_FALSE; 685 } 686 return (started); 687 } 688 689 /* 690 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. 691 * If one is found, the refcnt on the ncec will be incremented. 692 */ 693 ncec_t * 694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) 695 { 696 ncec_t *ncec; 697 ip_stack_t *ipst = ill->ill_ipst; 698 699 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 700 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 701 702 /* Get head of v6 hash table */ 703 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 704 ncec = ncec_lookup_illgrp(ill, addr, ncec); 705 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 706 rw_exit(&ipst->ips_ill_g_lock); 707 return (ncec); 708 } 709 /* 710 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. 711 * If one is found, the refcnt on the ncec will be incremented. 712 */ 713 ncec_t * 714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) 715 { 716 ncec_t *ncec = NULL; 717 in6_addr_t addr6; 718 ip_stack_t *ipst = ill->ill_ipst; 719 720 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 721 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 722 723 /* Get head of v4 hash table */ 724 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); 725 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 726 ncec = ncec_lookup_illgrp(ill, &addr6, ncec); 727 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 728 rw_exit(&ipst->ips_ill_g_lock); 729 return (ncec); 730 } 731 732 /* 733 * Cache entry lookup. Try to find an ncec matching the parameters passed. 734 * If an ncec is found, increment the hold count on that ncec. 735 * The caller passes in the start of the appropriate hash table, and must 736 * be holding the appropriate global lock (ndp_g_lock). In addition, since 737 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock 738 * must be held as reader. 739 * 740 * This function always matches across the ipmp group. 741 */ 742 ncec_t * 743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) 744 { 745 ndp_g_t *ndp; 746 ip_stack_t *ipst = ill->ill_ipst; 747 748 if (ill->ill_isv6) 749 ndp = ipst->ips_ndp6; 750 else 751 ndp = ipst->ips_ndp4; 752 753 ASSERT(ill != NULL); 754 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 755 if (IN6_IS_ADDR_UNSPECIFIED(addr)) 756 return (NULL); 757 for (; ncec != NULL; ncec = ncec->ncec_next) { 758 if (ncec->ncec_ill == ill || 759 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { 760 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 761 mutex_enter(&ncec->ncec_lock); 762 if (!NCE_ISCONDEMNED(ncec)) { 763 ncec_refhold_locked(ncec); 764 mutex_exit(&ncec->ncec_lock); 765 break; 766 } 767 mutex_exit(&ncec->ncec_lock); 768 } 769 } 770 } 771 return (ncec); 772 } 773 774 /* 775 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t 776 * entries for ill only, i.e., when ill is part of an ipmp group, 777 * nce_lookup_v4 will never try to match across the group. 778 */ 779 nce_t * 780 nce_lookup_v4(ill_t *ill, const in_addr_t *addr) 781 { 782 nce_t *nce; 783 in6_addr_t addr6; 784 ip_stack_t *ipst = ill->ill_ipst; 785 786 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 787 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 788 nce = nce_lookup_addr(ill, &addr6); 789 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 790 return (nce); 791 } 792 793 /* 794 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t 795 * entries for ill only, i.e., when ill is part of an ipmp group, 796 * nce_lookup_v6 will never try to match across the group. 797 */ 798 nce_t * 799 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6) 800 { 801 nce_t *nce; 802 ip_stack_t *ipst = ill->ill_ipst; 803 804 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 805 nce = nce_lookup_addr(ill, addr6); 806 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 807 return (nce); 808 } 809 810 static nce_t * 811 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr) 812 { 813 nce_t *nce; 814 815 ASSERT(ill != NULL); 816 #ifdef DEBUG 817 if (ill->ill_isv6) 818 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 819 else 820 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 821 #endif 822 mutex_enter(&ill->ill_lock); 823 nce = nce_lookup(ill, addr); 824 mutex_exit(&ill->ill_lock); 825 return (nce); 826 } 827 828 829 /* 830 * Router turned to host. We need to make sure that cached copies of the ncec 831 * are not used for forwarding packets if they were derived from the default 832 * route, and that the default route itself is removed, as required by 833 * section 7.2.5 of RFC 2461. 834 * 835 * Note that the ncec itself probably has valid link-layer information for the 836 * nexthop, so that there is no reason to delete the ncec, as long as the 837 * ISROUTER flag is turned off. 838 */ 839 static void 840 ncec_router_to_host(ncec_t *ncec) 841 { 842 ire_t *ire; 843 ip_stack_t *ipst = ncec->ncec_ipst; 844 845 mutex_enter(&ncec->ncec_lock); 846 ncec->ncec_flags &= ~NCE_F_ISROUTER; 847 mutex_exit(&ncec->ncec_lock); 848 849 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, 850 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL, 851 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL); 852 if (ire != NULL) { 853 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 854 ire_delete(ire); 855 ire_refrele(ire); 856 } 857 } 858 859 /* 860 * Process passed in parameters either from an incoming packet or via 861 * user ioctl. 862 */ 863 void 864 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) 865 { 866 ill_t *ill = ncec->ncec_ill; 867 uint32_t hw_addr_len = ill->ill_phys_addr_length; 868 boolean_t ll_updated = B_FALSE; 869 boolean_t ll_changed; 870 nce_t *nce; 871 872 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 873 /* 874 * No updates of link layer address or the neighbor state is 875 * allowed, when the cache is in NONUD state. This still 876 * allows for responding to reachability solicitation. 877 */ 878 mutex_enter(&ncec->ncec_lock); 879 if (ncec->ncec_state == ND_INCOMPLETE) { 880 if (hw_addr == NULL) { 881 mutex_exit(&ncec->ncec_lock); 882 return; 883 } 884 nce_set_ll(ncec, hw_addr); 885 /* 886 * Update ncec state and send the queued packets 887 * back to ip this time ire will be added. 888 */ 889 if (flag & ND_NA_FLAG_SOLICITED) { 890 nce_update(ncec, ND_REACHABLE, NULL); 891 } else { 892 nce_update(ncec, ND_STALE, NULL); 893 } 894 mutex_exit(&ncec->ncec_lock); 895 nce = nce_fastpath(ncec, B_TRUE, NULL); 896 nce_resolv_ok(ncec); 897 if (nce != NULL) 898 nce_refrele(nce); 899 return; 900 } 901 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len); 902 if (!is_adv) { 903 /* If this is a SOLICITATION request only */ 904 if (ll_changed) 905 nce_update(ncec, ND_STALE, hw_addr); 906 mutex_exit(&ncec->ncec_lock); 907 ncec_cb_dispatch(ncec); 908 return; 909 } 910 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { 911 /* If in any other state than REACHABLE, ignore */ 912 if (ncec->ncec_state == ND_REACHABLE) { 913 nce_update(ncec, ND_STALE, NULL); 914 } 915 mutex_exit(&ncec->ncec_lock); 916 ncec_cb_dispatch(ncec); 917 return; 918 } else { 919 if (ll_changed) { 920 nce_update(ncec, ND_UNCHANGED, hw_addr); 921 ll_updated = B_TRUE; 922 } 923 if (flag & ND_NA_FLAG_SOLICITED) { 924 nce_update(ncec, ND_REACHABLE, NULL); 925 } else { 926 if (ll_updated) { 927 nce_update(ncec, ND_STALE, NULL); 928 } 929 } 930 mutex_exit(&ncec->ncec_lock); 931 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags & 932 NCE_F_ISROUTER)) { 933 ncec_router_to_host(ncec); 934 } else { 935 ncec_cb_dispatch(ncec); 936 } 937 } 938 } 939 940 /* 941 * Pass arg1 to the cbf supplied, along with each ncec in existence. 942 * ncec_walk() places a REFHOLD on the ncec and drops the lock when 943 * walking the hash list. 944 */ 945 void 946 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf, 947 void *arg1, boolean_t trace) 948 { 949 ncec_t *ncec; 950 ncec_t *ncec1; 951 ncec_t **ncep; 952 ncec_t *free_nce_list = NULL; 953 954 mutex_enter(&ndp->ndp_g_lock); 955 /* Prevent ncec_delete from unlink and free of NCE */ 956 ndp->ndp_g_walker++; 957 mutex_exit(&ndp->ndp_g_lock); 958 for (ncep = ndp->nce_hash_tbl; 959 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 960 for (ncec = *ncep; ncec != NULL; ncec = ncec1) { 961 ncec1 = ncec->ncec_next; 962 if (ill == NULL || ncec->ncec_ill == ill) { 963 if (trace) { 964 ncec_refhold(ncec); 965 (*cbf)(ncec, arg1); 966 ncec_refrele(ncec); 967 } else { 968 ncec_refhold_notr(ncec); 969 (*cbf)(ncec, arg1); 970 ncec_refrele_notr(ncec); 971 } 972 } 973 } 974 } 975 mutex_enter(&ndp->ndp_g_lock); 976 ndp->ndp_g_walker--; 977 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { 978 /* Time to delete condemned entries */ 979 for (ncep = ndp->nce_hash_tbl; 980 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 981 ncec = *ncep; 982 if (ncec != NULL) { 983 nce_remove(ndp, ncec, &free_nce_list); 984 } 985 } 986 ndp->ndp_g_walker_cleanup = B_FALSE; 987 } 988 989 mutex_exit(&ndp->ndp_g_lock); 990 991 if (free_nce_list != NULL) { 992 nce_cleanup_list(free_nce_list); 993 } 994 } 995 996 /* 997 * Walk everything. 998 * Note that ill can be NULL hence can't derive the ipst from it. 999 */ 1000 void 1001 ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst) 1002 { 1003 ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE); 1004 ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE); 1005 } 1006 1007 /* 1008 * Cheesy globals (i.e. all netstacks) for both a limit on per-ill multicast 1009 * NCEs, and the number to reclaim if we hit the limit. Used by 1010 * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until 1011 * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this. 1012 */ 1013 1014 /* Maximum number of multicast NCEs on an ill. */ 1015 uint_t ip_max_ill_mcast_nces = 16384; 1016 /* 1017 * Number of NCEs to delete if we hit the maximum above. 0 means *don't* and 1018 * return an error. Non-zero means delete so many, and if the number is >= 1019 * the max above, that means delete them all. 1020 */ 1021 uint_t ip_ill_mcast_reclaim = 256; 1022 1023 /* 1024 * Encapsulate multicast ill capping in a function, for easier DTrace 1025 * detections. Return a list of refheld NCEs to destroy-via-refrele. That 1026 * list can be NULL, but can only be non-NULL if we successfully reclaimed. 1027 * 1028 * NOTE: This function must be called while holding the ill_lock AND 1029 * JUST PRIOR to making the insertion into the ill_nce list. 1030 * 1031 * We can't release the ones we delete ourselves because the ill_lock is held 1032 * by the caller. They are, instead, passed back in a list_t for deletion 1033 * outside of the ill_lock hold. nce_graveyard_free() actually frees them. 1034 * 1035 * While this covers nce_t, ncec_t gets done even further down the road. See 1036 * nce_graveyard_free() for why. 1037 */ 1038 static boolean_t 1039 nce_too_many_mcast(ill_t *ill, list_t *graveyard) 1040 { 1041 uint_t reclaim_count, max_count, reclaimed = 0; 1042 boolean_t too_many; 1043 nce_t *nce, *deadman; 1044 1045 ASSERT(graveyard != NULL); 1046 ASSERT(list_is_empty(graveyard)); 1047 ASSERT(MUTEX_HELD(&ill->ill_lock)); 1048 1049 /* 1050 * NOTE: Some grinning weirdo may have lowered the global max beyond 1051 * what this ill currently has. The behavior in this case will be 1052 * trim-back just by the reclaim amount for any new ones. 1053 */ 1054 max_count = ip_max_ill_mcast_nces; 1055 reclaim_count = min(ip_ill_mcast_reclaim, max_count); 1056 1057 /* All good? */ 1058 if (ill->ill_mcast_nces < max_count) 1059 return (B_FALSE); /* Yes, all good. */ 1060 1061 if (reclaim_count == 0) 1062 return (B_TRUE); /* Don't bother - we're stuck. */ 1063 1064 /* We need to reclaim now. Exploit our held ill_lock. */ 1065 1066 /* 1067 * Start at the tail and work backwards, new nces are head-inserted, 1068 * so we'll be reaping the oldest entries. 1069 */ 1070 nce = list_tail(&ill->ill_nce); 1071 while (reclaimed < reclaim_count) { 1072 /* Skip ahead to a multicast NCE. */ 1073 while (nce != NULL && 1074 (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) { 1075 nce = list_prev(&ill->ill_nce, nce); 1076 } 1077 if (nce == NULL) 1078 break; 1079 1080 /* 1081 * NOTE: For now, we just delete the first one(s) we find. 1082 * This is not optimal, and may require some inspection of nce 1083 * & its ncec to be better. 1084 */ 1085 deadman = nce; 1086 nce = list_prev(&ill->ill_nce, nce); 1087 1088 /* nce_delete() requires caller holds... */ 1089 nce_refhold(deadman); 1090 nce_delete(deadman); /* Bumps down ill_mcast_nces. */ 1091 1092 /* Link the dead ones singly, still refheld... */ 1093 list_insert_tail(graveyard, deadman); 1094 reclaimed++; 1095 } 1096 1097 if (reclaimed != reclaim_count) { 1098 /* We didn't have enough to reach reclaim_count. Why?!? */ 1099 DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill, 1100 uint_t, reclaimed, uint_t, reclaim_count); 1101 1102 /* In case for some REALLY weird reason we found none! */ 1103 too_many = (reclaimed == 0); 1104 } else { 1105 too_many = B_FALSE; 1106 } 1107 1108 return (too_many); 1109 } 1110 1111 static void 1112 ncec_mcast_reap_one(ncec_t *ncec, void *arg) 1113 { 1114 boolean_t reapit; 1115 ill_t *ill = (ill_t *)arg; 1116 1117 /* Obvious no-lock-needed checks... */ 1118 if (ncec == NULL || ncec->ncec_ill != ill || 1119 (ncec->ncec_flags & NCE_F_MCAST) == 0) 1120 return; 1121 1122 mutex_enter(&ncec->ncec_lock); 1123 /* 1124 * It's refheld by the walk infrastructure. It has one reference for 1125 * being in the ndp_g_hash, and if an nce_t exists, that's one more. 1126 * We want ones without an nce_t, so 2 is the magic number. If it's 1127 * LESS than 2, we have much bigger problems anyway. 1128 */ 1129 ASSERT(ncec->ncec_refcnt >= 2); 1130 reapit = (ncec->ncec_refcnt == 2); 1131 mutex_exit(&ncec->ncec_lock); 1132 1133 if (reapit) { 1134 IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted); 1135 ncec_delete(ncec); 1136 } 1137 } 1138 1139 /* 1140 * Attempt to reap stray multicast ncec_t structures left in the wake of 1141 * nce_graveyard_free(). This is a taskq servicing routine, as it's well 1142 * outside any netstack-global locks being held - ndp_g_lock in this case. We 1143 * have a reference hold on the ill, which will prevent any unplumbing races. 1144 */ 1145 static void 1146 ncec_mcast_reap(void *arg) 1147 { 1148 ill_t *ill = (ill_t *)arg; 1149 1150 IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls); 1151 ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst); 1152 mutex_enter(&ill->ill_lock); 1153 ill->ill_mcast_ncec_cleanup = B_FALSE; 1154 /* 1155 * Inline a _notr() version of ill_refrele. See nce_graveyard_free() 1156 * below for why. 1157 */ 1158 ill->ill_refcnt--; 1159 if (ill->ill_refcnt == 0) 1160 ipif_ill_refrele_tail(ill); /* Drops ill_lock. */ 1161 else 1162 mutex_exit(&ill->ill_lock); 1163 } 1164 1165 /* 1166 * Free a list (including handling an empty list or NULL list) of 1167 * reference-held NCEs that were reaped from a nce_too_many_mcast() 1168 * call. Separate because the caller must have dropped ndp_g_lock first. 1169 * 1170 * This also schedules a taskq task to unlink underlying NCECs from the 1171 * ndp_g_hash, which are protected by ndp_g_lock. 1172 */ 1173 static void 1174 nce_graveyard_free(list_t *graveyard) 1175 { 1176 nce_t *deadman, *current; 1177 ill_t *ill; 1178 boolean_t doit; 1179 1180 if (graveyard == NULL) 1181 return; 1182 1183 current = list_head(graveyard); 1184 if (current == NULL) { 1185 list_destroy(graveyard); 1186 return; 1187 } 1188 1189 ill = current->nce_ill; 1190 /* 1191 * Normally one should ill_refhold(ill) here. There's no _notr() 1192 * variant like there is for ire_t, dce_t, or even ncec_t, but this is 1193 * the ONLY case that'll break the mh_trace that IP debugging uses for 1194 * reference counts (i.e. they assume same thread releases as 1195 * holds). Instead, we inline ill_refhold() here. We must do the same 1196 * in the release done by the ncec_mcast_reap() above. 1197 */ 1198 mutex_enter(&ill->ill_lock); 1199 ill->ill_refcnt++; 1200 mutex_exit(&ill->ill_lock); 1201 1202 while (current != NULL) { 1203 ASSERT3P(ill, ==, current->nce_ill); 1204 deadman = current; 1205 current = list_next(graveyard, deadman); 1206 list_remove(graveyard, deadman); 1207 ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=, 1208 0); 1209 nce_refrele(deadman); 1210 } 1211 list_destroy(graveyard); 1212 1213 mutex_enter(&ill->ill_lock); 1214 if (ill->ill_mcast_ncec_cleanup) 1215 doit = B_FALSE; 1216 else { 1217 ill->ill_mcast_ncec_cleanup = B_TRUE; 1218 doit = B_TRUE; 1219 } 1220 mutex_exit(&ill->ill_lock); 1221 if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap, 1222 ill, TQ_NOSLEEP) == (taskqid_t)NULL) { 1223 mutex_enter(&ill->ill_lock); 1224 if (doit) { 1225 IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail); 1226 ill->ill_mcast_ncec_cleanup = B_FALSE; 1227 } 1228 /* There's no _notr() for ill_refrele(), so inline it here. */ 1229 ill->ill_refcnt--; 1230 if (ill->ill_refcnt == 0) 1231 ipif_ill_refrele_tail(ill); /* Drops ill_lock */ 1232 else 1233 mutex_exit(&ill->ill_lock); 1234 } 1235 } 1236 1237 /* 1238 * For each interface an entry is added for the unspecified multicast group. 1239 * Here that mapping is used to form the multicast cache entry for a particular 1240 * multicast destination. 1241 */ 1242 static int 1243 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, 1244 uint16_t flags, nce_t **newnce) 1245 { 1246 uchar_t *hw_addr; 1247 int err = 0; 1248 ip_stack_t *ipst = ill->ill_ipst; 1249 nce_t *nce; 1250 1251 ASSERT(ill != NULL); 1252 ASSERT(ill->ill_isv6); 1253 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); 1254 1255 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1256 nce = nce_lookup_addr(ill, dst); 1257 if (nce != NULL) { 1258 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1259 goto done; 1260 } 1261 if (ill->ill_net_type == IRE_IF_RESOLVER) { 1262 /* 1263 * For IRE_IF_RESOLVER a hardware mapping can be 1264 * generated. 1265 */ 1266 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); 1267 if (hw_addr == NULL) { 1268 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1269 return (ENOMEM); 1270 } 1271 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 1272 } else { 1273 /* No hw_addr is needed for IRE_IF_NORESOLVER. */ 1274 hw_addr = NULL; 1275 } 1276 ASSERT((flags & NCE_F_MCAST) != 0); 1277 ASSERT((flags & NCE_F_NONUD) != 0); 1278 /* nce_state will be computed by nce_add_common() */ 1279 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 1280 ND_UNCHANGED, &nce); 1281 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1282 if (err == 0) 1283 err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM; 1284 if (hw_addr != NULL) 1285 kmem_free(hw_addr, ill->ill_nd_lla_len); 1286 if (err != 0) { 1287 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); 1288 return (err); 1289 } 1290 done: 1291 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); 1292 if (newnce != NULL) 1293 *newnce = nce; 1294 else 1295 nce_refrele(nce); 1296 return (0); 1297 } 1298 1299 /* 1300 * Return the link layer address, and any flags of a ncec. 1301 */ 1302 int 1303 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1304 { 1305 ncec_t *ncec; 1306 in6_addr_t *addr; 1307 sin6_t *sin6; 1308 1309 ASSERT(ill != NULL && ill->ill_isv6); 1310 sin6 = (sin6_t *)&lnr->lnr_addr; 1311 addr = &sin6->sin6_addr; 1312 1313 /* 1314 * NOTE: if the ill is an IPMP interface, then match against the whole 1315 * illgrp. This e.g. allows in.ndpd to retrieve the link layer 1316 * addresses for the data addresses on an IPMP interface even though 1317 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. 1318 */ 1319 ncec = ncec_lookup_illgrp_v6(ill, addr); 1320 if (ncec == NULL) 1321 return (ESRCH); 1322 /* If no link layer address is available yet, return ESRCH */ 1323 if (!NCE_ISREACHABLE(ncec)) { 1324 ncec_refrele(ncec); 1325 return (ESRCH); 1326 } 1327 lnr->lnr_hdw_len = ill->ill_phys_addr_length; 1328 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, 1329 lnr->lnr_hdw_len); 1330 if (ncec->ncec_flags & NCE_F_ISROUTER) 1331 lnr->lnr_flags = NDF_ISROUTER_ON; 1332 if (ncec->ncec_flags & NCE_F_ANYCAST) 1333 lnr->lnr_flags |= NDF_ANYCAST_ON; 1334 if (ncec->ncec_flags & NCE_F_STATIC) 1335 lnr->lnr_flags |= NDF_STATIC; 1336 ncec_refrele(ncec); 1337 return (0); 1338 } 1339 1340 /* 1341 * Finish setting up the Enable/Disable multicast for the driver. 1342 */ 1343 mblk_t * 1344 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1345 uint32_t hw_addr_offset, mblk_t *mp) 1346 { 1347 uchar_t *hw_addr; 1348 ipaddr_t v4group; 1349 uchar_t *addr; 1350 1351 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1352 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1353 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1354 1355 ASSERT(CLASSD(v4group)); 1356 ASSERT(!(ill->ill_isv6)); 1357 1358 addr = (uchar_t *)&v4group; 1359 } else { 1360 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1361 ASSERT(ill->ill_isv6); 1362 1363 addr = (uchar_t *)v6group; 1364 } 1365 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1366 if (hw_addr == NULL) { 1367 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1368 freemsg(mp); 1369 return (NULL); 1370 } 1371 1372 ip_mcast_mapping(ill, addr, hw_addr); 1373 return (mp); 1374 } 1375 1376 void 1377 ip_ndp_resolve(ncec_t *ncec) 1378 { 1379 in_addr_t sender4 = INADDR_ANY; 1380 in6_addr_t sender6 = ipv6_all_zeros; 1381 ill_t *src_ill; 1382 uint32_t ms; 1383 1384 src_ill = nce_resolve_src(ncec, &sender6); 1385 if (src_ill == NULL) { 1386 /* Make sure we try again later */ 1387 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1388 nce_restart_timer(ncec, (clock_t)ms); 1389 return; 1390 } 1391 if (ncec->ncec_ipversion == IPV4_VERSION) 1392 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1393 mutex_enter(&ncec->ncec_lock); 1394 if (ncec->ncec_ipversion == IPV6_VERSION) 1395 ms = ndp_solicit(ncec, sender6, src_ill); 1396 else 1397 ms = arp_request(ncec, sender4, src_ill); 1398 mutex_exit(&ncec->ncec_lock); 1399 if (ms == 0) { 1400 if (ncec->ncec_state != ND_REACHABLE) { 1401 if (ncec->ncec_ipversion == IPV6_VERSION) 1402 ndp_resolv_failed(ncec); 1403 else 1404 arp_resolv_failed(ncec); 1405 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1406 nce_make_unreachable(ncec); 1407 ncec_delete(ncec); 1408 } 1409 } else { 1410 nce_restart_timer(ncec, (clock_t)ms); 1411 } 1412 done: 1413 ill_refrele(src_ill); 1414 } 1415 1416 /* 1417 * Send an IPv6 neighbor solicitation. 1418 * Returns number of milliseconds after which we should either rexmit or abort. 1419 * Return of zero means we should abort. 1420 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt. 1421 * The optional source address is used as a hint to ndp_solicit for 1422 * which source to use in the packet. 1423 * 1424 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending 1425 * the packet. 1426 */ 1427 uint32_t 1428 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill) 1429 { 1430 in6_addr_t dst; 1431 boolean_t dropped = B_FALSE; 1432 1433 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 1434 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 1435 1436 if (ncec->ncec_rcnt == 0) 1437 return (0); 1438 1439 dst = ncec->ncec_addr; 1440 ncec->ncec_rcnt--; 1441 mutex_exit(&ncec->ncec_lock); 1442 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr, 1443 ill->ill_phys_addr_length, &src, &dst, 0); 1444 mutex_enter(&ncec->ncec_lock); 1445 if (dropped) 1446 ncec->ncec_rcnt++; 1447 return (ncec->ncec_ill->ill_reachable_retrans_time); 1448 } 1449 1450 /* 1451 * Attempt to recover an address on an interface that's been marked as a 1452 * duplicate. Because NCEs are destroyed when the interface goes down, there's 1453 * no easy way to just probe the address and have the right thing happen if 1454 * it's no longer in use. Instead, we just bring it up normally and allow the 1455 * regular interface start-up logic to probe for a remaining duplicate and take 1456 * us back down if necessary. 1457 * Neither DHCP nor temporary addresses arrive here; they're excluded by 1458 * ip_ndp_excl. 1459 */ 1460 /* ARGSUSED */ 1461 void 1462 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1463 { 1464 ill_t *ill = rq->q_ptr; 1465 ipif_t *ipif; 1466 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr; 1467 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr; 1468 boolean_t addr_equal; 1469 1470 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1471 /* 1472 * We do not support recovery of proxy ARP'd interfaces, 1473 * because the system lacks a complete proxy ARP mechanism. 1474 */ 1475 if (ill->ill_isv6) { 1476 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 1477 addr6); 1478 } else { 1479 addr_equal = (ipif->ipif_lcl_addr == *addr4); 1480 } 1481 1482 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal) 1483 continue; 1484 1485 /* 1486 * If we have already recovered or if the interface is going 1487 * away, then ignore. 1488 */ 1489 mutex_enter(&ill->ill_lock); 1490 if (!(ipif->ipif_flags & IPIF_DUPLICATE) || 1491 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1492 mutex_exit(&ill->ill_lock); 1493 continue; 1494 } 1495 1496 ipif->ipif_flags &= ~IPIF_DUPLICATE; 1497 ill->ill_ipif_dup_count--; 1498 mutex_exit(&ill->ill_lock); 1499 ipif->ipif_was_dup = B_TRUE; 1500 1501 if (ill->ill_isv6) { 1502 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); 1503 (void) ipif_up_done_v6(ipif); 1504 } else { 1505 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) != 1506 EINPROGRESS); 1507 (void) ipif_up_done(ipif); 1508 } 1509 } 1510 freeb(mp); 1511 } 1512 1513 /* 1514 * Attempt to recover an IPv6 interface that's been shut down as a duplicate. 1515 * As long as someone else holds the address, the interface will stay down. 1516 * When that conflict goes away, the interface is brought back up. This is 1517 * done so that accidental shutdowns of addresses aren't made permanent. Your 1518 * server will recover from a failure. 1519 * 1520 * For DHCP and temporary addresses, recovery is not done in the kernel. 1521 * Instead, it's handled by user space processes (dhcpagent and in.ndpd). 1522 * 1523 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 1524 */ 1525 void 1526 ipif_dup_recovery(void *arg) 1527 { 1528 ipif_t *ipif = arg; 1529 1530 ipif->ipif_recovery_id = 0; 1531 if (!(ipif->ipif_flags & IPIF_DUPLICATE)) 1532 return; 1533 1534 /* 1535 * No lock, because this is just an optimization. 1536 */ 1537 if (ipif->ipif_state_flags & IPIF_CONDEMNED) 1538 return; 1539 1540 /* If the link is down, we'll retry this later */ 1541 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 1542 return; 1543 1544 ipif_do_recovery(ipif); 1545 } 1546 1547 /* 1548 * Perform interface recovery by forcing the duplicate interfaces up and 1549 * allowing the system to determine which ones should stay up. 1550 * 1551 * Called both by recovery timer expiry and link-up notification. 1552 */ 1553 void 1554 ipif_do_recovery(ipif_t *ipif) 1555 { 1556 ill_t *ill = ipif->ipif_ill; 1557 mblk_t *mp; 1558 ip_stack_t *ipst = ill->ill_ipst; 1559 size_t mp_size; 1560 1561 if (ipif->ipif_isv6) 1562 mp_size = sizeof (ipif->ipif_v6lcl_addr); 1563 else 1564 mp_size = sizeof (ipif->ipif_lcl_addr); 1565 mp = allocb(mp_size, BPRI_MED); 1566 if (mp == NULL) { 1567 mutex_enter(&ill->ill_lock); 1568 if (ipst->ips_ip_dup_recovery > 0 && 1569 ipif->ipif_recovery_id == 0 && 1570 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1571 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1572 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1573 } 1574 mutex_exit(&ill->ill_lock); 1575 } else { 1576 /* 1577 * A recovery timer may still be running if we got here from 1578 * ill_restart_dad(); cancel that timer. 1579 */ 1580 if (ipif->ipif_recovery_id != 0) 1581 (void) untimeout(ipif->ipif_recovery_id); 1582 ipif->ipif_recovery_id = 0; 1583 1584 if (ipif->ipif_isv6) { 1585 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, 1586 sizeof (ipif->ipif_v6lcl_addr)); 1587 } else { 1588 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr, 1589 sizeof (ipif->ipif_lcl_addr)); 1590 } 1591 ill_refhold(ill); 1592 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP, 1593 B_FALSE); 1594 } 1595 } 1596 1597 /* 1598 * Find the MAC and IP addresses in an NA/NS message. 1599 */ 1600 static void 1601 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill, 1602 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp) 1603 { 1604 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1605 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 1606 uchar_t *addr; 1607 int alen; 1608 1609 /* icmp_inbound_v6 ensures this */ 1610 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1611 1612 addr = ira->ira_l2src; 1613 alen = ill->ill_phys_addr_length; 1614 if (alen > 0) { 1615 *haddr = addr; 1616 *haddrlenp = alen; 1617 } else { 1618 *haddr = NULL; 1619 *haddrlenp = 0; 1620 } 1621 1622 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ 1623 *targp = ns->nd_ns_target; 1624 } 1625 1626 /* 1627 * This is for exclusive changes due to NDP duplicate address detection 1628 * failure. 1629 */ 1630 /* ARGSUSED */ 1631 static void 1632 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1633 { 1634 ill_t *ill = rq->q_ptr; 1635 ipif_t *ipif; 1636 uchar_t *haddr; 1637 uint_t haddrlen; 1638 ip_stack_t *ipst = ill->ill_ipst; 1639 in6_addr_t targ; 1640 ip_recv_attr_t iras; 1641 mblk_t *attrmp; 1642 1643 attrmp = mp; 1644 mp = mp->b_cont; 1645 attrmp->b_cont = NULL; 1646 if (!ip_recv_attr_from_mblk(attrmp, &iras)) { 1647 /* The ill or ip_stack_t disappeared on us */ 1648 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1649 ip_drop_input("ip_recv_attr_from_mblk", mp, ill); 1650 freemsg(mp); 1651 ira_cleanup(&iras, B_TRUE); 1652 return; 1653 } 1654 1655 ASSERT(ill == iras.ira_rill); 1656 1657 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); 1658 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { 1659 /* 1660 * Ignore conflicts generated by misbehaving switches that 1661 * just reflect our own messages back to us. For IPMP, we may 1662 * see reflections across any ill in the illgrp. 1663 * 1664 * RFC2462 and revisions tried to detect both the case 1665 * when a statically configured IPv6 address is a duplicate, 1666 * and the case when the L2 address itself is a duplicate. The 1667 * later is important because, with stateles address autoconf, 1668 * if the L2 address is a duplicate, the resulting IPv6 1669 * address(es) would also be duplicates. We rely on DAD of the 1670 * IPv6 address itself to detect the latter case. 1671 */ 1672 /* For an under ill_grp can change under lock */ 1673 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1674 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || 1675 IS_UNDER_IPMP(ill) && 1676 ipmp_illgrp_find_ill(ill->ill_grp, haddr, 1677 haddrlen) != NULL) { 1678 rw_exit(&ipst->ips_ill_g_lock); 1679 goto ignore_conflict; 1680 } 1681 rw_exit(&ipst->ips_ill_g_lock); 1682 } 1683 1684 /* 1685 * Look up the appropriate ipif. 1686 */ 1687 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); 1688 if (ipif == NULL) 1689 goto ignore_conflict; 1690 1691 /* Reload the ill to match the ipif */ 1692 ill = ipif->ipif_ill; 1693 1694 /* If it's already duplicate or ineligible, then don't do anything. */ 1695 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { 1696 ipif_refrele(ipif); 1697 goto ignore_conflict; 1698 } 1699 1700 /* 1701 * If this is a failure during duplicate recovery, then don't 1702 * complain. It may take a long time to recover. 1703 */ 1704 if (!ipif->ipif_was_dup) { 1705 char ibuf[LIFNAMSIZ]; 1706 char hbuf[MAC_STR_LEN]; 1707 char sbuf[INET6_ADDRSTRLEN]; 1708 1709 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 1710 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" 1711 " disabled", ibuf, 1712 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), 1713 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); 1714 } 1715 mutex_enter(&ill->ill_lock); 1716 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1717 ipif->ipif_flags |= IPIF_DUPLICATE; 1718 ill->ill_ipif_dup_count++; 1719 mutex_exit(&ill->ill_lock); 1720 (void) ipif_down(ipif, NULL, NULL); 1721 (void) ipif_down_tail(ipif); 1722 mutex_enter(&ill->ill_lock); 1723 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1724 ill->ill_net_type == IRE_IF_RESOLVER && 1725 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 1726 ipst->ips_ip_dup_recovery > 0) { 1727 ASSERT(ipif->ipif_recovery_id == 0); 1728 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1729 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1730 } 1731 mutex_exit(&ill->ill_lock); 1732 ipif_refrele(ipif); 1733 1734 ignore_conflict: 1735 freemsg(mp); 1736 ira_cleanup(&iras, B_TRUE); 1737 } 1738 1739 /* 1740 * Handle failure by tearing down the ipifs with the specified address. Note 1741 * that tearing down the ipif also means deleting the ncec through ipif_down, so 1742 * it's not possible to do recovery by just restarting the ncec timer. Instead, 1743 * we start a timer on the ipif. 1744 * Caller has to free mp; 1745 */ 1746 static void 1747 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) 1748 { 1749 const uchar_t *haddr; 1750 ill_t *ill = ira->ira_rill; 1751 1752 /* 1753 * Ignore conflicts generated by misbehaving switches that just 1754 * reflect our own messages back to us. 1755 */ 1756 1757 /* icmp_inbound_v6 ensures this */ 1758 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1759 haddr = ira->ira_l2src; 1760 if (haddr != NULL && 1761 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { 1762 return; 1763 } 1764 1765 if ((mp = copymsg(mp)) != NULL) { 1766 mblk_t *attrmp; 1767 1768 attrmp = ip_recv_attr_to_mblk(ira); 1769 if (attrmp == NULL) { 1770 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1771 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1772 freemsg(mp); 1773 } else { 1774 ASSERT(attrmp->b_cont == NULL); 1775 attrmp->b_cont = mp; 1776 mp = attrmp; 1777 ill_refhold(ill); 1778 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, 1779 B_FALSE); 1780 } 1781 } 1782 } 1783 1784 /* 1785 * Handle a discovered conflict: some other system is advertising that it owns 1786 * one of our IP addresses. We need to defend ourselves, or just shut down the 1787 * interface. 1788 * 1789 * Handles both IPv4 and IPv6 1790 */ 1791 boolean_t 1792 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) 1793 { 1794 ipif_t *ipif; 1795 clock_t now; 1796 uint_t maxdefense; 1797 uint_t defs; 1798 ill_t *ill = ira->ira_ill; 1799 ip_stack_t *ipst = ill->ill_ipst; 1800 uint32_t elapsed; 1801 boolean_t isv6 = ill->ill_isv6; 1802 ipaddr_t ncec_addr; 1803 1804 if (isv6) { 1805 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, 1806 ipst); 1807 } else { 1808 if (arp_no_defense) { 1809 /* 1810 * Yes, there is a conflict, but no, we do not 1811 * defend ourself. 1812 */ 1813 return (B_TRUE); 1814 } 1815 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 1816 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, 1817 ipst); 1818 } 1819 if (ipif == NULL) 1820 return (B_FALSE); 1821 1822 /* 1823 * First, figure out if this address is disposable. 1824 */ 1825 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) 1826 maxdefense = ipst->ips_ip_max_temp_defend; 1827 else 1828 maxdefense = ipst->ips_ip_max_defend; 1829 1830 /* 1831 * Now figure out how many times we've defended ourselves. Ignore 1832 * defenses that happened long in the past. 1833 */ 1834 now = ddi_get_lbolt(); 1835 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; 1836 mutex_enter(&ncec->ncec_lock); 1837 if ((defs = ncec->ncec_defense_count) > 0 && 1838 elapsed > ipst->ips_ip_defend_interval) { 1839 /* 1840 * ip_defend_interval has elapsed. 1841 * reset the defense count. 1842 */ 1843 ncec->ncec_defense_count = defs = 0; 1844 } 1845 ncec->ncec_defense_count++; 1846 ncec->ncec_last_time_defended = now; 1847 mutex_exit(&ncec->ncec_lock); 1848 ipif_refrele(ipif); 1849 1850 /* 1851 * If we've defended ourselves too many times already, then give up and 1852 * tear down the interface(s) using this address. 1853 * Otherwise, caller has to defend by sending out an announce. 1854 */ 1855 if (defs >= maxdefense) { 1856 if (isv6) 1857 ndp_failure(mp, ira); 1858 else 1859 arp_failure(mp, ira); 1860 } else { 1861 return (B_TRUE); /* caller must defend this address */ 1862 } 1863 return (B_FALSE); 1864 } 1865 1866 /* 1867 * Handle reception of Neighbor Solicitation messages. 1868 */ 1869 static void 1870 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) 1871 { 1872 ill_t *ill = ira->ira_ill, *under_ill; 1873 nd_neighbor_solicit_t *ns; 1874 uint32_t hlen = ill->ill_phys_addr_length; 1875 uchar_t *haddr = NULL; 1876 icmp6_t *icmp_nd; 1877 ip6_t *ip6h; 1878 ncec_t *our_ncec = NULL; 1879 in6_addr_t target; 1880 in6_addr_t src; 1881 int len; 1882 int flag = 0; 1883 nd_opt_hdr_t *opt = NULL; 1884 boolean_t bad_solicit = B_FALSE; 1885 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1886 boolean_t need_ill_refrele = B_FALSE; 1887 1888 ip6h = (ip6_t *)mp->b_rptr; 1889 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1890 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1891 src = ip6h->ip6_src; 1892 ns = (nd_neighbor_solicit_t *)icmp_nd; 1893 target = ns->nd_ns_target; 1894 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 1895 IN6_IS_ADDR_LOOPBACK(&target)) { 1896 if (ip_debug > 2) { 1897 /* ip1dbg */ 1898 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 1899 AF_INET6, &target); 1900 } 1901 bad_solicit = B_TRUE; 1902 goto done; 1903 } 1904 if (len > sizeof (nd_neighbor_solicit_t)) { 1905 /* Options present */ 1906 opt = (nd_opt_hdr_t *)&ns[1]; 1907 len -= sizeof (nd_neighbor_solicit_t); 1908 if (!ndp_verify_optlen(opt, len)) { 1909 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1910 bad_solicit = B_TRUE; 1911 goto done; 1912 } 1913 } 1914 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1915 /* Check to see if this is a valid DAD solicitation */ 1916 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1917 if (ip_debug > 2) { 1918 /* ip1dbg */ 1919 pr_addr_dbg("ndp_input_solicit: IPv6 " 1920 "Destination is not solicited node " 1921 "multicast %s\n", AF_INET6, 1922 &ip6h->ip6_dst); 1923 } 1924 bad_solicit = B_TRUE; 1925 goto done; 1926 } 1927 } 1928 1929 /* 1930 * NOTE: with IPMP, it's possible the nominated multicast ill (which 1931 * received this packet if it's multicast) is not the ill tied to 1932 * e.g. the IPMP ill's data link-local. So we match across the illgrp 1933 * to ensure we find the associated NCE. 1934 */ 1935 our_ncec = ncec_lookup_illgrp_v6(ill, &target); 1936 /* 1937 * If this is a valid Solicitation for an address we are publishing, 1938 * then a PUBLISH entry should exist in the cache 1939 */ 1940 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { 1941 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1942 "ifname=%s ", ill->ill_name)); 1943 if (ip_debug > 2) { 1944 /* ip1dbg */ 1945 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1946 } 1947 if (our_ncec == NULL) 1948 bad_solicit = B_TRUE; 1949 goto done; 1950 } 1951 1952 /* At this point we should have a verified NS per spec */ 1953 if (opt != NULL) { 1954 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1955 if (opt != NULL) { 1956 haddr = (uchar_t *)&opt[1]; 1957 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1958 hlen == 0) { 1959 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1960 bad_solicit = B_TRUE; 1961 goto done; 1962 } 1963 } 1964 } 1965 1966 /* If sending directly to peer, set the unicast flag */ 1967 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1968 flag |= NDP_UNICAST; 1969 1970 /* 1971 * Create/update the entry for the soliciting node on the ipmp_ill. 1972 * or respond to outstanding queries, don't if 1973 * the source is unspecified address. 1974 */ 1975 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1976 int err; 1977 nce_t *nnce; 1978 1979 ASSERT(ill->ill_isv6); 1980 /* 1981 * Regular solicitations *must* include the Source Link-Layer 1982 * Address option. Ignore messages that do not. 1983 */ 1984 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1985 ip1dbg(("ndp_input_solicit: source link-layer address " 1986 "option missing with a specified source.\n")); 1987 bad_solicit = B_TRUE; 1988 goto done; 1989 } 1990 1991 /* 1992 * This is a regular solicitation. If we're still in the 1993 * process of verifying the address, then don't respond at all 1994 * and don't keep track of the sender. 1995 */ 1996 if (our_ncec->ncec_state == ND_PROBE) 1997 goto done; 1998 1999 /* 2000 * If the solicitation doesn't have sender hardware address 2001 * (legal for unicast solicitation), then process without 2002 * installing the return NCE. Either we already know it, or 2003 * we'll be forced to look it up when (and if) we reply to the 2004 * packet. 2005 */ 2006 if (haddr == NULL) 2007 goto no_source; 2008 2009 under_ill = ill; 2010 if (IS_UNDER_IPMP(under_ill)) { 2011 ill = ipmp_ill_hold_ipmp_ill(under_ill); 2012 if (ill == NULL) 2013 ill = under_ill; 2014 else 2015 need_ill_refrele = B_TRUE; 2016 } 2017 err = nce_lookup_then_add_v6(ill, 2018 haddr, hlen, 2019 &src, /* Soliciting nodes address */ 2020 0, 2021 ND_STALE, 2022 &nnce); 2023 2024 if (need_ill_refrele) { 2025 ill_refrele(ill); 2026 ill = under_ill; 2027 need_ill_refrele = B_FALSE; 2028 } 2029 switch (err) { 2030 case 0: 2031 /* done with this entry */ 2032 nce_refrele(nnce); 2033 break; 2034 case EEXIST: 2035 /* 2036 * B_FALSE indicates this is not an an advertisement. 2037 */ 2038 nce_process(nnce->nce_common, haddr, 0, B_FALSE); 2039 nce_refrele(nnce); 2040 break; 2041 default: 2042 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 2043 err)); 2044 goto done; 2045 } 2046 no_source: 2047 flag |= NDP_SOLICITED; 2048 } else { 2049 /* 2050 * No source link layer address option should be present in a 2051 * valid DAD request. 2052 */ 2053 if (haddr != NULL) { 2054 ip1dbg(("ndp_input_solicit: source link-layer address " 2055 "option present with an unspecified source.\n")); 2056 bad_solicit = B_TRUE; 2057 goto done; 2058 } 2059 if (our_ncec->ncec_state == ND_PROBE) { 2060 /* 2061 * Internally looped-back probes will have 2062 * IRAF_L2SRC_LOOPBACK set so we can ignore our own 2063 * transmissions. 2064 */ 2065 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { 2066 /* 2067 * If someone else is probing our address, then 2068 * we've crossed wires. Declare failure. 2069 */ 2070 ndp_failure(mp, ira); 2071 } 2072 goto done; 2073 } 2074 /* 2075 * This is a DAD probe. Multicast the advertisement to the 2076 * all-nodes address. 2077 */ 2078 src = ipv6_all_hosts_mcast; 2079 } 2080 flag |= nce_advert_flags(our_ncec); 2081 (void) ndp_xmit(ill, 2082 ND_NEIGHBOR_ADVERT, 2083 our_ncec->ncec_lladdr, 2084 our_ncec->ncec_lladdr_length, 2085 &target, /* Source and target of the advertisement pkt */ 2086 &src, /* IP Destination (source of original pkt) */ 2087 flag); 2088 done: 2089 if (bad_solicit) 2090 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); 2091 if (our_ncec != NULL) 2092 ncec_refrele(our_ncec); 2093 } 2094 2095 /* 2096 * Handle reception of Neighbor Solicitation messages 2097 */ 2098 void 2099 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) 2100 { 2101 ill_t *ill = ira->ira_ill; 2102 nd_neighbor_advert_t *na; 2103 uint32_t hlen = ill->ill_phys_addr_length; 2104 uchar_t *haddr = NULL; 2105 icmp6_t *icmp_nd; 2106 ip6_t *ip6h; 2107 ncec_t *dst_ncec = NULL; 2108 in6_addr_t target; 2109 nd_opt_hdr_t *opt = NULL; 2110 int len; 2111 ip_stack_t *ipst = ill->ill_ipst; 2112 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2113 2114 ip6h = (ip6_t *)mp->b_rptr; 2115 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2116 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2117 na = (nd_neighbor_advert_t *)icmp_nd; 2118 2119 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && 2120 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { 2121 ip1dbg(("ndp_input_advert: Target is multicast but the " 2122 "solicited flag is not zero\n")); 2123 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 2124 return; 2125 } 2126 target = na->nd_na_target; 2127 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 2128 IN6_IS_ADDR_LOOPBACK(&target)) { 2129 if (ip_debug > 2) { 2130 /* ip1dbg */ 2131 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 2132 AF_INET6, &target); 2133 } 2134 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 2135 return; 2136 } 2137 if (len > sizeof (nd_neighbor_advert_t)) { 2138 opt = (nd_opt_hdr_t *)&na[1]; 2139 if (!ndp_verify_optlen(opt, 2140 len - sizeof (nd_neighbor_advert_t))) { 2141 ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); 2142 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 2143 return; 2144 } 2145 /* At this point we have a verified NA per spec */ 2146 len -= sizeof (nd_neighbor_advert_t); 2147 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); 2148 if (opt != NULL) { 2149 haddr = (uchar_t *)&opt[1]; 2150 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 2151 hlen == 0) { 2152 ip1dbg(("ndp_input_advert: bad SLLA\n")); 2153 BUMP_MIB(mib, 2154 ipv6IfIcmpInBadNeighborAdvertisements); 2155 return; 2156 } 2157 } 2158 } 2159 2160 /* 2161 * NOTE: we match across the illgrp since we need to do DAD for all of 2162 * our local addresses, and those are spread across all the active 2163 * ills in the group. 2164 */ 2165 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) 2166 return; 2167 2168 if (NCE_PUBLISH(dst_ncec)) { 2169 /* 2170 * Someone just advertised an addresses that we publish. First, 2171 * check it it was us -- if so, we can safely ignore it. 2172 * We don't get the haddr from the ira_l2src because, in the 2173 * case that the packet originated from us, on an IPMP group, 2174 * the ira_l2src may would be the link-layer address of the 2175 * cast_ill used to send the packet, which may not be the same 2176 * as the dst_ncec->ncec_lladdr of the address. 2177 */ 2178 if (haddr != NULL) { 2179 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) 2180 goto out; 2181 2182 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) 2183 goto out; /* from us -- no conflict */ 2184 2185 /* 2186 * If we're in an IPMP group, check if this is an echo 2187 * from another ill in the group. Use the double- 2188 * checked locking pattern to avoid grabbing 2189 * ill_g_lock in the non-IPMP case. 2190 */ 2191 if (IS_UNDER_IPMP(ill)) { 2192 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 2193 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( 2194 ill->ill_grp, haddr, hlen) != NULL) { 2195 rw_exit(&ipst->ips_ill_g_lock); 2196 goto out; 2197 } 2198 rw_exit(&ipst->ips_ill_g_lock); 2199 } 2200 } 2201 2202 /* 2203 * This appears to be a real conflict. If we're trying to 2204 * configure this NCE (ND_PROBE), then shut it down. 2205 * Otherwise, handle the discovered conflict. 2206 */ 2207 if (dst_ncec->ncec_state == ND_PROBE) { 2208 ndp_failure(mp, ira); 2209 } else { 2210 if (ip_nce_conflict(mp, ira, dst_ncec)) { 2211 char hbuf[MAC_STR_LEN]; 2212 char sbuf[INET6_ADDRSTRLEN]; 2213 2214 cmn_err(CE_WARN, 2215 "node '%s' is using %s on %s", 2216 inet_ntop(AF_INET6, &target, sbuf, 2217 sizeof (sbuf)), 2218 haddr == NULL ? "<none>" : 2219 mac_colon_addr(haddr, hlen, hbuf, 2220 sizeof (hbuf)), ill->ill_name); 2221 /* 2222 * RFC 4862, Section 5.4.4 does not mandate 2223 * any specific behavior when an NA matches 2224 * a non-tentative address assigned to the 2225 * receiver. We make the choice of defending 2226 * our address, based on the assumption that 2227 * the sender has not detected the Duplicate. 2228 * 2229 * ncec_last_time_defended has been adjusted 2230 * in ip_nce_conflict() 2231 */ 2232 (void) ndp_announce(dst_ncec); 2233 } 2234 } 2235 } else { 2236 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) 2237 dst_ncec->ncec_flags |= NCE_F_ISROUTER; 2238 2239 /* B_TRUE indicates this an advertisement */ 2240 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); 2241 } 2242 out: 2243 ncec_refrele(dst_ncec); 2244 } 2245 2246 /* 2247 * Process NDP neighbor solicitation/advertisement messages. 2248 * The checksum has already checked o.k before reaching here. 2249 * Information about the datalink header is contained in ira_l2src, but 2250 * that should be ignored for loopback packets. 2251 */ 2252 void 2253 ndp_input(mblk_t *mp, ip_recv_attr_t *ira) 2254 { 2255 ill_t *ill = ira->ira_rill; 2256 icmp6_t *icmp_nd; 2257 ip6_t *ip6h; 2258 int len; 2259 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2260 ill_t *orig_ill = NULL; 2261 2262 /* 2263 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill 2264 * and make it be the IPMP upper so avoid being confused by a packet 2265 * addressed to a unicast address on a different ill. 2266 */ 2267 if (IS_UNDER_IPMP(ill)) { 2268 orig_ill = ill; 2269 ill = ipmp_ill_hold_ipmp_ill(orig_ill); 2270 if (ill == NULL) { 2271 ill = orig_ill; 2272 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2273 ip_drop_input("ipIfStatsInDiscards - IPMP ill", 2274 mp, ill); 2275 freemsg(mp); 2276 return; 2277 } 2278 ASSERT(ill != orig_ill); 2279 orig_ill = ira->ira_ill; 2280 ira->ira_ill = ill; 2281 mib = ill->ill_icmp6_mib; 2282 } 2283 if (!pullupmsg(mp, -1)) { 2284 ip1dbg(("ndp_input: pullupmsg failed\n")); 2285 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2286 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill); 2287 goto done; 2288 } 2289 ip6h = (ip6_t *)mp->b_rptr; 2290 if (ip6h->ip6_hops != IPV6_MAX_HOPS) { 2291 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); 2292 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill); 2293 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); 2294 goto done; 2295 } 2296 /* 2297 * NDP does not accept any extension headers between the 2298 * IP header and the ICMP header since e.g. a routing 2299 * header could be dangerous. 2300 * This assumes that any AH or ESP headers are removed 2301 * by ip prior to passing the packet to ndp_input. 2302 */ 2303 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { 2304 ip1dbg(("ndp_input: Wrong next header 0x%x\n", 2305 ip6h->ip6_nxt)); 2306 ip_drop_input("Wrong next header", mp, ill); 2307 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2308 goto done; 2309 } 2310 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2311 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT || 2312 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); 2313 if (icmp_nd->icmp6_code != 0) { 2314 ip1dbg(("ndp_input: icmp6 code != 0 \n")); 2315 ip_drop_input("code non-zero", mp, ill); 2316 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2317 goto done; 2318 } 2319 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2320 /* 2321 * Make sure packet length is large enough for either 2322 * a NS or a NA icmp packet. 2323 */ 2324 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { 2325 ip1dbg(("ndp_input: packet too short\n")); 2326 ip_drop_input("packet too short", mp, ill); 2327 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2328 goto done; 2329 } 2330 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { 2331 ndp_input_solicit(mp, ira); 2332 } else { 2333 ndp_input_advert(mp, ira); 2334 } 2335 done: 2336 freemsg(mp); 2337 if (orig_ill != NULL) { 2338 ill_refrele(ill); 2339 ira->ira_ill = orig_ill; 2340 } 2341 } 2342 2343 /* 2344 * ndp_xmit is called to form and transmit a ND solicitation or 2345 * advertisement ICMP packet. 2346 * 2347 * If the source address is unspecified and this isn't a probe (used for 2348 * duplicate address detection), an appropriate source address and link layer 2349 * address will be chosen here. The link layer address option is included if 2350 * the source is specified (i.e., all non-probe packets), and omitted (per the 2351 * specification) otherwise. 2352 * 2353 * It returns B_FALSE only if it does a successful put() to the 2354 * corresponding ill's ill_wq otherwise returns B_TRUE. 2355 */ 2356 static boolean_t 2357 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, 2358 const in6_addr_t *sender, const in6_addr_t *target, int flag) 2359 { 2360 uint32_t len; 2361 icmp6_t *icmp6; 2362 mblk_t *mp; 2363 ip6_t *ip6h; 2364 nd_opt_hdr_t *opt; 2365 uint_t plen; 2366 zoneid_t zoneid = GLOBAL_ZONEID; 2367 ill_t *hwaddr_ill = ill; 2368 ip_xmit_attr_t ixas; 2369 ip_stack_t *ipst = ill->ill_ipst; 2370 boolean_t need_refrele = B_FALSE; 2371 boolean_t probe = B_FALSE; 2372 2373 if (IS_UNDER_IPMP(ill)) { 2374 probe = ipif_lookup_testaddr_v6(ill, sender, NULL); 2375 /* 2376 * We send non-probe packets on the upper IPMP interface. 2377 * ip_output_simple() will use cast_ill for sending any 2378 * multicast packets. Note that we can't follow the same 2379 * logic for probe packets because all interfaces in the ipmp 2380 * group may have failed, so that we really want to only try 2381 * to send the ND packet on the ill corresponding to the src 2382 * address. 2383 */ 2384 if (!probe) { 2385 ill = ipmp_ill_hold_ipmp_ill(ill); 2386 if (ill != NULL) 2387 need_refrele = B_TRUE; 2388 else 2389 ill = hwaddr_ill; 2390 } 2391 } 2392 2393 /* 2394 * If we have a unspecified source(sender) address, select a 2395 * proper source address for the solicitation here itself so 2396 * that we can initialize the h/w address correctly. 2397 * 2398 * If the sender is specified then we use this address in order 2399 * to lookup the zoneid before calling ip_output_v6(). This is to 2400 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly 2401 * by IP (we cannot guarantee that the global zone has an interface 2402 * route to the destination). 2403 * 2404 * Note that the NA never comes here with the unspecified source 2405 * address. 2406 */ 2407 2408 /* 2409 * Probes will have unspec src at this point. 2410 */ 2411 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { 2412 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); 2413 /* 2414 * It's possible for ipif_lookup_addr_zoneid_v6() to return 2415 * ALL_ZONES if it cannot find a matching ipif for the address 2416 * we are trying to use. In this case we err on the side of 2417 * trying to send the packet by defaulting to the GLOBAL_ZONEID. 2418 */ 2419 if (zoneid == ALL_ZONES) 2420 zoneid = GLOBAL_ZONEID; 2421 } 2422 2423 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; 2424 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; 2425 mp = allocb(len, BPRI_LO); 2426 if (mp == NULL) { 2427 if (need_refrele) 2428 ill_refrele(ill); 2429 return (B_TRUE); 2430 } 2431 2432 bzero((char *)mp->b_rptr, len); 2433 mp->b_wptr = mp->b_rptr + len; 2434 2435 bzero(&ixas, sizeof (ixas)); 2436 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM; 2437 2438 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 2439 ixas.ixa_ipst = ipst; 2440 ixas.ixa_cred = kcred; 2441 ixas.ixa_cpid = NOPID; 2442 ixas.ixa_tsl = NULL; 2443 ixas.ixa_zoneid = zoneid; 2444 2445 ip6h = (ip6_t *)mp->b_rptr; 2446 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2447 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2448 ip6h->ip6_nxt = IPPROTO_ICMPV6; 2449 ip6h->ip6_hops = IPV6_MAX_HOPS; 2450 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 2451 ip6h->ip6_dst = *target; 2452 icmp6 = (icmp6_t *)&ip6h[1]; 2453 2454 if (hw_addr_len != 0) { 2455 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + 2456 sizeof (nd_neighbor_advert_t)); 2457 } else { 2458 opt = NULL; 2459 } 2460 if (operation == ND_NEIGHBOR_SOLICIT) { 2461 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 2462 2463 if (opt != NULL && !(flag & NDP_PROBE)) { 2464 /* 2465 * Note that we don't send out SLLA for ND probes 2466 * per RFC 4862, even though we do send out the src 2467 * haddr for IPv4 DAD probes, even though both IPv4 2468 * and IPv6 go out with the unspecified/INADDR_ANY 2469 * src IP addr. 2470 */ 2471 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; 2472 } 2473 ip6h->ip6_src = *sender; 2474 ns->nd_ns_target = *target; 2475 if (!(flag & NDP_UNICAST)) { 2476 /* Form multicast address of the target */ 2477 ip6h->ip6_dst = ipv6_solicited_node_mcast; 2478 ip6h->ip6_dst.s6_addr32[3] |= 2479 ns->nd_ns_target.s6_addr32[3]; 2480 } 2481 } else { 2482 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; 2483 2484 ASSERT(!(flag & NDP_PROBE)); 2485 if (opt != NULL) 2486 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; 2487 ip6h->ip6_src = *sender; 2488 na->nd_na_target = *sender; 2489 if (flag & NDP_ISROUTER) 2490 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; 2491 if (flag & NDP_SOLICITED) 2492 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; 2493 if (flag & NDP_ORIDE) 2494 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; 2495 } 2496 2497 if (!(flag & NDP_PROBE)) { 2498 if (hw_addr != NULL && opt != NULL) { 2499 /* Fill in link layer address and option len */ 2500 opt->nd_opt_len = (uint8_t)plen; 2501 bcopy(hw_addr, &opt[1], hw_addr_len); 2502 } 2503 } 2504 if (opt != NULL && opt->nd_opt_type == 0) { 2505 /* If there's no link layer address option, then strip it. */ 2506 len -= plen * 8; 2507 mp->b_wptr = mp->b_rptr + len; 2508 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2509 } 2510 2511 icmp6->icmp6_type = (uint8_t)operation; 2512 icmp6->icmp6_code = 0; 2513 /* 2514 * Prepare for checksum by putting icmp length in the icmp 2515 * checksum field. The checksum is calculated in ip_output.c. 2516 */ 2517 icmp6->icmp6_cksum = ip6h->ip6_plen; 2518 2519 (void) ip_output_simple(mp, &ixas); 2520 ixa_cleanup(&ixas); 2521 if (need_refrele) 2522 ill_refrele(ill); 2523 return (B_FALSE); 2524 } 2525 2526 /* 2527 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 2528 * The datapath uses this as an indication that there 2529 * is a problem (as opposed to a NCE that was just 2530 * reclaimed due to lack of memory. 2531 * Note that static ARP entries never become unreachable. 2532 */ 2533 void 2534 nce_make_unreachable(ncec_t *ncec) 2535 { 2536 mutex_enter(&ncec->ncec_lock); 2537 ncec->ncec_state = ND_UNREACHABLE; 2538 mutex_exit(&ncec->ncec_lock); 2539 } 2540 2541 /* 2542 * NCE retransmit timer. Common to IPv4 and IPv6. 2543 * This timer goes off when: 2544 * a. It is time to retransmit a resolution for resolver. 2545 * b. It is time to send reachability probes. 2546 */ 2547 void 2548 nce_timer(void *arg) 2549 { 2550 ncec_t *ncec = arg; 2551 ill_t *ill = ncec->ncec_ill, *src_ill; 2552 char addrbuf[INET6_ADDRSTRLEN]; 2553 boolean_t dropped = B_FALSE; 2554 ip_stack_t *ipst = ncec->ncec_ipst; 2555 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2556 in_addr_t sender4 = INADDR_ANY; 2557 in6_addr_t sender6 = ipv6_all_zeros; 2558 2559 /* 2560 * The timer has to be cancelled by ncec_delete before doing the final 2561 * refrele. So the NCE is guaranteed to exist when the timer runs 2562 * until it clears the timeout_id. Before clearing the timeout_id 2563 * bump up the refcnt so that we can continue to use the ncec 2564 */ 2565 ASSERT(ncec != NULL); 2566 mutex_enter(&ncec->ncec_lock); 2567 ncec_refhold_locked(ncec); 2568 ncec->ncec_timeout_id = 0; 2569 mutex_exit(&ncec->ncec_lock); 2570 2571 src_ill = nce_resolve_src(ncec, &sender6); 2572 /* if we could not find a sender address, return */ 2573 if (src_ill == NULL) { 2574 if (!isv6) { 2575 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); 2576 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, 2577 &sender4, addrbuf, sizeof (addrbuf)))); 2578 } else { 2579 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, 2580 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2581 } 2582 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2583 ncec_refrele(ncec); 2584 return; 2585 } 2586 if (!isv6) 2587 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 2588 2589 mutex_enter(&ncec->ncec_lock); 2590 /* 2591 * Check the reachability state. 2592 */ 2593 switch (ncec->ncec_state) { 2594 case ND_DELAY: 2595 ASSERT(ncec->ncec_lladdr != NULL); 2596 ncec->ncec_state = ND_PROBE; 2597 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2598 if (isv6) { 2599 mutex_exit(&ncec->ncec_lock); 2600 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 2601 src_ill->ill_phys_addr, 2602 src_ill->ill_phys_addr_length, 2603 &sender6, &ncec->ncec_addr, 2604 NDP_UNICAST); 2605 } else { 2606 dropped = (arp_request(ncec, sender4, src_ill) == 0); 2607 mutex_exit(&ncec->ncec_lock); 2608 } 2609 if (!dropped) { 2610 mutex_enter(&ncec->ncec_lock); 2611 ncec->ncec_pcnt--; 2612 mutex_exit(&ncec->ncec_lock); 2613 } 2614 if (ip_debug > 3) { 2615 /* ip2dbg */ 2616 pr_addr_dbg("nce_timer: state for %s changed " 2617 "to PROBE\n", AF_INET6, &ncec->ncec_addr); 2618 } 2619 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2620 break; 2621 case ND_PROBE: 2622 /* must be retransmit timer */ 2623 ASSERT(ncec->ncec_pcnt >= -1); 2624 if (ncec->ncec_pcnt > 0) { 2625 /* 2626 * As per RFC2461, the ncec gets deleted after 2627 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. 2628 * Note that the first unicast solicitation is sent 2629 * during the DELAY state. 2630 */ 2631 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2632 ncec->ncec_pcnt, 2633 inet_ntop((isv6? AF_INET6 : AF_INET), 2634 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2635 if (NCE_PUBLISH(ncec)) { 2636 mutex_exit(&ncec->ncec_lock); 2637 /* 2638 * send out a probe; note that src_ill 2639 * is ignored by nce_dad() for all 2640 * DAD message types other than IPv6 2641 * unicast probes 2642 */ 2643 nce_dad(ncec, src_ill, B_TRUE); 2644 } else { 2645 ASSERT(src_ill != NULL); 2646 if (isv6) { 2647 mutex_exit(&ncec->ncec_lock); 2648 dropped = ndp_xmit(src_ill, 2649 ND_NEIGHBOR_SOLICIT, 2650 src_ill->ill_phys_addr, 2651 src_ill->ill_phys_addr_length, 2652 &sender6, &ncec->ncec_addr, 2653 NDP_UNICAST); 2654 } else { 2655 /* 2656 * since the nce is REACHABLE, 2657 * the ARP request will be sent out 2658 * as a link-layer unicast. 2659 */ 2660 dropped = (arp_request(ncec, sender4, 2661 src_ill) == 0); 2662 mutex_exit(&ncec->ncec_lock); 2663 } 2664 if (!dropped) { 2665 mutex_enter(&ncec->ncec_lock); 2666 ncec->ncec_pcnt--; 2667 mutex_exit(&ncec->ncec_lock); 2668 } 2669 nce_restart_timer(ncec, 2670 ill->ill_reachable_retrans_time); 2671 } 2672 } else if (ncec->ncec_pcnt < 0) { 2673 /* No hope, delete the ncec */ 2674 /* Tell datapath it went bad */ 2675 ncec->ncec_state = ND_UNREACHABLE; 2676 mutex_exit(&ncec->ncec_lock); 2677 if (ip_debug > 2) { 2678 /* ip1dbg */ 2679 pr_addr_dbg("nce_timer: Delete NCE for" 2680 " dst %s\n", (isv6? AF_INET6: AF_INET), 2681 &ncec->ncec_addr); 2682 } 2683 /* if static ARP can't delete. */ 2684 if ((ncec->ncec_flags & NCE_F_STATIC) == 0) 2685 ncec_delete(ncec); 2686 2687 } else if (!NCE_PUBLISH(ncec)) { 2688 /* 2689 * Probe count is 0 for a dynamic entry (one that we 2690 * ourselves are not publishing). We should never get 2691 * here if NONUD was requested, hence the ASSERT below. 2692 */ 2693 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); 2694 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2695 ncec->ncec_pcnt, inet_ntop(AF_INET6, 2696 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2697 ncec->ncec_pcnt--; 2698 mutex_exit(&ncec->ncec_lock); 2699 /* Wait one interval before killing */ 2700 nce_restart_timer(ncec, 2701 ill->ill_reachable_retrans_time); 2702 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { 2703 ipif_t *ipif; 2704 ipaddr_t ncec_addr; 2705 2706 /* 2707 * We're done probing, and we can now declare this 2708 * address to be usable. Let IP know that it's ok to 2709 * use. 2710 */ 2711 ncec->ncec_state = ND_REACHABLE; 2712 ncec->ncec_flags &= ~NCE_F_UNVERIFIED; 2713 mutex_exit(&ncec->ncec_lock); 2714 if (isv6) { 2715 ipif = ipif_lookup_addr_exact_v6( 2716 &ncec->ncec_addr, ill, ipst); 2717 } else { 2718 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 2719 ncec_addr); 2720 ipif = ipif_lookup_addr_exact(ncec_addr, ill, 2721 ipst); 2722 } 2723 if (ipif != NULL) { 2724 if (ipif->ipif_was_dup) { 2725 char ibuf[LIFNAMSIZ]; 2726 char sbuf[INET6_ADDRSTRLEN]; 2727 2728 ipif->ipif_was_dup = B_FALSE; 2729 (void) inet_ntop(AF_INET6, 2730 &ipif->ipif_v6lcl_addr, 2731 sbuf, sizeof (sbuf)); 2732 ipif_get_name(ipif, ibuf, 2733 sizeof (ibuf)); 2734 cmn_err(CE_NOTE, "recovered address " 2735 "%s on %s", sbuf, ibuf); 2736 } 2737 if ((ipif->ipif_flags & IPIF_UP) && 2738 !ipif->ipif_addr_ready) 2739 ipif_up_notify(ipif); 2740 ipif->ipif_addr_ready = 1; 2741 ipif_refrele(ipif); 2742 } 2743 if (!isv6 && arp_no_defense) 2744 break; 2745 /* Begin defending our new address */ 2746 if (ncec->ncec_unsolicit_count > 0) { 2747 ncec->ncec_unsolicit_count--; 2748 if (isv6) { 2749 dropped = ndp_announce(ncec); 2750 } else { 2751 dropped = arp_announce(ncec); 2752 } 2753 2754 if (dropped) 2755 ncec->ncec_unsolicit_count++; 2756 else 2757 ncec->ncec_last_time_defended = 2758 ddi_get_lbolt(); 2759 } 2760 if (ncec->ncec_unsolicit_count > 0) { 2761 nce_restart_timer(ncec, 2762 ANNOUNCE_INTERVAL(isv6)); 2763 } else if (DEFENSE_INTERVAL(isv6) != 0) { 2764 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2765 } 2766 } else { 2767 /* 2768 * This is an address we're probing to be our own, but 2769 * the ill is down. Wait until it comes back before 2770 * doing anything, but switch to reachable state so 2771 * that the restart will work. 2772 */ 2773 ncec->ncec_state = ND_REACHABLE; 2774 mutex_exit(&ncec->ncec_lock); 2775 } 2776 break; 2777 case ND_INCOMPLETE: { 2778 mblk_t *mp, *nextmp; 2779 mblk_t **prevmpp; 2780 2781 /* 2782 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp 2783 * for any IPMP probe packets, and toss them. IPMP probe 2784 * packets will always be at the head of ncec_qd_mp, so that 2785 * we can stop at the first queued ND packet that is 2786 * not a probe packet. 2787 */ 2788 prevmpp = &ncec->ncec_qd_mp; 2789 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { 2790 nextmp = mp->b_next; 2791 2792 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { 2793 inet_freemsg(mp); 2794 ncec->ncec_nprobes--; 2795 *prevmpp = nextmp; 2796 } else { 2797 prevmpp = &mp->b_next; 2798 } 2799 } 2800 2801 /* 2802 * Must be resolver's retransmit timer. 2803 */ 2804 mutex_exit(&ncec->ncec_lock); 2805 ip_ndp_resolve(ncec); 2806 break; 2807 } 2808 case ND_REACHABLE: 2809 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && 2810 ncec->ncec_unsolicit_count != 0) || 2811 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { 2812 if (ncec->ncec_unsolicit_count > 0) { 2813 ncec->ncec_unsolicit_count--; 2814 mutex_exit(&ncec->ncec_lock); 2815 /* 2816 * When we get to zero announcements left, 2817 * switch to address defense 2818 */ 2819 } else { 2820 boolean_t rate_limit; 2821 2822 mutex_exit(&ncec->ncec_lock); 2823 rate_limit = ill_defend_rate_limit(ill, ncec); 2824 if (rate_limit) { 2825 nce_restart_timer(ncec, 2826 DEFENSE_INTERVAL(isv6)); 2827 break; 2828 } 2829 } 2830 if (isv6) { 2831 dropped = ndp_announce(ncec); 2832 } else { 2833 dropped = arp_announce(ncec); 2834 } 2835 mutex_enter(&ncec->ncec_lock); 2836 if (dropped) { 2837 ncec->ncec_unsolicit_count++; 2838 } else { 2839 ncec->ncec_last_time_defended = 2840 ddi_get_lbolt(); 2841 } 2842 mutex_exit(&ncec->ncec_lock); 2843 if (ncec->ncec_unsolicit_count != 0) { 2844 nce_restart_timer(ncec, 2845 ANNOUNCE_INTERVAL(isv6)); 2846 } else { 2847 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2848 } 2849 } else { 2850 mutex_exit(&ncec->ncec_lock); 2851 } 2852 break; 2853 default: 2854 mutex_exit(&ncec->ncec_lock); 2855 break; 2856 } 2857 done: 2858 ncec_refrele(ncec); 2859 ill_refrele(src_ill); 2860 } 2861 2862 /* 2863 * Set a link layer address from the ll_addr passed in. 2864 * Copy SAP from ill. 2865 */ 2866 static void 2867 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) 2868 { 2869 ill_t *ill = ncec->ncec_ill; 2870 2871 ASSERT(ll_addr != NULL); 2872 if (ill->ill_phys_addr_length > 0) { 2873 /* 2874 * The bcopy() below used to be called for the physical address 2875 * length rather than the link layer address length. For 2876 * ethernet and many other media, the phys_addr and lla are 2877 * identical. 2878 * 2879 * The phys_addr and lla may not be the same for devices that 2880 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently 2881 * no known instances of these. 2882 * 2883 * For PPP or other interfaces with a zero length 2884 * physical address, don't do anything here. 2885 * The bcopy() with a zero phys_addr length was previously 2886 * a no-op for interfaces with a zero-length physical address. 2887 * Using the lla for them would change the way they operate. 2888 * Doing nothing in such cases preserves expected behavior. 2889 */ 2890 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); 2891 } 2892 } 2893 2894 boolean_t 2895 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, 2896 uint32_t ll_addr_len) 2897 { 2898 ASSERT(ncec->ncec_lladdr != NULL); 2899 if (ll_addr == NULL) 2900 return (B_FALSE); 2901 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) 2902 return (B_TRUE); 2903 return (B_FALSE); 2904 } 2905 2906 /* 2907 * Updates the link layer address or the reachability state of 2908 * a cache entry. Reset probe counter if needed. 2909 */ 2910 void 2911 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) 2912 { 2913 ill_t *ill = ncec->ncec_ill; 2914 boolean_t need_stop_timer = B_FALSE; 2915 boolean_t need_fastpath_update = B_FALSE; 2916 nce_t *nce = NULL; 2917 timeout_id_t tid; 2918 2919 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2920 /* 2921 * If this interface does not do NUD, there is no point 2922 * in allowing an update to the cache entry. Although 2923 * we will respond to NS. 2924 * The only time we accept an update for a resolver when 2925 * NUD is turned off is when it has just been created. 2926 * Non-Resolvers will always be created as REACHABLE. 2927 */ 2928 if (new_state != ND_UNCHANGED) { 2929 if ((ncec->ncec_flags & NCE_F_NONUD) && 2930 (ncec->ncec_state != ND_INCOMPLETE)) 2931 return; 2932 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); 2933 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); 2934 need_stop_timer = B_TRUE; 2935 if (new_state == ND_REACHABLE) 2936 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64()); 2937 else { 2938 /* We force NUD in this case */ 2939 ncec->ncec_last = 0; 2940 } 2941 ncec->ncec_state = new_state; 2942 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2943 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || 2944 new_state == ND_INCOMPLETE); 2945 } 2946 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2947 tid = ncec->ncec_timeout_id; 2948 ncec->ncec_timeout_id = 0; 2949 } 2950 /* 2951 * Re-trigger fastpath probe and 2952 * overwrite the DL_UNITDATA_REQ data, noting we'll lose 2953 * whatever packets that happens to be transmitting at the time. 2954 */ 2955 if (new_ll_addr != NULL) { 2956 bcopy(new_ll_addr, ncec->ncec_lladdr, 2957 ill->ill_phys_addr_length); 2958 need_fastpath_update = B_TRUE; 2959 } 2960 mutex_exit(&ncec->ncec_lock); 2961 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2962 if (tid != 0) 2963 (void) untimeout(tid); 2964 } 2965 if (need_fastpath_update) { 2966 /* 2967 * Delete any existing existing dlur_mp and fp_mp information. 2968 * For IPMP interfaces, all underlying ill's must be checked 2969 * and purged. 2970 */ 2971 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 2972 /* 2973 * add the new dlur_mp and fp_mp 2974 */ 2975 nce = nce_fastpath(ncec, B_TRUE, NULL); 2976 if (nce != NULL) 2977 nce_refrele(nce); 2978 } 2979 mutex_enter(&ncec->ncec_lock); 2980 } 2981 2982 static void 2983 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2984 { 2985 uint_t count = 0; 2986 mblk_t **mpp, *tmp; 2987 2988 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2989 2990 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { 2991 if (++count > ncec->ncec_ill->ill_max_buf) { 2992 tmp = ncec->ncec_qd_mp->b_next; 2993 ncec->ncec_qd_mp->b_next = NULL; 2994 /* 2995 * if we never create data addrs on the under_ill 2996 * does this matter? 2997 */ 2998 BUMP_MIB(ncec->ncec_ill->ill_ip_mib, 2999 ipIfStatsOutDiscards); 3000 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, 3001 ncec->ncec_ill); 3002 freemsg(ncec->ncec_qd_mp); 3003 ncec->ncec_qd_mp = tmp; 3004 } 3005 } 3006 3007 if (head_insert) { 3008 ncec->ncec_nprobes++; 3009 mp->b_next = ncec->ncec_qd_mp; 3010 ncec->ncec_qd_mp = mp; 3011 } else { 3012 *mpp = mp; 3013 } 3014 } 3015 3016 /* 3017 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be 3018 * queued at the head or tail of the queue based on the input argument 3019 * 'head_insert'. The caller should specify this argument as B_TRUE if this 3020 * packet is an IPMP probe packet, in which case the following happens: 3021 * 3022 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal 3023 * (non-ipmp_probe) load-speading case where the source address of the ND 3024 * packet is not tied to ncec_ill. If the ill bound to the source address 3025 * cannot receive, the response to the ND packet will not be received. 3026 * However, if ND packets for ncec_ill's probes are queued behind that ND 3027 * packet, those probes will also fail to be sent, and thus in.mpathd will 3028 * erroneously conclude that ncec_ill has also failed. 3029 * 3030 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on 3031 * the first attempt. This ensures that ND problems do not manifest as 3032 * probe RTT spikes. 3033 * 3034 * We achieve this by inserting ipmp_probe() packets at the head of the 3035 * nce_queue. 3036 * 3037 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, 3038 * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 3039 */ 3040 void 3041 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 3042 { 3043 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3044 nce_queue_mp_common(ncec, mp, head_insert); 3045 } 3046 3047 /* 3048 * Called when address resolution failed due to a timeout. 3049 * Send an ICMP unreachable in response to all queued packets. 3050 */ 3051 void 3052 ndp_resolv_failed(ncec_t *ncec) 3053 { 3054 mblk_t *mp, *nxt_mp; 3055 char buf[INET6_ADDRSTRLEN]; 3056 ill_t *ill = ncec->ncec_ill; 3057 ip_recv_attr_t iras; 3058 3059 bzero(&iras, sizeof (iras)); 3060 iras.ira_flags = 0; 3061 /* 3062 * we are setting the ira_rill to the ipmp_ill (instead of 3063 * the actual ill on which the packet was received), but this 3064 * is ok because we don't actually need the real ira_rill. 3065 * to send the icmp unreachable to the sender. 3066 */ 3067 iras.ira_ill = iras.ira_rill = ill; 3068 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3069 iras.ira_rifindex = iras.ira_ruifindex; 3070 3071 ip1dbg(("ndp_resolv_failed: dst %s\n", 3072 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 3073 mutex_enter(&ncec->ncec_lock); 3074 mp = ncec->ncec_qd_mp; 3075 ncec->ncec_qd_mp = NULL; 3076 ncec->ncec_nprobes = 0; 3077 mutex_exit(&ncec->ncec_lock); 3078 while (mp != NULL) { 3079 nxt_mp = mp->b_next; 3080 mp->b_next = NULL; 3081 3082 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3083 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 3084 mp, ill); 3085 icmp_unreachable_v6(mp, 3086 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); 3087 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3088 mp = nxt_mp; 3089 } 3090 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 3091 } 3092 3093 /* 3094 * Handle the completion of NDP and ARP resolution. 3095 */ 3096 void 3097 nce_resolv_ok(ncec_t *ncec) 3098 { 3099 mblk_t *mp; 3100 uint_t pkt_len; 3101 iaflags_t ixaflags = IXAF_NO_TRACE; 3102 nce_t *nce; 3103 ill_t *ill = ncec->ncec_ill; 3104 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 3105 ip_stack_t *ipst = ill->ill_ipst; 3106 3107 if (IS_IPMP(ncec->ncec_ill)) { 3108 nce_resolv_ipmp_ok(ncec); 3109 return; 3110 } 3111 /* non IPMP case */ 3112 3113 mutex_enter(&ncec->ncec_lock); 3114 ASSERT(ncec->ncec_nprobes == 0); 3115 mp = ncec->ncec_qd_mp; 3116 ncec->ncec_qd_mp = NULL; 3117 mutex_exit(&ncec->ncec_lock); 3118 3119 while (mp != NULL) { 3120 mblk_t *nxt_mp; 3121 3122 if (ill->ill_isv6) { 3123 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 3124 3125 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 3126 } else { 3127 ipha_t *ipha = (ipha_t *)mp->b_rptr; 3128 3129 ixaflags |= IXAF_IS_IPV4; 3130 pkt_len = ntohs(ipha->ipha_length); 3131 } 3132 nxt_mp = mp->b_next; 3133 mp->b_next = NULL; 3134 /* 3135 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no 3136 * longer available, but it's ok to drop this flag because TCP 3137 * has its own flow-control in effect, so TCP packets 3138 * are not likely to get here when flow-control is in effect. 3139 */ 3140 mutex_enter(&ill->ill_lock); 3141 nce = nce_lookup(ill, &ncec->ncec_addr); 3142 mutex_exit(&ill->ill_lock); 3143 3144 if (nce == NULL) { 3145 if (isv6) { 3146 BUMP_MIB(&ipst->ips_ip6_mib, 3147 ipIfStatsOutDiscards); 3148 } else { 3149 BUMP_MIB(&ipst->ips_ip_mib, 3150 ipIfStatsOutDiscards); 3151 } 3152 ip_drop_output("ipIfStatsOutDiscards - no nce", 3153 mp, NULL); 3154 freemsg(mp); 3155 } else { 3156 /* 3157 * We don't know the zoneid, but 3158 * ip_xmit does not care since IXAF_NO_TRACE 3159 * is set. (We traced the packet the first 3160 * time through ip_xmit.) 3161 */ 3162 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, 3163 ALL_ZONES, 0, NULL); 3164 nce_refrele(nce); 3165 } 3166 mp = nxt_mp; 3167 } 3168 3169 ncec_cb_dispatch(ncec); /* complete callbacks */ 3170 } 3171 3172 /* 3173 * Called by SIOCSNDP* ioctl to add/change an ncec entry 3174 * and the corresponding attributes. 3175 * Disallow states other than ND_REACHABLE or ND_STALE. 3176 */ 3177 int 3178 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) 3179 { 3180 sin6_t *sin6; 3181 in6_addr_t *addr; 3182 ncec_t *ncec; 3183 nce_t *nce; 3184 int err = 0; 3185 uint16_t new_flags = 0; 3186 uint16_t old_flags = 0; 3187 int inflags = lnr->lnr_flags; 3188 ip_stack_t *ipst = ill->ill_ipst; 3189 boolean_t do_postprocess = B_FALSE; 3190 3191 ASSERT(ill->ill_isv6); 3192 if ((lnr->lnr_state_create != ND_REACHABLE) && 3193 (lnr->lnr_state_create != ND_STALE)) 3194 return (EINVAL); 3195 3196 sin6 = (sin6_t *)&lnr->lnr_addr; 3197 addr = &sin6->sin6_addr; 3198 3199 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 3200 ASSERT(!IS_UNDER_IPMP(ill)); 3201 nce = nce_lookup_addr(ill, addr); 3202 if (nce != NULL) 3203 new_flags = nce->nce_common->ncec_flags; 3204 3205 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { 3206 case NDF_ISROUTER_ON: 3207 new_flags |= NCE_F_ISROUTER; 3208 break; 3209 case NDF_ISROUTER_OFF: 3210 new_flags &= ~NCE_F_ISROUTER; 3211 break; 3212 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): 3213 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3214 if (nce != NULL) 3215 nce_refrele(nce); 3216 return (EINVAL); 3217 } 3218 if (inflags & NDF_STATIC) 3219 new_flags |= NCE_F_STATIC; 3220 3221 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { 3222 case NDF_ANYCAST_ON: 3223 new_flags |= NCE_F_ANYCAST; 3224 break; 3225 case NDF_ANYCAST_OFF: 3226 new_flags &= ~NCE_F_ANYCAST; 3227 break; 3228 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): 3229 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3230 if (nce != NULL) 3231 nce_refrele(nce); 3232 return (EINVAL); 3233 } 3234 3235 if (nce == NULL) { 3236 err = nce_add_v6(ill, 3237 (uchar_t *)lnr->lnr_hdw_addr, 3238 ill->ill_phys_addr_length, 3239 addr, 3240 new_flags, 3241 lnr->lnr_state_create, 3242 &nce); 3243 if (err != 0) { 3244 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3245 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); 3246 return (err); 3247 } else { 3248 do_postprocess = B_TRUE; 3249 } 3250 } 3251 ncec = nce->nce_common; 3252 old_flags = ncec->ncec_flags; 3253 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { 3254 ncec_router_to_host(ncec); 3255 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3256 if (do_postprocess) 3257 err = nce_add_v6_postprocess(nce); 3258 nce_refrele(nce); 3259 return (0); 3260 } 3261 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3262 3263 if (do_postprocess) 3264 err = nce_add_v6_postprocess(nce); 3265 /* 3266 * err cannot be anything other than 0 because we don't support 3267 * proxy arp of static addresses. 3268 */ 3269 ASSERT(err == 0); 3270 3271 mutex_enter(&ncec->ncec_lock); 3272 ncec->ncec_flags = new_flags; 3273 mutex_exit(&ncec->ncec_lock); 3274 /* 3275 * Note that we ignore the state at this point, which 3276 * should be either STALE or REACHABLE. Instead we let 3277 * the link layer address passed in to determine the state 3278 * much like incoming packets. 3279 */ 3280 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); 3281 nce_refrele(nce); 3282 return (0); 3283 } 3284 3285 /* 3286 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up 3287 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must 3288 * be held to ensure that they are in the same group. 3289 */ 3290 static nce_t * 3291 nce_fastpath_create(ill_t *ill, ncec_t *ncec) 3292 { 3293 3294 nce_t *nce; 3295 3296 nce = nce_ill_lookup_then_add(ill, ncec); 3297 3298 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3299 return (nce); 3300 3301 /* 3302 * hold the ncec_lock to synchronize with nce_update() so that, 3303 * at the end of this function, the contents of nce_dlur_mp are 3304 * consistent with ncec->ncec_lladdr, even though some intermediate 3305 * packet may have been sent out with a mangled address, which would 3306 * only be a transient condition. 3307 */ 3308 mutex_enter(&ncec->ncec_lock); 3309 if (ncec->ncec_lladdr != NULL) { 3310 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr + 3311 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length); 3312 } else { 3313 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap, 3314 ill->ill_sap_length); 3315 } 3316 mutex_exit(&ncec->ncec_lock); 3317 return (nce); 3318 } 3319 3320 /* 3321 * we make nce_fp_mp to have an M_DATA prepend. 3322 * The caller ensures there is hold on ncec for this function. 3323 * Note that since ill_fastpath_probe() copies the mblk there is 3324 * no need to hold the nce or ncec beyond this function. 3325 * 3326 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that 3327 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill 3328 * and will be returned back by this function, so that no extra nce_refrele 3329 * is required for the caller. The calls from nce_add_common() use this 3330 * method. All other callers (that pass in NULL ncec_nce) will have to do a 3331 * nce_refrele of the returned nce (when it is non-null). 3332 */ 3333 static nce_t * 3334 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) 3335 { 3336 nce_t *nce; 3337 ill_t *ill = ncec->ncec_ill; 3338 3339 ASSERT(ill != NULL); 3340 3341 if (IS_IPMP(ill) && trigger_fp_req) { 3342 trigger_fp_req = B_FALSE; 3343 ipmp_ncec_refresh_nce(ncec); 3344 } 3345 3346 /* 3347 * If the caller already has the nce corresponding to the ill, use 3348 * that one. Otherwise we have to lookup/add the nce. Calls from 3349 * nce_add_common() fall in the former category, and have just done 3350 * the nce lookup/add that can be reused. 3351 */ 3352 if (ncec_nce == NULL) 3353 nce = nce_fastpath_create(ill, ncec); 3354 else 3355 nce = ncec_nce; 3356 3357 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3358 return (nce); 3359 3360 if (trigger_fp_req) 3361 nce_fastpath_trigger(nce); 3362 return (nce); 3363 } 3364 3365 /* 3366 * Trigger fastpath on nce. No locks may be held. 3367 */ 3368 static void 3369 nce_fastpath_trigger(nce_t *nce) 3370 { 3371 int res; 3372 ill_t *ill = nce->nce_ill; 3373 ncec_t *ncec = nce->nce_common; 3374 3375 res = ill_fastpath_probe(ill, nce->nce_dlur_mp); 3376 /* 3377 * EAGAIN is an indication of a transient error 3378 * i.e. allocation failure etc. leave the ncec in the list it 3379 * will be updated when another probe happens for another ire 3380 * if not it will be taken out of the list when the ire is 3381 * deleted. 3382 */ 3383 if (res != 0 && res != EAGAIN && res != ENOTSUP) 3384 nce_fastpath_list_delete(ill, ncec, NULL); 3385 } 3386 3387 /* 3388 * Add ncec to the nce fastpath list on ill. 3389 */ 3390 static nce_t * 3391 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard) 3392 { 3393 nce_t *nce = NULL; 3394 3395 ASSERT(MUTEX_HELD(&ill->ill_lock)); 3396 /* 3397 * Atomically ensure that the ill is not CONDEMNED and is not going 3398 * down, before adding the NCE. 3399 */ 3400 if (ill->ill_state_flags & ILL_CONDEMNED) 3401 return (NULL); 3402 mutex_enter(&ncec->ncec_lock); 3403 /* 3404 * if ncec has not been deleted and 3405 * is not already in the list add it. 3406 */ 3407 if (!NCE_ISCONDEMNED(ncec)) { 3408 nce = nce_lookup(ill, &ncec->ncec_addr); 3409 if (nce != NULL) 3410 goto done; 3411 nce = nce_add(ill, ncec, graveyard); 3412 } 3413 done: 3414 mutex_exit(&ncec->ncec_lock); 3415 return (nce); 3416 } 3417 3418 static nce_t * 3419 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) 3420 { 3421 nce_t *nce; 3422 list_t graveyard; 3423 3424 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node)); 3425 mutex_enter(&ill->ill_lock); 3426 nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard); 3427 mutex_exit(&ill->ill_lock); 3428 nce_graveyard_free(&graveyard); 3429 return (nce); 3430 } 3431 3432 3433 /* 3434 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted 3435 * nce is added to the 'dead' list, and the caller must nce_refrele() the 3436 * entry after all locks have been dropped. 3437 */ 3438 void 3439 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead) 3440 { 3441 nce_t *nce; 3442 3443 ASSERT(ill != NULL); 3444 3445 /* delete any nces referencing the ncec from underlying ills */ 3446 if (IS_IPMP(ill)) 3447 ipmp_ncec_delete_nce(ncec); 3448 3449 /* now the ill itself */ 3450 mutex_enter(&ill->ill_lock); 3451 for (nce = list_head(&ill->ill_nce); nce != NULL; 3452 nce = list_next(&ill->ill_nce, nce)) { 3453 if (nce->nce_common == ncec) { 3454 nce_refhold(nce); 3455 nce_delete(nce); 3456 break; 3457 } 3458 } 3459 mutex_exit(&ill->ill_lock); 3460 if (nce != NULL) { 3461 if (dead == NULL) 3462 nce_refrele(nce); 3463 else 3464 list_insert_tail(dead, nce); 3465 } 3466 } 3467 3468 /* 3469 * when the fastpath response does not fit in the datab 3470 * associated with the existing nce_fp_mp, we delete and 3471 * add the nce to retrigger fastpath based on the information 3472 * in the ncec_t. 3473 */ 3474 static nce_t * 3475 nce_delete_then_add(nce_t *nce) 3476 { 3477 ill_t *ill = nce->nce_ill; 3478 nce_t *newnce = NULL; 3479 list_t graveyard; 3480 3481 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node)); 3482 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3483 (void *)nce, ill->ill_name)); 3484 mutex_enter(&ill->ill_lock); 3485 mutex_enter(&nce->nce_common->ncec_lock); 3486 nce_delete(nce); 3487 /* 3488 * Make sure that ncec is not condemned before adding. We hold the 3489 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3490 * ipmp_ncec_delete_nce() 3491 */ 3492 if (!NCE_ISCONDEMNED(nce->nce_common)) 3493 newnce = nce_add(ill, nce->nce_common, &graveyard); 3494 mutex_exit(&nce->nce_common->ncec_lock); 3495 mutex_exit(&ill->ill_lock); 3496 nce_graveyard_free(&graveyard); 3497 nce_refrele(nce); 3498 return (newnce); /* could be null if nomem */ 3499 } 3500 3501 typedef struct nce_fp_match_s { 3502 nce_t *nce_fp_match_res; 3503 mblk_t *nce_fp_match_ack_mp; 3504 } nce_fp_match_t; 3505 3506 /* ARGSUSED */ 3507 static int 3508 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3509 { 3510 nce_fp_match_t *nce_fp_marg = arg; 3511 ncec_t *ncec = nce->nce_common; 3512 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3513 uchar_t *mp_rptr, *ud_mp_rptr; 3514 mblk_t *ud_mp = nce->nce_dlur_mp; 3515 ptrdiff_t cmplen; 3516 3517 /* 3518 * mp is the mp associated with the fastpath ack. 3519 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3520 * under consideration. If the contents match, then the 3521 * fastpath ack is used to update the nce. 3522 */ 3523 if (ud_mp == NULL) 3524 return (0); 3525 mp_rptr = mp->b_rptr; 3526 cmplen = mp->b_wptr - mp_rptr; 3527 ASSERT(cmplen >= 0); 3528 3529 ud_mp_rptr = ud_mp->b_rptr; 3530 /* 3531 * The ncec is locked here to prevent any other threads from accessing 3532 * and changing nce_dlur_mp when the address becomes resolved to an 3533 * lla while we're in the middle of looking at and comparing the 3534 * hardware address (lla). It is also locked to prevent multiple 3535 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3536 * time. 3537 */ 3538 mutex_enter(&ncec->ncec_lock); 3539 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3540 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3541 nce_fp_marg->nce_fp_match_res = nce; 3542 mutex_exit(&ncec->ncec_lock); 3543 nce_refhold(nce); 3544 return (1); 3545 } 3546 mutex_exit(&ncec->ncec_lock); 3547 return (0); 3548 } 3549 3550 /* 3551 * Update all NCE's that are not in fastpath mode and 3552 * have an nce_fp_mp that matches mp. mp->b_cont contains 3553 * the fastpath header. 3554 * 3555 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3556 */ 3557 void 3558 nce_fastpath_update(ill_t *ill, mblk_t *mp) 3559 { 3560 nce_fp_match_t nce_fp_marg; 3561 nce_t *nce; 3562 mblk_t *nce_fp_mp, *fp_mp; 3563 3564 nce_fp_marg.nce_fp_match_res = NULL; 3565 nce_fp_marg.nce_fp_match_ack_mp = mp; 3566 3567 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); 3568 3569 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) 3570 return; 3571 3572 mutex_enter(&nce->nce_lock); 3573 nce_fp_mp = nce->nce_fp_mp; 3574 3575 if (nce_fp_mp != NULL) { 3576 fp_mp = mp->b_cont; 3577 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > 3578 nce_fp_mp->b_datap->db_lim) { 3579 mutex_exit(&nce->nce_lock); 3580 nce = nce_delete_then_add(nce); 3581 if (nce == NULL) { 3582 return; 3583 } 3584 mutex_enter(&nce->nce_lock); 3585 nce_fp_mp = nce->nce_fp_mp; 3586 } 3587 } 3588 3589 /* Matched - install mp as the fastpath mp */ 3590 if (nce_fp_mp == NULL) { 3591 fp_mp = dupb(mp->b_cont); 3592 nce->nce_fp_mp = fp_mp; 3593 } else { 3594 fp_mp = mp->b_cont; 3595 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); 3596 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr 3597 + MBLKL(fp_mp); 3598 } 3599 mutex_exit(&nce->nce_lock); 3600 nce_refrele(nce); 3601 } 3602 3603 /* 3604 * Return a pointer to a given option in the packet. 3605 * Assumes that option part of the packet have already been validated. 3606 */ 3607 nd_opt_hdr_t * 3608 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) 3609 { 3610 while (optlen > 0) { 3611 if (opt->nd_opt_type == opt_type) 3612 return (opt); 3613 optlen -= 8 * opt->nd_opt_len; 3614 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3615 } 3616 return (NULL); 3617 } 3618 3619 /* 3620 * Verify all option lengths present are > 0, also check to see 3621 * if the option lengths and packet length are consistent. 3622 */ 3623 boolean_t 3624 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) 3625 { 3626 ASSERT(opt != NULL); 3627 while (optlen > 0) { 3628 if (opt->nd_opt_len == 0) 3629 return (B_FALSE); 3630 optlen -= 8 * opt->nd_opt_len; 3631 if (optlen < 0) 3632 return (B_FALSE); 3633 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3634 } 3635 return (B_TRUE); 3636 } 3637 3638 /* 3639 * ncec_walk function. 3640 * Free a fraction of the NCE cache entries. 3641 * 3642 * A possible optimization here would be to use ncec_last where possible, and 3643 * delete the least-frequently used entry, which would require more complex 3644 * computation as we walk through the ncec's (e.g., track ncec entries by 3645 * order of ncec_last and/or maintain state) 3646 */ 3647 static void 3648 ncec_cache_reclaim(ncec_t *ncec, void *arg) 3649 { 3650 ip_stack_t *ipst = ncec->ncec_ipst; 3651 uint_t fraction = *(uint_t *)arg; 3652 uint_t rand; 3653 3654 if ((ncec->ncec_flags & 3655 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { 3656 return; 3657 } 3658 3659 rand = (uint_t)ddi_get_lbolt() + 3660 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); 3661 if ((rand/fraction)*fraction == rand) { 3662 IP_STAT(ipst, ip_nce_reclaim_deleted); 3663 ncec_delete(ncec); 3664 } 3665 } 3666 3667 /* 3668 * kmem_cache callback to free up memory. 3669 * 3670 * For now we just delete a fixed fraction. 3671 */ 3672 static void 3673 ip_nce_reclaim_stack(ip_stack_t *ipst) 3674 { 3675 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction; 3676 3677 IP_STAT(ipst, ip_nce_reclaim_calls); 3678 3679 ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst); 3680 3681 /* 3682 * Walk all CONNs that can have a reference on an ire, ncec or dce. 3683 * Get them to update any stale references to drop any refholds they 3684 * have. 3685 */ 3686 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 3687 } 3688 3689 /* 3690 * Called by the memory allocator subsystem directly, when the system 3691 * is running low on memory. 3692 */ 3693 /* ARGSUSED */ 3694 void 3695 ip_nce_reclaim(void *args) 3696 { 3697 netstack_handle_t nh; 3698 netstack_t *ns; 3699 ip_stack_t *ipst; 3700 3701 netstack_next_init(&nh); 3702 while ((ns = netstack_next(&nh)) != NULL) { 3703 /* 3704 * netstack_next() can return a netstack_t with a NULL 3705 * netstack_ip at boot time. 3706 */ 3707 if ((ipst = ns->netstack_ip) == NULL) { 3708 netstack_rele(ns); 3709 continue; 3710 } 3711 ip_nce_reclaim_stack(ipst); 3712 netstack_rele(ns); 3713 } 3714 netstack_next_fini(&nh); 3715 } 3716 3717 #ifdef DEBUG 3718 void 3719 ncec_trace_ref(ncec_t *ncec) 3720 { 3721 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3722 3723 if (ncec->ncec_trace_disable) 3724 return; 3725 3726 if (!th_trace_ref(ncec, ncec->ncec_ipst)) { 3727 ncec->ncec_trace_disable = B_TRUE; 3728 ncec_trace_cleanup(ncec); 3729 } 3730 } 3731 3732 void 3733 ncec_untrace_ref(ncec_t *ncec) 3734 { 3735 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3736 3737 if (!ncec->ncec_trace_disable) 3738 th_trace_unref(ncec); 3739 } 3740 3741 static void 3742 ncec_trace_cleanup(const ncec_t *ncec) 3743 { 3744 th_trace_cleanup(ncec, ncec->ncec_trace_disable); 3745 } 3746 #endif 3747 3748 /* 3749 * Called when address resolution fails due to a timeout. 3750 * Send an ICMP unreachable in response to all queued packets. 3751 */ 3752 void 3753 arp_resolv_failed(ncec_t *ncec) 3754 { 3755 mblk_t *mp, *nxt_mp; 3756 char buf[INET6_ADDRSTRLEN]; 3757 struct in_addr ipv4addr; 3758 ill_t *ill = ncec->ncec_ill; 3759 ip_stack_t *ipst = ncec->ncec_ipst; 3760 ip_recv_attr_t iras; 3761 3762 bzero(&iras, sizeof (iras)); 3763 iras.ira_flags = IRAF_IS_IPV4; 3764 /* 3765 * we are setting the ira_rill to the ipmp_ill (instead of 3766 * the actual ill on which the packet was received), but this 3767 * is ok because we don't actually need the real ira_rill. 3768 * to send the icmp unreachable to the sender. 3769 */ 3770 iras.ira_ill = iras.ira_rill = ill; 3771 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3772 iras.ira_rifindex = iras.ira_ruifindex; 3773 3774 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr); 3775 ip3dbg(("arp_resolv_failed: dst %s\n", 3776 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); 3777 mutex_enter(&ncec->ncec_lock); 3778 mp = ncec->ncec_qd_mp; 3779 ncec->ncec_qd_mp = NULL; 3780 ncec->ncec_nprobes = 0; 3781 mutex_exit(&ncec->ncec_lock); 3782 while (mp != NULL) { 3783 nxt_mp = mp->b_next; 3784 mp->b_next = NULL; 3785 3786 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3787 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 3788 mp, ill); 3789 if (ipst->ips_ip_arp_icmp_error) { 3790 ip3dbg(("arp_resolv_failed: " 3791 "Calling icmp_unreachable\n")); 3792 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 3793 } else { 3794 freemsg(mp); 3795 } 3796 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3797 mp = nxt_mp; 3798 } 3799 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 3800 } 3801 3802 /* 3803 * if ill is an under_ill, translate it to the ipmp_ill and add the 3804 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and 3805 * one on the underlying in_ill) will be created for the 3806 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill. 3807 */ 3808 int 3809 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3810 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3811 { 3812 int err; 3813 in6_addr_t addr6; 3814 ip_stack_t *ipst = ill->ill_ipst; 3815 nce_t *nce, *upper_nce = NULL; 3816 ill_t *in_ill = ill, *under = NULL; 3817 boolean_t need_ill_refrele = B_FALSE; 3818 3819 if (flags & NCE_F_MCAST) { 3820 /* 3821 * hw_addr will be figured out in nce_set_multicast_v4; 3822 * caller needs to pass in the cast_ill for ipmp 3823 */ 3824 ASSERT(hw_addr == NULL); 3825 ASSERT(!IS_IPMP(ill)); 3826 err = nce_set_multicast_v4(ill, addr, flags, newnce); 3827 return (err); 3828 } 3829 3830 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 3831 ill = ipmp_ill_hold_ipmp_ill(ill); 3832 if (ill == NULL) 3833 return (ENXIO); 3834 need_ill_refrele = B_TRUE; 3835 } 3836 if ((flags & NCE_F_BCAST) != 0) { 3837 /* 3838 * IPv4 broadcast ncec: compute the hwaddr. 3839 */ 3840 if (IS_IPMP(ill)) { 3841 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE); 3842 if (under == NULL) { 3843 if (need_ill_refrele) 3844 ill_refrele(ill); 3845 return (ENETDOWN); 3846 } 3847 hw_addr = under->ill_bcast_mp->b_rptr + 3848 NCE_LL_ADDR_OFFSET(under); 3849 hw_addr_len = under->ill_phys_addr_length; 3850 } else { 3851 hw_addr = ill->ill_bcast_mp->b_rptr + 3852 NCE_LL_ADDR_OFFSET(ill), 3853 hw_addr_len = ill->ill_phys_addr_length; 3854 } 3855 } 3856 3857 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3858 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3859 nce = nce_lookup_addr(ill, &addr6); 3860 if (nce == NULL) { 3861 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, 3862 state, &nce); 3863 } else { 3864 err = EEXIST; 3865 } 3866 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3867 if (err == 0) 3868 err = nce_add_v4_postprocess(nce); 3869 3870 if (in_ill != ill && nce != NULL) { 3871 nce_t *under_nce = NULL; 3872 3873 /* 3874 * in_ill was the under_ill. Try to create the under_nce. 3875 * Hold the ill_g_lock to prevent changes to group membership 3876 * until we are done. 3877 */ 3878 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3879 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 3880 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 3881 ill_t *, ill); 3882 rw_exit(&ipst->ips_ill_g_lock); 3883 err = ENXIO; 3884 nce_refrele(nce); 3885 nce = NULL; 3886 goto bail; 3887 } 3888 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 3889 if (under_nce == NULL) { 3890 rw_exit(&ipst->ips_ill_g_lock); 3891 err = EINVAL; 3892 nce_refrele(nce); 3893 nce = NULL; 3894 goto bail; 3895 } 3896 rw_exit(&ipst->ips_ill_g_lock); 3897 upper_nce = nce; 3898 nce = under_nce; /* will be returned to caller */ 3899 if (NCE_ISREACHABLE(nce->nce_common)) 3900 nce_fastpath_trigger(under_nce); 3901 } 3902 if (nce != NULL) { 3903 if (newnce != NULL) 3904 *newnce = nce; 3905 else 3906 nce_refrele(nce); 3907 } 3908 bail: 3909 if (under != NULL) 3910 ill_refrele(under); 3911 if (upper_nce != NULL) 3912 nce_refrele(upper_nce); 3913 if (need_ill_refrele) 3914 ill_refrele(ill); 3915 3916 return (err); 3917 } 3918 3919 /* 3920 * NDP Cache Entry creation routine for IPv4. 3921 * This routine must always be called with ndp4->ndp_g_lock held. 3922 * Prior to return, ncec_refcnt is incremented. 3923 * 3924 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 3925 * are always added pointing at the ipmp_ill. Thus, when the ill passed 3926 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 3927 * entries will be created, both pointing at the same ncec_t. The nce_t 3928 * entries will have their nce_ill set to the ipmp_ill and the under_ill 3929 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 3930 * Local addresses are always created on the ill passed to nce_add_v4. 3931 */ 3932 int 3933 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3934 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3935 { 3936 int err; 3937 boolean_t is_multicast = (flags & NCE_F_MCAST); 3938 struct in6_addr addr6; 3939 nce_t *nce; 3940 3941 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 3942 ASSERT(!ill->ill_isv6); 3943 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); 3944 3945 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3946 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, 3947 &nce); 3948 ASSERT(newnce != NULL); 3949 *newnce = nce; 3950 return (err); 3951 } 3952 3953 /* 3954 * Post-processing routine to be executed after nce_add_v4(). This function 3955 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 3956 * and must be called without any locks held. 3957 * 3958 * Always returns 0, but we return an int to keep this symmetric with the 3959 * IPv6 counter-part. 3960 */ 3961 int 3962 nce_add_v4_postprocess(nce_t *nce) 3963 { 3964 ncec_t *ncec = nce->nce_common; 3965 uint16_t flags = ncec->ncec_flags; 3966 boolean_t ndp_need_dad = B_FALSE; 3967 boolean_t dropped; 3968 clock_t delay; 3969 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; 3970 uchar_t *hw_addr = ncec->ncec_lladdr; 3971 boolean_t trigger_fastpath = B_TRUE; 3972 3973 /* 3974 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 3975 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 3976 * We call nce_fastpath from nce_update if the link layer address of 3977 * the peer changes from nce_update 3978 */ 3979 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && 3980 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) 3981 trigger_fastpath = B_FALSE; 3982 3983 if (trigger_fastpath) 3984 nce_fastpath_trigger(nce); 3985 3986 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 3987 /* 3988 * Either the caller (by passing in ND_PROBE) 3989 * or nce_add_common() (by the internally computed state 3990 * based on ncec_addr and ill_net_type) has determined 3991 * that this unicast entry needs DAD. Trigger DAD. 3992 */ 3993 ndp_need_dad = B_TRUE; 3994 } else if (flags & NCE_F_UNSOL_ADV) { 3995 /* 3996 * We account for the transmit below by assigning one 3997 * less than the ndd variable. Subsequent decrements 3998 * are done in nce_timer. 3999 */ 4000 mutex_enter(&ncec->ncec_lock); 4001 ncec->ncec_unsolicit_count = 4002 ipst->ips_ip_arp_publish_count - 1; 4003 mutex_exit(&ncec->ncec_lock); 4004 dropped = arp_announce(ncec); 4005 mutex_enter(&ncec->ncec_lock); 4006 if (dropped) 4007 ncec->ncec_unsolicit_count++; 4008 else 4009 ncec->ncec_last_time_defended = ddi_get_lbolt(); 4010 if (ncec->ncec_unsolicit_count != 0) { 4011 nce_start_timer(ncec, 4012 ipst->ips_ip_arp_publish_interval); 4013 } 4014 mutex_exit(&ncec->ncec_lock); 4015 } 4016 4017 /* 4018 * If ncec_xmit_interval is 0, user has configured us to send the first 4019 * probe right away. Do so, and set up for the subsequent probes. 4020 */ 4021 if (ndp_need_dad) { 4022 mutex_enter(&ncec->ncec_lock); 4023 if (ncec->ncec_pcnt == 0) { 4024 /* 4025 * DAD probes and announce can be 4026 * administratively disabled by setting the 4027 * probe_count to zero. Restart the timer in 4028 * this case to mark the ipif as ready. 4029 */ 4030 ncec->ncec_unsolicit_count = 0; 4031 mutex_exit(&ncec->ncec_lock); 4032 nce_restart_timer(ncec, 0); 4033 } else { 4034 mutex_exit(&ncec->ncec_lock); 4035 delay = ((ncec->ncec_flags & NCE_F_FAST) ? 4036 ipst->ips_arp_probe_delay : 4037 ipst->ips_arp_fastprobe_delay); 4038 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); 4039 } 4040 } 4041 return (0); 4042 } 4043 4044 /* 4045 * ncec_walk routine to update all entries that have a given destination or 4046 * gateway address and cached link layer (MAC) address. This is used when ARP 4047 * informs us that a network-to-link-layer mapping may have changed. 4048 */ 4049 void 4050 nce_update_hw_changed(ncec_t *ncec, void *arg) 4051 { 4052 nce_hw_map_t *hwm = arg; 4053 ipaddr_t ncec_addr; 4054 4055 if (ncec->ncec_state != ND_REACHABLE) 4056 return; 4057 4058 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 4059 if (ncec_addr != hwm->hwm_addr) 4060 return; 4061 4062 mutex_enter(&ncec->ncec_lock); 4063 if (hwm->hwm_flags != 0) 4064 ncec->ncec_flags = hwm->hwm_flags; 4065 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); 4066 mutex_exit(&ncec->ncec_lock); 4067 } 4068 4069 void 4070 ncec_refhold(ncec_t *ncec) 4071 { 4072 mutex_enter(&(ncec)->ncec_lock); 4073 (ncec)->ncec_refcnt++; 4074 ASSERT((ncec)->ncec_refcnt != 0); 4075 #ifdef DEBUG 4076 ncec_trace_ref(ncec); 4077 #endif 4078 mutex_exit(&(ncec)->ncec_lock); 4079 } 4080 4081 void 4082 ncec_refhold_notr(ncec_t *ncec) 4083 { 4084 mutex_enter(&(ncec)->ncec_lock); 4085 (ncec)->ncec_refcnt++; 4086 ASSERT((ncec)->ncec_refcnt != 0); 4087 mutex_exit(&(ncec)->ncec_lock); 4088 } 4089 4090 static void 4091 ncec_refhold_locked(ncec_t *ncec) 4092 { 4093 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); 4094 (ncec)->ncec_refcnt++; 4095 #ifdef DEBUG 4096 ncec_trace_ref(ncec); 4097 #endif 4098 } 4099 4100 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */ 4101 void 4102 ncec_refrele(ncec_t *ncec) 4103 { 4104 mutex_enter(&(ncec)->ncec_lock); 4105 #ifdef DEBUG 4106 ncec_untrace_ref(ncec); 4107 #endif 4108 ASSERT((ncec)->ncec_refcnt != 0); 4109 if (--(ncec)->ncec_refcnt == 0) { 4110 ncec_inactive(ncec); 4111 } else { 4112 mutex_exit(&(ncec)->ncec_lock); 4113 } 4114 } 4115 4116 void 4117 ncec_refrele_notr(ncec_t *ncec) 4118 { 4119 mutex_enter(&(ncec)->ncec_lock); 4120 ASSERT((ncec)->ncec_refcnt != 0); 4121 if (--(ncec)->ncec_refcnt == 0) { 4122 ncec_inactive(ncec); 4123 } else { 4124 mutex_exit(&(ncec)->ncec_lock); 4125 } 4126 } 4127 4128 /* 4129 * Common to IPv4 and IPv6. 4130 */ 4131 void 4132 nce_restart_timer(ncec_t *ncec, uint_t ms) 4133 { 4134 timeout_id_t tid; 4135 4136 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock)); 4137 4138 /* First cancel any running timer */ 4139 mutex_enter(&ncec->ncec_lock); 4140 tid = ncec->ncec_timeout_id; 4141 ncec->ncec_timeout_id = 0; 4142 if (tid != 0) { 4143 mutex_exit(&ncec->ncec_lock); 4144 (void) untimeout(tid); 4145 mutex_enter(&ncec->ncec_lock); 4146 } 4147 4148 /* Restart timer */ 4149 nce_start_timer(ncec, ms); 4150 mutex_exit(&ncec->ncec_lock); 4151 } 4152 4153 static void 4154 nce_start_timer(ncec_t *ncec, uint_t ms) 4155 { 4156 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 4157 /* 4158 * Don't start the timer if the ncec has been deleted, or if the timer 4159 * is already running 4160 */ 4161 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) { 4162 ncec->ncec_timeout_id = timeout(nce_timer, ncec, 4163 MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms)); 4164 } 4165 } 4166 4167 int 4168 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 4169 uint16_t flags, nce_t **newnce) 4170 { 4171 uchar_t *hw_addr; 4172 int err = 0; 4173 ip_stack_t *ipst = ill->ill_ipst; 4174 in6_addr_t dst6; 4175 nce_t *nce; 4176 4177 ASSERT(!ill->ill_isv6); 4178 4179 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6); 4180 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 4181 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) { 4182 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 4183 goto done; 4184 } 4185 if (ill->ill_net_type == IRE_IF_RESOLVER) { 4186 /* 4187 * For IRE_IF_RESOLVER a hardware mapping can be 4188 * generated, for IRE_IF_NORESOLVER, resolution cookie 4189 * in the ill is copied in nce_add_v4(). 4190 */ 4191 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP); 4192 if (hw_addr == NULL) { 4193 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 4194 return (ENOMEM); 4195 } 4196 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 4197 } else { 4198 /* 4199 * IRE_IF_NORESOLVER type simply copies the resolution 4200 * cookie passed in. So no hw_addr is needed. 4201 */ 4202 hw_addr = NULL; 4203 } 4204 ASSERT(flags & NCE_F_MCAST); 4205 ASSERT(flags & NCE_F_NONUD); 4206 /* nce_state will be computed by nce_add_common() */ 4207 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 4208 ND_UNCHANGED, &nce); 4209 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 4210 if (err == 0) 4211 err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM; 4212 if (hw_addr != NULL) 4213 kmem_free(hw_addr, ill->ill_phys_addr_length); 4214 if (err != 0) { 4215 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); 4216 return (err); 4217 } 4218 done: 4219 if (newnce != NULL) 4220 *newnce = nce; 4221 else 4222 nce_refrele(nce); 4223 return (0); 4224 } 4225 4226 /* 4227 * This is used when scanning for "old" (least recently broadcast) NCEs. We 4228 * don't want to have to walk the list for every single one, so we gather up 4229 * batches at a time. 4230 */ 4231 #define NCE_RESCHED_LIST_LEN 8 4232 4233 typedef struct { 4234 ill_t *ncert_ill; 4235 uint_t ncert_num; 4236 ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN]; 4237 } nce_resched_t; 4238 4239 /* 4240 * Pick the longest waiting NCEs for defense. 4241 */ 4242 /* ARGSUSED */ 4243 static int 4244 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg) 4245 { 4246 nce_resched_t *ncert = arg; 4247 ncec_t **ncecs; 4248 ncec_t **ncec_max; 4249 ncec_t *ncec_temp; 4250 ncec_t *ncec = nce->nce_common; 4251 4252 ASSERT(ncec->ncec_ill == ncert->ncert_ill); 4253 /* 4254 * Only reachable entries that are ready for announcement are eligible. 4255 */ 4256 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE) 4257 return (0); 4258 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) { 4259 ncec_refhold(ncec); 4260 ncert->ncert_nces[ncert->ncert_num++] = ncec; 4261 } else { 4262 ncecs = ncert->ncert_nces; 4263 ncec_max = ncecs + NCE_RESCHED_LIST_LEN; 4264 ncec_refhold(ncec); 4265 for (; ncecs < ncec_max; ncecs++) { 4266 ASSERT(ncec != NULL); 4267 if ((*ncecs)->ncec_last_time_defended > 4268 ncec->ncec_last_time_defended) { 4269 ncec_temp = *ncecs; 4270 *ncecs = ncec; 4271 ncec = ncec_temp; 4272 } 4273 } 4274 ncec_refrele(ncec); 4275 } 4276 return (0); 4277 } 4278 4279 /* 4280 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this 4281 * doesn't happen very often (if at all), and thus it needn't be highly 4282 * optimized. (Note, though, that it's actually O(N) complexity, because the 4283 * outer loop is bounded by a constant rather than by the length of the list.) 4284 */ 4285 static void 4286 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert) 4287 { 4288 ncec_t *ncec; 4289 ip_stack_t *ipst = ill->ill_ipst; 4290 uint_t i, defend_rate; 4291 4292 i = ill->ill_defend_count; 4293 ill->ill_defend_count = 0; 4294 if (ill->ill_isv6) 4295 defend_rate = ipst->ips_ndp_defend_rate; 4296 else 4297 defend_rate = ipst->ips_arp_defend_rate; 4298 /* If none could be sitting around, then don't reschedule */ 4299 if (i < defend_rate) { 4300 DTRACE_PROBE1(reschedule_none, ill_t *, ill); 4301 return; 4302 } 4303 ncert->ncert_ill = ill; 4304 while (ill->ill_defend_count < defend_rate) { 4305 nce_walk_common(ill, ncec_reschedule, ncert); 4306 for (i = 0; i < ncert->ncert_num; i++) { 4307 4308 ncec = ncert->ncert_nces[i]; 4309 mutex_enter(&ncec->ncec_lock); 4310 ncec->ncec_flags |= NCE_F_DELAYED; 4311 mutex_exit(&ncec->ncec_lock); 4312 /* 4313 * we plan to schedule this ncec, so incr the 4314 * defend_count in anticipation. 4315 */ 4316 if (++ill->ill_defend_count >= defend_rate) 4317 break; 4318 } 4319 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) 4320 break; 4321 } 4322 } 4323 4324 /* 4325 * Check if the current rate-limiting parameters permit the sending 4326 * of another address defense announcement for both IPv4 and IPv6. 4327 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not 4328 * permitted), and B_FALSE otherwise. The `defend_rate' parameter 4329 * determines how many address defense announcements are permitted 4330 * in any `defense_perio' interval. 4331 */ 4332 static boolean_t 4333 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec) 4334 { 4335 clock_t now = ddi_get_lbolt(); 4336 ip_stack_t *ipst = ill->ill_ipst; 4337 clock_t start = ill->ill_defend_start; 4338 uint32_t elapsed, defend_period, defend_rate; 4339 nce_resched_t ncert; 4340 boolean_t ret; 4341 int i; 4342 4343 if (ill->ill_isv6) { 4344 defend_period = ipst->ips_ndp_defend_period; 4345 defend_rate = ipst->ips_ndp_defend_rate; 4346 } else { 4347 defend_period = ipst->ips_arp_defend_period; 4348 defend_rate = ipst->ips_arp_defend_rate; 4349 } 4350 if (defend_rate == 0) 4351 return (B_TRUE); 4352 bzero(&ncert, sizeof (ncert)); 4353 mutex_enter(&ill->ill_lock); 4354 if (start > 0) { 4355 elapsed = now - start; 4356 if (elapsed > SEC_TO_TICK(defend_period)) { 4357 ill->ill_defend_start = now; 4358 /* 4359 * nce_ill_reschedule will attempt to 4360 * prevent starvation by reschduling the 4361 * oldest entries, which are marked with 4362 * the NCE_F_DELAYED flag. 4363 */ 4364 nce_ill_reschedule(ill, &ncert); 4365 } 4366 } else { 4367 ill->ill_defend_start = now; 4368 } 4369 ASSERT(ill->ill_defend_count <= defend_rate); 4370 mutex_enter(&ncec->ncec_lock); 4371 if (ncec->ncec_flags & NCE_F_DELAYED) { 4372 /* 4373 * This ncec was rescheduled as one of the really old 4374 * entries needing on-going defense. The 4375 * ill_defend_count was already incremented in 4376 * nce_ill_reschedule. Go ahead and send the announce. 4377 */ 4378 ncec->ncec_flags &= ~NCE_F_DELAYED; 4379 mutex_exit(&ncec->ncec_lock); 4380 ret = B_FALSE; 4381 goto done; 4382 } 4383 mutex_exit(&ncec->ncec_lock); 4384 if (ill->ill_defend_count < defend_rate) 4385 ill->ill_defend_count++; 4386 if (ill->ill_defend_count == defend_rate) { 4387 /* 4388 * we are no longer allowed to send unbidden defense 4389 * messages. Wait for rescheduling. 4390 */ 4391 ret = B_TRUE; 4392 } else { 4393 ret = B_FALSE; 4394 } 4395 done: 4396 mutex_exit(&ill->ill_lock); 4397 /* 4398 * After all the locks have been dropped we can restart nce timer, 4399 * and refrele the delayed ncecs 4400 */ 4401 for (i = 0; i < ncert.ncert_num; i++) { 4402 clock_t xmit_interval; 4403 ncec_t *tmp; 4404 4405 tmp = ncert.ncert_nces[i]; 4406 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval, 4407 B_FALSE); 4408 nce_restart_timer(tmp, xmit_interval); 4409 ncec_refrele(tmp); 4410 } 4411 return (ret); 4412 } 4413 4414 boolean_t 4415 ndp_announce(ncec_t *ncec) 4416 { 4417 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr, 4418 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast, 4419 nce_advert_flags(ncec))); 4420 } 4421 4422 ill_t * 4423 nce_resolve_src(ncec_t *ncec, in6_addr_t *src) 4424 { 4425 mblk_t *mp; 4426 in6_addr_t src6; 4427 ipaddr_t src4; 4428 ill_t *ill = ncec->ncec_ill; 4429 ill_t *src_ill = NULL; 4430 ipif_t *ipif = NULL; 4431 boolean_t is_myaddr = NCE_MYADDR(ncec); 4432 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 4433 4434 ASSERT(src != NULL); 4435 ASSERT(IN6_IS_ADDR_UNSPECIFIED(src)); 4436 src6 = *src; 4437 if (is_myaddr) { 4438 src6 = ncec->ncec_addr; 4439 if (!isv6) 4440 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4); 4441 } else { 4442 /* 4443 * try to find one from the outgoing packet. 4444 */ 4445 mutex_enter(&ncec->ncec_lock); 4446 mp = ncec->ncec_qd_mp; 4447 if (mp != NULL) { 4448 if (isv6) { 4449 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 4450 4451 src6 = ip6h->ip6_src; 4452 } else { 4453 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4454 4455 src4 = ipha->ipha_src; 4456 IN6_IPADDR_TO_V4MAPPED(src4, &src6); 4457 } 4458 } 4459 mutex_exit(&ncec->ncec_lock); 4460 } 4461 4462 /* 4463 * For outgoing packets, if the src of outgoing packet is one 4464 * of the assigned interface addresses use it, otherwise we 4465 * will pick the source address below. 4466 * For local addresses (is_myaddr) doing DAD, NDP announce 4467 * messages are mcast. So we use the (IPMP) cast_ill or the 4468 * (non-IPMP) ncec_ill for these message types. The only case 4469 * of unicast DAD messages are for IPv6 ND probes, for which 4470 * we find the ipif_bound_ill corresponding to the ncec_addr. 4471 */ 4472 if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) { 4473 if (isv6) { 4474 ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES, 4475 ill->ill_ipst); 4476 } else { 4477 ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES, 4478 ill->ill_ipst); 4479 } 4480 4481 /* 4482 * If no relevant ipif can be found, then it's not one of our 4483 * addresses. Reset to :: and try to find a src for the NS or 4484 * ARP request using ipif_select_source_v[4,6] below. 4485 * If an ipif can be found, but it's not yet done with 4486 * DAD verification, and we are not being invoked for 4487 * DAD (i.e., !is_myaddr), then just postpone this 4488 * transmission until later. 4489 */ 4490 if (ipif == NULL) { 4491 src6 = ipv6_all_zeros; 4492 src4 = INADDR_ANY; 4493 } else if (!ipif->ipif_addr_ready && !is_myaddr) { 4494 DTRACE_PROBE2(nce__resolve__ipif__not__ready, 4495 ncec_t *, ncec, ipif_t *, ipif); 4496 ipif_refrele(ipif); 4497 return (NULL); 4498 } 4499 } 4500 4501 if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) { 4502 /* 4503 * Pick a source address for this solicitation, but 4504 * restrict the selection to addresses assigned to the 4505 * output interface. We do this because the destination will 4506 * create a neighbor cache entry for the source address of 4507 * this packet, so the source address had better be a valid 4508 * neighbor. 4509 */ 4510 if (isv6) { 4511 ipif = ipif_select_source_v6(ill, &ncec->ncec_addr, 4512 B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4513 B_FALSE, NULL); 4514 } else { 4515 ipaddr_t nce_addr; 4516 4517 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr); 4518 ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES, 4519 B_FALSE, NULL); 4520 } 4521 if (ipif == NULL && IS_IPMP(ill)) { 4522 ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE); 4523 4524 if (send_ill != NULL) { 4525 if (isv6) { 4526 ipif = ipif_select_source_v6(send_ill, 4527 &ncec->ncec_addr, B_TRUE, 4528 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4529 B_FALSE, NULL); 4530 } else { 4531 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 4532 src4); 4533 ipif = ipif_select_source_v4(send_ill, 4534 src4, ALL_ZONES, B_TRUE, NULL); 4535 } 4536 ill_refrele(send_ill); 4537 } 4538 } 4539 4540 if (ipif == NULL) { 4541 char buf[INET6_ADDRSTRLEN]; 4542 4543 ip1dbg(("nce_resolve_src: No source ipif for dst %s\n", 4544 inet_ntop((isv6 ? AF_INET6 : AF_INET), 4545 (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 4546 DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec); 4547 return (NULL); 4548 } 4549 src6 = ipif->ipif_v6lcl_addr; 4550 } 4551 *src = src6; 4552 if (ipif != NULL) { 4553 src_ill = ipif->ipif_ill; 4554 if (IS_IPMP(src_ill)) 4555 src_ill = ipmp_ipif_hold_bound_ill(ipif); 4556 else 4557 ill_refhold(src_ill); 4558 ipif_refrele(ipif); 4559 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec, 4560 ill_t *, src_ill); 4561 } 4562 return (src_ill); 4563 } 4564 4565 void 4566 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst, 4567 uchar_t *hwaddr, int hwaddr_len, int flags) 4568 { 4569 ill_t *ill; 4570 ncec_t *ncec; 4571 nce_t *nce; 4572 uint16_t new_state; 4573 4574 ill = (ipif ? ipif->ipif_ill : NULL); 4575 if (ill != NULL) { 4576 /* 4577 * only one ncec is possible 4578 */ 4579 nce = nce_lookup_v4(ill, addr); 4580 if (nce != NULL) { 4581 ncec = nce->nce_common; 4582 mutex_enter(&ncec->ncec_lock); 4583 if (NCE_ISREACHABLE(ncec)) 4584 new_state = ND_UNCHANGED; 4585 else 4586 new_state = ND_STALE; 4587 ncec->ncec_flags = flags; 4588 nce_update(ncec, new_state, hwaddr); 4589 mutex_exit(&ncec->ncec_lock); 4590 nce_refrele(nce); 4591 return; 4592 } 4593 } else { 4594 /* 4595 * ill is wildcard; clean up all ncec's and ire's 4596 * that match on addr. 4597 */ 4598 nce_hw_map_t hwm; 4599 4600 hwm.hwm_addr = *addr; 4601 hwm.hwm_hwlen = hwaddr_len; 4602 hwm.hwm_hwaddr = hwaddr; 4603 hwm.hwm_flags = flags; 4604 4605 ncec_walk_common(ipst->ips_ndp4, NULL, 4606 nce_update_hw_changed, &hwm, B_TRUE); 4607 } 4608 } 4609 4610 /* 4611 * Common function to add ncec entries. 4612 * we always add the ncec with ncec_ill == ill, and always create 4613 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the 4614 * ncec is !reachable. 4615 * 4616 * When the caller passes in an nce_state of ND_UNCHANGED, 4617 * nce_add_common() will determine the state of the created nce based 4618 * on the ill_net_type and nce_flags used. Otherwise, the nce will 4619 * be created with state set to the passed in nce_state. 4620 */ 4621 static int 4622 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 4623 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) 4624 { 4625 static ncec_t nce_nil; 4626 uchar_t *template = NULL; 4627 int err; 4628 ncec_t *ncec; 4629 ncec_t **ncep; 4630 ip_stack_t *ipst = ill->ill_ipst; 4631 uint16_t state; 4632 boolean_t fastprobe = B_FALSE; 4633 struct ndp_g_s *ndp; 4634 nce_t *nce = NULL; 4635 list_t graveyard; 4636 mblk_t *dlur_mp = NULL; 4637 4638 if (ill->ill_isv6) 4639 ndp = ill->ill_ipst->ips_ndp6; 4640 else 4641 ndp = ill->ill_ipst->ips_ndp4; 4642 4643 *retnce = NULL; 4644 4645 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 4646 4647 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 4648 ip0dbg(("nce_add_common: no addr\n")); 4649 return (EINVAL); 4650 } 4651 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 4652 ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); 4653 return (EINVAL); 4654 } 4655 4656 if (ill->ill_isv6) { 4657 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 4658 } else { 4659 ipaddr_t v4addr; 4660 4661 IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 4662 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); 4663 } 4664 4665 /* 4666 * The caller has ensured that there is no nce on ill, but there could 4667 * still be an nce_common_t for the address, so that we find exisiting 4668 * ncec_t strucutures first, and atomically add a new nce_t if 4669 * one is found. The ndp_g_lock ensures that we don't cross threads 4670 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not 4671 * compare for matches across the illgrp because this function is 4672 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, 4673 * with the nce_lookup_then_add_v* passing in the ipmp_ill where 4674 * appropriate. 4675 */ 4676 ncec = *ncep; 4677 for (; ncec != NULL; ncec = ncec->ncec_next) { 4678 if (ncec->ncec_ill == ill) { 4679 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 4680 /* 4681 * We should never find *retnce to be 4682 * MYADDR, since the caller may then 4683 * incorrectly restart a DAD timer that's 4684 * already running. However, if we are in 4685 * forwarding mode, and the interface is 4686 * moving in/out of groups, the data 4687 * path ire lookup (e.g., ire_revalidate_nce) 4688 * may have determined that some destination 4689 * is offlink while the control path is adding 4690 * that address as a local address. 4691 * Recover from this case by failing the 4692 * lookup 4693 */ 4694 if (NCE_MYADDR(ncec)) 4695 return (ENXIO); 4696 *retnce = nce_ill_lookup_then_add(ill, ncec); 4697 if (*retnce != NULL) 4698 break; 4699 } 4700 } 4701 } 4702 if (*retnce != NULL) /* caller must trigger fastpath on nce */ 4703 return (0); 4704 4705 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); 4706 if (ncec == NULL) 4707 return (ENOMEM); 4708 *ncec = nce_nil; 4709 ncec->ncec_ill = ill; 4710 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 4711 ncec->ncec_flags = flags; 4712 ncec->ncec_ipst = ipst; /* No netstack_hold */ 4713 4714 if (!ill->ill_isv6) { 4715 ipaddr_t addr4; 4716 4717 /* 4718 * DAD probe interval and probe count are set based on 4719 * fast/slow probe settings. If the underlying link doesn't 4720 * have reliably up/down notifications or if we're working 4721 * with IPv4 169.254.0.0/16 Link Local Address space, then 4722 * don't use the fast timers. Otherwise, use them. 4723 */ 4724 ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 4725 IN6_V4MAPPED_TO_IPADDR(addr, addr4); 4726 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) { 4727 fastprobe = B_TRUE; 4728 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) && 4729 !IS_IPV4_LL_SPACE(&addr4)) { 4730 ill_t *hwaddr_ill; 4731 4732 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr, 4733 hw_addr_len); 4734 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link) 4735 fastprobe = B_TRUE; 4736 } 4737 if (fastprobe) { 4738 ncec->ncec_xmit_interval = 4739 ipst->ips_arp_fastprobe_interval; 4740 ncec->ncec_pcnt = 4741 ipst->ips_arp_fastprobe_count; 4742 ncec->ncec_flags |= NCE_F_FAST; 4743 } else { 4744 ncec->ncec_xmit_interval = 4745 ipst->ips_arp_probe_interval; 4746 ncec->ncec_pcnt = 4747 ipst->ips_arp_probe_count; 4748 } 4749 if (NCE_PUBLISH(ncec)) { 4750 ncec->ncec_unsolicit_count = 4751 ipst->ips_ip_arp_publish_count; 4752 } 4753 } else { 4754 /* 4755 * probe interval is constant: ILL_PROBE_INTERVAL 4756 * probe count is constant: ND_MAX_UNICAST_SOLICIT 4757 */ 4758 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 4759 if (NCE_PUBLISH(ncec)) { 4760 ncec->ncec_unsolicit_count = 4761 ipst->ips_ip_ndp_unsolicit_count; 4762 } 4763 } 4764 ncec->ncec_rcnt = ill->ill_xmit_count; 4765 ncec->ncec_addr = *addr; 4766 ncec->ncec_qd_mp = NULL; 4767 ncec->ncec_refcnt = 1; /* for ncec getting created */ 4768 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); 4769 ncec->ncec_trace_disable = B_FALSE; 4770 4771 /* 4772 * ncec_lladdr holds link layer address 4773 */ 4774 if (hw_addr_len > 0) { 4775 template = kmem_alloc(hw_addr_len, KM_NOSLEEP); 4776 if (template == NULL) { 4777 err = ENOMEM; 4778 goto err_ret; 4779 } 4780 ncec->ncec_lladdr = template; 4781 ncec->ncec_lladdr_length = hw_addr_len; 4782 bzero(ncec->ncec_lladdr, hw_addr_len); 4783 } 4784 if ((flags & NCE_F_BCAST) != 0) { 4785 state = ND_REACHABLE; 4786 ASSERT(hw_addr_len > 0); 4787 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 4788 state = ND_INITIAL; 4789 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 4790 /* 4791 * NORESOLVER entries are always created in the REACHABLE 4792 * state. 4793 */ 4794 state = ND_REACHABLE; 4795 if (ill->ill_phys_addr_length == IP_ADDR_LEN && 4796 ill->ill_mactype != DL_IPV4 && 4797 ill->ill_mactype != DL_6TO4) { 4798 /* 4799 * We create a nce_res_mp with the IP nexthop address 4800 * as the destination address if the physical length 4801 * is exactly 4 bytes for point-to-multipoint links 4802 * that do their own resolution from IP to link-layer 4803 * address (e.g. IP over X.25). 4804 */ 4805 bcopy((uchar_t *)addr, 4806 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4807 } 4808 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && 4809 ill->ill_mactype != DL_IPV6) { 4810 /* 4811 * We create a nce_res_mp with the IP nexthop address 4812 * as the destination address if the physical legnth 4813 * is exactly 16 bytes for point-to-multipoint links 4814 * that do their own resolution from IP to link-layer 4815 * address. 4816 */ 4817 bcopy((uchar_t *)addr, 4818 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4819 } 4820 /* 4821 * Since NUD is not part of the base IPv4 protocol definition, 4822 * IPv4 neighbor entries on NORESOLVER interfaces will never 4823 * age, and are marked NCE_F_NONUD. 4824 */ 4825 if (!ill->ill_isv6) 4826 ncec->ncec_flags |= NCE_F_NONUD; 4827 } else if (ill->ill_net_type == IRE_LOOPBACK) { 4828 state = ND_REACHABLE; 4829 } 4830 4831 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { 4832 /* 4833 * We are adding an ncec with a deterministic hw_addr, 4834 * so the state can only be one of {REACHABLE, STALE, PROBE}. 4835 * 4836 * if we are adding a unicast ncec for the local address 4837 * it would be REACHABLE; we would be adding a ND_STALE entry 4838 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own 4839 * addresses are added in PROBE to trigger DAD. 4840 */ 4841 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || 4842 ill->ill_net_type == IRE_IF_NORESOLVER) 4843 state = ND_REACHABLE; 4844 else if (!NCE_PUBLISH(ncec)) 4845 state = ND_STALE; 4846 else 4847 state = ND_PROBE; 4848 if (hw_addr != NULL) 4849 nce_set_ll(ncec, hw_addr); 4850 } 4851 /* caller overrides internally computed state */ 4852 if (nce_state != ND_UNCHANGED) 4853 state = nce_state; 4854 4855 if (state == ND_PROBE) 4856 ncec->ncec_flags |= NCE_F_UNVERIFIED; 4857 4858 ncec->ncec_state = state; 4859 4860 if (state == ND_REACHABLE) { 4861 ncec->ncec_last = ncec->ncec_init_time = 4862 TICK_TO_MSEC(ddi_get_lbolt64()); 4863 } else { 4864 ncec->ncec_last = 0; 4865 if (state == ND_INITIAL) 4866 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64()); 4867 } 4868 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), 4869 offsetof(ncec_cb_t, ncec_cb_node)); 4870 /* 4871 * have all the memory allocations out of the way before taking locks 4872 * and adding the nce. 4873 */ 4874 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4875 if (nce == NULL) { 4876 err = ENOMEM; 4877 goto err_ret; 4878 } 4879 if (ncec->ncec_lladdr != NULL || 4880 ill->ill_net_type == IRE_IF_NORESOLVER) { 4881 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4882 ill->ill_phys_addr_length, ill->ill_sap, 4883 ill->ill_sap_length); 4884 if (dlur_mp == NULL) { 4885 err = ENOMEM; 4886 goto err_ret; 4887 } 4888 } 4889 4890 /* 4891 * Atomically ensure that the ill is not CONDEMNED, before 4892 * adding the NCE. 4893 */ 4894 mutex_enter(&ill->ill_lock); 4895 if (ill->ill_state_flags & ILL_CONDEMNED) { 4896 mutex_exit(&ill->ill_lock); 4897 err = EINVAL; 4898 goto err_ret; 4899 } 4900 if (!NCE_MYADDR(ncec) && 4901 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { 4902 mutex_exit(&ill->ill_lock); 4903 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); 4904 err = EINVAL; 4905 goto err_ret; 4906 } 4907 /* 4908 * Acquire the ncec_lock even before adding the ncec to the list 4909 * so that it cannot get deleted after the ncec is added, but 4910 * before we add the nce. 4911 */ 4912 mutex_enter(&ncec->ncec_lock); 4913 if ((ncec->ncec_next = *ncep) != NULL) 4914 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; 4915 *ncep = ncec; 4916 ncec->ncec_ptpn = ncep; 4917 4918 /* Bump up the number of ncec's referencing this ill */ 4919 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4920 (char *), "ncec", (void *), ncec); 4921 ill->ill_ncec_cnt++; 4922 /* 4923 * Since we hold the ncec_lock at this time, the ncec cannot be 4924 * condemned, and we can safely add the nce. 4925 */ 4926 list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node)); 4927 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard); 4928 mutex_exit(&ncec->ncec_lock); 4929 mutex_exit(&ill->ill_lock); 4930 nce_graveyard_free(&graveyard); 4931 4932 /* caller must trigger fastpath on *retnce */ 4933 return (0); 4934 4935 err_ret: 4936 if (ncec != NULL) 4937 kmem_cache_free(ncec_cache, ncec); 4938 if (nce != NULL) 4939 kmem_cache_free(nce_cache, nce); 4940 freemsg(dlur_mp); 4941 if (template != NULL) 4942 kmem_free(template, ill->ill_phys_addr_length); 4943 return (err); 4944 } 4945 4946 /* 4947 * take a ref on the nce 4948 */ 4949 void 4950 nce_refhold(nce_t *nce) 4951 { 4952 mutex_enter(&nce->nce_lock); 4953 nce->nce_refcnt++; 4954 ASSERT((nce)->nce_refcnt != 0); 4955 mutex_exit(&nce->nce_lock); 4956 } 4957 4958 /* 4959 * release a ref on the nce; In general, this 4960 * cannot be called with locks held because nce_inactive 4961 * may result in nce_inactive which will take the ill_lock, 4962 * do ipif_ill_refrele_tail etc. Thus the one exception 4963 * where this can be called with locks held is when the caller 4964 * is certain that the nce_refcnt is sufficient to prevent 4965 * the invocation of nce_inactive. 4966 */ 4967 void 4968 nce_refrele(nce_t *nce) 4969 { 4970 ASSERT((nce)->nce_refcnt != 0); 4971 mutex_enter(&nce->nce_lock); 4972 if (--nce->nce_refcnt == 0) 4973 nce_inactive(nce); /* destroys the mutex */ 4974 else 4975 mutex_exit(&nce->nce_lock); 4976 } 4977 4978 /* 4979 * free the nce after all refs have gone away. 4980 */ 4981 static void 4982 nce_inactive(nce_t *nce) 4983 { 4984 ill_t *ill = nce->nce_ill; 4985 4986 ASSERT(nce->nce_refcnt == 0); 4987 4988 ncec_refrele_notr(nce->nce_common); 4989 nce->nce_common = NULL; 4990 freemsg(nce->nce_fp_mp); 4991 freemsg(nce->nce_dlur_mp); 4992 4993 mutex_enter(&ill->ill_lock); 4994 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 4995 (char *), "nce", (void *), nce); 4996 ill->ill_nce_cnt--; 4997 nce->nce_ill = NULL; 4998 /* 4999 * If the number of ncec's associated with this ill have dropped 5000 * to zero, check whether we need to restart any operation that 5001 * is waiting for this to happen. 5002 */ 5003 if (ILL_DOWN_OK(ill)) { 5004 /* ipif_ill_refrele_tail drops the ill_lock */ 5005 ipif_ill_refrele_tail(ill); 5006 } else { 5007 mutex_exit(&ill->ill_lock); 5008 } 5009 5010 mutex_destroy(&nce->nce_lock); 5011 kmem_cache_free(nce_cache, nce); 5012 } 5013 5014 /* 5015 * Add an nce to the ill_nce list. 5016 * 5017 * Adding multicast NCEs is subject to a per-ill limit. This function returns 5018 * NULL if that's the case, and it may reap a number of multicast nces. 5019 * Callers (and upstack) must be able to cope with NULL returns. 5020 */ 5021 static nce_t * 5022 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp, 5023 list_t *graveyard) 5024 { 5025 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5026 5027 if ((ncec->ncec_flags & NCE_F_MCAST) != 0) { 5028 if (nce_too_many_mcast(ill, graveyard)) { 5029 kmem_cache_free(nce_cache, nce); 5030 return (NULL); 5031 } 5032 ill->ill_mcast_nces++; 5033 } 5034 5035 bzero(nce, sizeof (*nce)); 5036 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); 5037 nce->nce_common = ncec; 5038 nce->nce_addr = ncec->ncec_addr; 5039 nce->nce_ill = ill; 5040 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 5041 (char *), "nce", (void *), nce); 5042 ill->ill_nce_cnt++; 5043 5044 nce->nce_refcnt = 1; /* for the thread */ 5045 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */ 5046 nce->nce_dlur_mp = dlur_mp; 5047 5048 /* add nce to the ill's fastpath list. */ 5049 nce->nce_refcnt++; /* for the list */ 5050 list_insert_head(&ill->ill_nce, nce); 5051 return (nce); 5052 } 5053 5054 static nce_t * 5055 nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard) 5056 { 5057 nce_t *nce; 5058 mblk_t *dlur_mp = NULL; 5059 5060 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5061 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 5062 5063 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 5064 if (nce == NULL) 5065 return (NULL); 5066 if (ncec->ncec_lladdr != NULL || 5067 ill->ill_net_type == IRE_IF_NORESOLVER) { 5068 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 5069 ill->ill_phys_addr_length, ill->ill_sap, 5070 ill->ill_sap_length); 5071 if (dlur_mp == NULL) { 5072 kmem_cache_free(nce_cache, nce); 5073 return (NULL); 5074 } 5075 } 5076 /* 5077 * If nce_add_impl() returns NULL due to on multicast limiting, caller 5078 * will (correctly) assume ENOMEM. 5079 */ 5080 return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard)); 5081 } 5082 5083 /* 5084 * remove the nce from the ill_faspath list 5085 */ 5086 void 5087 nce_delete(nce_t *nce) 5088 { 5089 ill_t *ill = nce->nce_ill; 5090 5091 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5092 5093 mutex_enter(&nce->nce_lock); 5094 if (nce->nce_is_condemned) { 5095 /* 5096 * some other thread has removed this nce from the ill_nce list 5097 */ 5098 mutex_exit(&nce->nce_lock); 5099 return; 5100 } 5101 nce->nce_is_condemned = B_TRUE; 5102 mutex_exit(&nce->nce_lock); 5103 5104 /* Update the count of multicast NCEs. */ 5105 if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST) 5106 ill->ill_mcast_nces--; 5107 5108 list_remove(&ill->ill_nce, nce); 5109 /* 5110 * even though we are holding the ill_lock, it is ok to 5111 * call nce_refrele here because we know that we should have 5112 * at least 2 refs on the nce: one for the thread, and one 5113 * for the list. The refrele below will release the one for 5114 * the list. 5115 */ 5116 nce_refrele(nce); 5117 } 5118 5119 nce_t * 5120 nce_lookup(ill_t *ill, const in6_addr_t *addr) 5121 { 5122 nce_t *nce = NULL; 5123 5124 ASSERT(ill != NULL); 5125 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5126 5127 for (nce = list_head(&ill->ill_nce); nce != NULL; 5128 nce = list_next(&ill->ill_nce, nce)) { 5129 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr)) 5130 break; 5131 } 5132 5133 /* 5134 * if we found the nce on the ill_nce list while holding 5135 * the ill_lock, then it cannot be condemned yet. 5136 */ 5137 if (nce != NULL) { 5138 ASSERT(!nce->nce_is_condemned); 5139 nce_refhold(nce); 5140 } 5141 return (nce); 5142 } 5143 5144 /* 5145 * Walk the ill_nce list on ill. The callback function func() cannot perform 5146 * any destructive actions. 5147 */ 5148 static void 5149 nce_walk_common(ill_t *ill, pfi_t func, void *arg) 5150 { 5151 nce_t *nce = NULL, *nce_next; 5152 5153 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5154 for (nce = list_head(&ill->ill_nce); nce != NULL; ) { 5155 nce_next = list_next(&ill->ill_nce, nce); 5156 if (func(ill, nce, arg) != 0) 5157 break; 5158 nce = nce_next; 5159 } 5160 } 5161 5162 void 5163 nce_walk(ill_t *ill, pfi_t func, void *arg) 5164 { 5165 mutex_enter(&ill->ill_lock); 5166 nce_walk_common(ill, func, arg); 5167 mutex_exit(&ill->ill_lock); 5168 } 5169 5170 void 5171 nce_flush(ill_t *ill, boolean_t flushall) 5172 { 5173 nce_t *nce, *nce_next; 5174 list_t dead; 5175 5176 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); 5177 mutex_enter(&ill->ill_lock); 5178 for (nce = list_head(&ill->ill_nce); nce != NULL; ) { 5179 nce_next = list_next(&ill->ill_nce, nce); 5180 if (!flushall && NCE_PUBLISH(nce->nce_common)) { 5181 nce = nce_next; 5182 continue; 5183 } 5184 /* 5185 * nce_delete requires that the caller should either not 5186 * be holding locks, or should hold a ref to ensure that 5187 * we wont hit ncec_inactive. So take a ref and clean up 5188 * after the list is flushed. 5189 */ 5190 nce_refhold(nce); 5191 nce_delete(nce); 5192 list_insert_tail(&dead, nce); 5193 nce = nce_next; 5194 } 5195 mutex_exit(&ill->ill_lock); 5196 while ((nce = list_head(&dead)) != NULL) { 5197 list_remove(&dead, nce); 5198 nce_refrele(nce); 5199 } 5200 ASSERT(list_is_empty(&dead)); 5201 list_destroy(&dead); 5202 } 5203 5204 /* Return an interval that is anywhere in the [1 .. intv] range */ 5205 static clock_t 5206 nce_fuzz_interval(clock_t intv, boolean_t initial_time) 5207 { 5208 clock_t rnd, frac; 5209 5210 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); 5211 /* Note that clock_t is signed; must chop off bits */ 5212 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; 5213 if (initial_time) { 5214 if (intv <= 0) 5215 intv = 1; 5216 else 5217 intv = (rnd % intv) + 1; 5218 } else { 5219 /* Compute 'frac' as 20% of the configured interval */ 5220 if ((frac = intv / 5) <= 1) 5221 frac = 2; 5222 /* Set intv randomly in the range [intv-frac .. intv+frac] */ 5223 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) 5224 intv = 1; 5225 } 5226 return (intv); 5227 } 5228 5229 void 5230 nce_resolv_ipmp_ok(ncec_t *ncec) 5231 { 5232 mblk_t *mp; 5233 uint_t pkt_len; 5234 iaflags_t ixaflags = IXAF_NO_TRACE; 5235 nce_t *under_nce; 5236 ill_t *ill = ncec->ncec_ill; 5237 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 5238 ipif_t *src_ipif = NULL; 5239 ip_stack_t *ipst = ill->ill_ipst; 5240 ill_t *send_ill; 5241 uint_t nprobes; 5242 5243 ASSERT(IS_IPMP(ill)); 5244 5245 mutex_enter(&ncec->ncec_lock); 5246 nprobes = ncec->ncec_nprobes; 5247 mp = ncec->ncec_qd_mp; 5248 ncec->ncec_qd_mp = NULL; 5249 ncec->ncec_nprobes = 0; 5250 mutex_exit(&ncec->ncec_lock); 5251 5252 while (mp != NULL) { 5253 mblk_t *nxt_mp; 5254 5255 nxt_mp = mp->b_next; 5256 mp->b_next = NULL; 5257 if (isv6) { 5258 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 5259 5260 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 5261 src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src, 5262 ill, ALL_ZONES, ipst); 5263 } else { 5264 ipha_t *ipha = (ipha_t *)mp->b_rptr; 5265 5266 ixaflags |= IXAF_IS_IPV4; 5267 pkt_len = ntohs(ipha->ipha_length); 5268 src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src, 5269 ill, ALL_ZONES, ipst); 5270 } 5271 5272 /* 5273 * find a new nce based on an under_ill. The first IPMP probe 5274 * packet gets queued, so we could still find a src_ipif that 5275 * matches an IPMP test address. 5276 */ 5277 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) { 5278 /* 5279 * if src_ipif is null, this could be either a 5280 * forwarded packet or a probe whose src got deleted. 5281 * We identify the former case by looking for the 5282 * ncec_nprobes: the first ncec_nprobes packets are 5283 * probes; 5284 */ 5285 if (src_ipif == NULL && nprobes > 0) 5286 goto drop_pkt; 5287 5288 /* 5289 * For forwarded packets, we use the ipmp rotor 5290 * to find send_ill. 5291 */ 5292 send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill, 5293 B_TRUE); 5294 } else { 5295 send_ill = src_ipif->ipif_ill; 5296 ill_refhold(send_ill); 5297 } 5298 5299 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp, 5300 (ncec_t *), ncec, (ipif_t *), 5301 src_ipif, (ill_t *), send_ill); 5302 5303 if (send_ill == NULL) { 5304 if (src_ipif != NULL) 5305 ipif_refrele(src_ipif); 5306 goto drop_pkt; 5307 } 5308 /* create an under_nce on send_ill */ 5309 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5310 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill)) 5311 under_nce = nce_fastpath_create(send_ill, ncec); 5312 else 5313 under_nce = NULL; 5314 rw_exit(&ipst->ips_ill_g_lock); 5315 if (under_nce != NULL && NCE_ISREACHABLE(ncec)) 5316 nce_fastpath_trigger(under_nce); 5317 5318 ill_refrele(send_ill); 5319 if (src_ipif != NULL) 5320 ipif_refrele(src_ipif); 5321 5322 if (under_nce != NULL) { 5323 (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0, 5324 ALL_ZONES, 0, NULL); 5325 nce_refrele(under_nce); 5326 if (nprobes > 0) 5327 nprobes--; 5328 mp = nxt_mp; 5329 continue; 5330 } 5331 drop_pkt: 5332 if (isv6) { 5333 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 5334 } else { 5335 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 5336 } 5337 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL); 5338 freemsg(mp); 5339 if (nprobes > 0) 5340 nprobes--; 5341 mp = nxt_mp; 5342 } 5343 ncec_cb_dispatch(ncec); /* complete callbacks */ 5344 } 5345