1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2018, Joyent, Inc. 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/stropts.h> 32 #include <sys/strsun.h> 33 #include <sys/sysmacros.h> 34 #include <sys/errno.h> 35 #include <sys/dlpi.h> 36 #include <sys/socket.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/cmn_err.h> 40 #include <sys/debug.h> 41 #include <sys/vtrace.h> 42 #include <sys/kmem.h> 43 #include <sys/zone.h> 44 #include <sys/ethernet.h> 45 #include <sys/sdt.h> 46 #include <sys/mac.h> 47 48 #include <net/if.h> 49 #include <net/if_types.h> 50 #include <net/if_dl.h> 51 #include <net/route.h> 52 #include <netinet/in.h> 53 #include <netinet/ip6.h> 54 #include <netinet/icmp6.h> 55 56 #include <inet/common.h> 57 #include <inet/mi.h> 58 #include <inet/mib2.h> 59 #include <inet/nd.h> 60 #include <inet/ip.h> 61 #include <inet/ip_impl.h> 62 #include <inet/ipclassifier.h> 63 #include <inet/ip_if.h> 64 #include <inet/ip_ire.h> 65 #include <inet/ip_rts.h> 66 #include <inet/ip6.h> 67 #include <inet/ip_ndp.h> 68 
#include <inet/sctp_ip.h>
#include <inet/ip_arp.h>
#include <inet/ip2mac_impl.h>

/*
 * Interval between periodic announcements of a local address: the NDP
 * unsolicited-advertisement interval for IPv6, the ARP publish interval
 * for IPv4. Both are per-stack tunables; `ipst' must be in scope at the
 * point of use.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	(isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/*
 * Interval between defensive transmissions for a published address,
 * selected from the per-stack NDP/ARP defend-interval tunables.
 */
#define	DEFENSE_INTERVAL(isv6) \
	(isv6 ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 * (169.254/16 -- first two octets checked literally below.)
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)ptr)[0] == 169 && \
	((uchar_t *)ptr)[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK \
	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)

/*
 * Lock ordering:
 *
 *	ndp_g_lock -> ill_lock -> ncec_lock
 *
 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 * ncec_next. ncec_lock protects the contents of the NCE (particularly
 * ncec_refcnt).
 */

/* Forward declarations of functions private to this file. */
static void nce_cleanup_list(ncec_t *ncec);
static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
    ncec_t *);
static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
    uint16_t ncec_flags, nce_t **newnce);
static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t ncec_flags, nce_t **newnce);
static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
    uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static void ncec_refhold_locked(ncec_t *);
static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
static nce_t *nce_add(ill_t *, ncec_t *);
static void nce_inactive(nce_t *);
extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int nce_add_v6_postprocess(nce_t *);
static int nce_add_v4_postprocess(nce_t *);
static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
static clock_t nce_fuzz_interval(clock_t, boolean_t);
static void nce_resolv_ipmp_ok(ncec_t *);
static void nce_walk_common(ill_t *, pfi_t, void *);
static void nce_start_timer(ncec_t *, uint_t);
static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
static void nce_fastpath_trigger(nce_t *);
static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);

#ifdef DEBUG
static void	ncec_trace_cleanup(const ncec_t *);
#endif

/* Hash bucket for an IPv4 address in the per-stack ndp4 NCE hash table. */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/* Hash bucket for an IPv6 address in the per-stack ndp6 NCE hash table. */
#define	NCE_HASH_PTR_V6(ipst, addr)					\
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr,	\
	    NCE_TABLE_SIZE)]))

extern kmem_cache_t *ncec_cache;
extern kmem_cache_t *nce_cache;

/*
 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 * If src_ill is not null, the ncec_addr is bound to src_ill. The
 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 * IPMP cast_ill (in the IPMP case).
 *
 * Note that the probe interval is based on the src_ill for IPv6, and
 * the ncec_xmit_interval for IPv4.
 */
static void
nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
{
	boolean_t dropped;
	uint32_t probe_interval;

	/* DAD is only meaningful for unicast entries. */
	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
	if (ncec->ncec_ipversion == IPV6_VERSION) {
		/* NS probe with the unspecified source, per DAD. */
		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
		probe_interval = ILL_PROBE_INTERVAL(src_ill);
	} else {
		/* IPv4 DAD delays the initial probe (send_probe false). */
		if (send_probe)
			dropped = arp_probe(ncec);
		else
			dropped = B_TRUE;
		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
		    !send_probe);
	}
	if (!dropped) {
		/* The probe made it out; consume one probe from the count. */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_pcnt--;
		mutex_exit(&ncec->ncec_lock);
	}
	nce_restart_timer(ncec, probe_interval);
}

/*
 * Compute default flags to use for an advertisement of this ncec's address.
 * Returns an int flag mask for ndp_xmit(): NDP_ISROUTER if the entry is a
 * router, NDP_ORIDE (override) unless the address is anycast.
 */
static int
nce_advert_flags(const ncec_t *ncec)
{
	int flag = 0;

	if (ncec->ncec_flags & NCE_F_ISROUTER)
		flag |= NDP_ISROUTER;
	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
		flag |= NDP_ORIDE;

	return (flag);
}

/*
 * NDP Cache Entry creation routine.
 * This routine must always be called with ndp6->ndp_g_lock held.
 * On success (0) the new entry is returned through *newnce.
 */
int
nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int err;
	nce_t *nce;

	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
	ASSERT(ill != NULL && ill->ill_isv6);

	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
	    &nce);
	if (err != 0)
		return (err);
	ASSERT(newnce != NULL);
	*newnce = nce;
	return (err);
}

/*
 * Post-processing routine to be executed after nce_add_v6(). This function
 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 * and must be called without any locks held.
 * Returns 0, or EINPROGRESS when DAD has been started on the entry.
 */
int
nce_add_v6_postprocess(nce_t *nce)
{
	ncec_t *ncec = nce->nce_common;
	boolean_t dropped = B_FALSE;
	uchar_t *hw_addr = ncec->ncec_lladdr;
	uint_t hw_addr_len = ncec->ncec_lladdr_length;
	ill_t *ill = ncec->ncec_ill;
	int err = 0;
	uint16_t flags = ncec->ncec_flags;
	ip_stack_t *ipst = ill->ill_ipst;
	boolean_t trigger_fastpath = B_TRUE;

	/*
	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
	 * We call nce_fastpath from nce_update if the link layer address of
	 * the peer changes from nce_update
	 */
	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
		trigger_fastpath = B_FALSE;

	if (trigger_fastpath)
		nce_fastpath_trigger(nce);
	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
		ill_t *hwaddr_ill;
		/*
		 * Unicast entry that needs DAD. For IPMP, probe on the
		 * underlying ill that owns the hardware address.
		 */
		if (IS_IPMP(ill)) {
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    hw_addr, hw_addr_len);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		err = EINPROGRESS;
	} else if (flags & NCE_F_UNSOL_ADV) {
		/*
		 * We account for the transmit below by assigning one
		 * less than the ndd variable. Subsequent decrements
		 * are done in nce_timer.
		 */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_unsolicit_count =
		    ipst->ips_ip_ndp_unsolicit_count - 1;
		mutex_exit(&ncec->ncec_lock);
		dropped = ndp_xmit(ill,
		    ND_NEIGHBOR_ADVERT,
		    hw_addr,
		    hw_addr_len,
		    &ncec->ncec_addr,	/* Source and target of the adv */
		    &ipv6_all_hosts_mcast, /* Destination of the packet */
		    nce_advert_flags(ncec));
		mutex_enter(&ncec->ncec_lock);
		if (dropped)
			ncec->ncec_unsolicit_count++;
		else
			ncec->ncec_last_time_defended = ddi_get_lbolt();
		if (ncec->ncec_unsolicit_count != 0) {
			nce_start_timer(ncec,
			    ipst->ips_ip_ndp_unsolicit_interval);
		}
		mutex_exit(&ncec->ncec_lock);
	}
	return (err);
}

/*
 * Atomically lookup and add (if needed) Neighbor Cache information for
 * an address.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t.
The nce_t 325 * entries will have their nce_ill set to the ipmp_ill and the under_ill 326 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 327 * Local addresses are always created on the ill passed to nce_add_v6. 328 */ 329 int 330 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 331 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 332 { 333 int err = 0; 334 ip_stack_t *ipst = ill->ill_ipst; 335 nce_t *nce, *upper_nce = NULL; 336 ill_t *in_ill = ill; 337 boolean_t need_ill_refrele = B_FALSE; 338 339 if (flags & NCE_F_MCAST) { 340 /* 341 * hw_addr will be figured out in nce_set_multicast_v6; 342 * caller has to select the cast_ill 343 */ 344 ASSERT(hw_addr == NULL); 345 ASSERT(!IS_IPMP(ill)); 346 err = nce_set_multicast_v6(ill, addr, flags, newnce); 347 return (err); 348 } 349 ASSERT(ill->ill_isv6); 350 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 351 ill = ipmp_ill_hold_ipmp_ill(ill); 352 if (ill == NULL) 353 return (ENXIO); 354 need_ill_refrele = B_TRUE; 355 } 356 357 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 358 nce = nce_lookup_addr(ill, addr); 359 if (nce == NULL) { 360 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, 361 &nce); 362 } else { 363 err = EEXIST; 364 } 365 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 366 if (err == 0) 367 err = nce_add_v6_postprocess(nce); 368 if (in_ill != ill && nce != NULL) { 369 nce_t *under_nce = NULL; 370 371 /* 372 * in_ill was the under_ill. Try to create the under_nce. 373 * Hold the ill_g_lock to prevent changes to group membership 374 * until we are done. 
375 */ 376 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 377 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 378 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 379 ill_t *, ill); 380 rw_exit(&ipst->ips_ill_g_lock); 381 err = ENXIO; 382 nce_refrele(nce); 383 nce = NULL; 384 goto bail; 385 } 386 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 387 if (under_nce == NULL) { 388 rw_exit(&ipst->ips_ill_g_lock); 389 err = EINVAL; 390 nce_refrele(nce); 391 nce = NULL; 392 goto bail; 393 } 394 rw_exit(&ipst->ips_ill_g_lock); 395 upper_nce = nce; 396 nce = under_nce; /* will be returned to caller */ 397 if (NCE_ISREACHABLE(nce->nce_common)) 398 nce_fastpath_trigger(under_nce); 399 } 400 /* nce_refrele is deferred until the lock is dropped */ 401 if (nce != NULL) { 402 if (newnce != NULL) 403 *newnce = nce; 404 else 405 nce_refrele(nce); 406 } 407 bail: 408 if (upper_nce != NULL) 409 nce_refrele(upper_nce); 410 if (need_ill_refrele) 411 ill_refrele(ill); 412 return (err); 413 } 414 415 /* 416 * Remove all the CONDEMNED nces from the appropriate hash table. 417 * We create a private list of NCEs, these may have ires pointing 418 * to them, so the list will be passed through to clean up dependent 419 * ires and only then we can do ncec_refrele() which can make NCE inactive. 420 */ 421 static void 422 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) 423 { 424 ncec_t *ncec1; 425 ncec_t **ptpn; 426 427 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 428 ASSERT(ndp->ndp_g_walker == 0); 429 for (; ncec; ncec = ncec1) { 430 ncec1 = ncec->ncec_next; 431 mutex_enter(&ncec->ncec_lock); 432 if (NCE_ISCONDEMNED(ncec)) { 433 ptpn = ncec->ncec_ptpn; 434 ncec1 = ncec->ncec_next; 435 if (ncec1 != NULL) 436 ncec1->ncec_ptpn = ptpn; 437 *ptpn = ncec1; 438 ncec->ncec_ptpn = NULL; 439 ncec->ncec_next = NULL; 440 ncec->ncec_next = *free_nce_list; 441 *free_nce_list = ncec; 442 } 443 mutex_exit(&ncec->ncec_lock); 444 } 445 } 446 447 /* 448 * 1. Mark the entry CONDEMNED. 
This ensures that no new nce_lookup()
 *    will return this NCE. Also no new timeouts will
 *    be started (See nce_restart_timer).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 */
void
ncec_delete(ncec_t *ncec)
{
	ncec_t **ptpn;
	ncec_t *ncec1;
	int ipversion = ncec->ncec_ipversion;
	ndp_g_t *ndp;
	ip_stack_t *ipst = ncec->ncec_ipst;

	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&ncec->ncec_lock);
	if (NCE_ISCONDEMNED(ncec)) {
		/* Some other thread is doing the delete */
		mutex_exit(&ncec->ncec_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	ncec->ncec_flags |= NCE_F_CONDEMNED;
	mutex_exit(&ncec->ncec_lock);

	/* Count how many condemned ires for kmem_cache callback */
	atomic_inc_32(&ipst->ips_num_nce_condemned);
	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

	/* Complete any waiting callbacks */
	ncec_cb_dispatch(ncec);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (ncec->ncec_timeout_id != 0) {
		(void) untimeout(ncec->ncec_timeout_id);
		ncec->ncec_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (ncec->ncec_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this ncec from
		 * the list after we marked the ncec CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Now remove the ncec from the list. nce_restart_timer won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = ncec->ncec_ptpn;
	ncec1 = ncec->ncec_next;
	if (ncec1 != NULL)
		ncec1->ncec_ptpn = ptpn;
	*ptpn = ncec1;
	ncec->ncec_ptpn = NULL;
	ncec->ncec_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/* Removed from ncec_ptpn/ncec_next list */
	ncec_refrele_notr(ncec);
}

/*
 * Final teardown of an ncec whose last reference has been dropped:
 * frees queued mblks and the lladdr buffer, drops the ill's ncec count,
 * and returns the ncec to its kmem cache. Called with ncec_lock held
 * (the mutex is destroyed here).
 */
void
ncec_inactive(ncec_t *ncec)
{
	mblk_t **mpp;
	ill_t *ill = ncec->ncec_ill;
	ip_stack_t *ipst = ncec->ncec_ipst;

	ASSERT(ncec->ncec_refcnt == 0);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Count how many condemned nces for kmem_cache callback */
	if (NCE_ISCONDEMNED(ncec))
		atomic_add_32(&ipst->ips_num_nce_condemned, -1);

	/* Free all allocated messages */
	mpp = &ncec->ncec_qd_mp;
	while (*mpp != NULL) {
		mblk_t *mp;

		mp = *mpp;
		*mpp = mp->b_next;

		inet_freemsg(mp);
	}
	/*
	 * must have been cleaned up in ncec_delete
	 */
	ASSERT(list_is_empty(&ncec->ncec_cb));
	list_destroy(&ncec->ncec_cb);
	/*
	 * free the ncec_lladdr if one was allocated in nce_add_common()
	 */
	if (ncec->ncec_lladdr_length > 0)
		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);

#ifdef DEBUG
	ncec_trace_cleanup(ncec);
#endif

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "ncec", (void *), ncec);
	ill->ill_ncec_cnt--;
	ncec->ncec_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&ncec->ncec_lock);
	kmem_cache_free(ncec_cache, ncec);
}

/*
 * ncec_walk routine. Delete the ncec if it is associated with the ill
 * that is going away. Always called as a writer.
 */
void
ncec_delete_per_ill(ncec_t *ncec, void *arg)
{
	if ((ncec != NULL) && ncec->ncec_ill == arg) {
		ncec_delete(ncec);
	}
}

/*
 * Neighbor Cache cleanup logic for a list of ncec_t entries
 * (as built by nce_remove()).
 */
static void
nce_cleanup_list(ncec_t *ncec)
{
	ncec_t *ncec_next;

	ASSERT(ncec != NULL);
	while (ncec != NULL) {
		ncec_next = ncec->ncec_next;
		ncec->ncec_next = NULL;

		/*
		 * It is possible for the last ndp walker (this thread)
		 * to come here after ncec_delete has marked the ncec CONDEMNED
		 * and before it has removed the ncec from the fastpath list
		 * or called untimeout. So we need to do it here. It is safe
		 * for both ncec_delete and this thread to do it twice or
		 * even simultaneously since each of the threads has a
		 * reference on the ncec.
		 */
		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
		/*
		 * Cancel any running timer. Timeout can't be restarted
		 * since CONDEMNED is set. The ncec_lock can't be
		 * held across untimeout though passing invalid timeout
		 * id is fine.
		 */
		if (ncec->ncec_timeout_id != 0) {
			(void) untimeout(ncec->ncec_timeout_id);
			ncec->ncec_timeout_id = 0;
		}
		/* Removed from ncec_ptpn/ncec_next list */
		ncec_refrele_notr(ncec);
		ncec = ncec_next;
	}
}

/*
 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
651 */ 652 boolean_t 653 nce_restart_dad(ncec_t *ncec) 654 { 655 boolean_t started; 656 ill_t *ill, *hwaddr_ill; 657 658 if (ncec == NULL) 659 return (B_FALSE); 660 ill = ncec->ncec_ill; 661 mutex_enter(&ncec->ncec_lock); 662 if (ncec->ncec_state == ND_PROBE) { 663 mutex_exit(&ncec->ncec_lock); 664 started = B_TRUE; 665 } else if (ncec->ncec_state == ND_REACHABLE) { 666 ASSERT(ncec->ncec_lladdr != NULL); 667 ncec->ncec_state = ND_PROBE; 668 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 669 /* 670 * Slight cheat here: we don't use the initial probe delay 671 * for IPv4 in this obscure case. 672 */ 673 mutex_exit(&ncec->ncec_lock); 674 if (IS_IPMP(ill)) { 675 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 676 ncec->ncec_lladdr, ncec->ncec_lladdr_length); 677 } else { 678 hwaddr_ill = ill; 679 } 680 nce_dad(ncec, hwaddr_ill, B_TRUE); 681 started = B_TRUE; 682 } else { 683 mutex_exit(&ncec->ncec_lock); 684 started = B_FALSE; 685 } 686 return (started); 687 } 688 689 /* 690 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. 691 * If one is found, the refcnt on the ncec will be incremented. 692 */ 693 ncec_t * 694 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) 695 { 696 ncec_t *ncec; 697 ip_stack_t *ipst = ill->ill_ipst; 698 699 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 700 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 701 702 /* Get head of v6 hash table */ 703 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 704 ncec = ncec_lookup_illgrp(ill, addr, ncec); 705 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 706 rw_exit(&ipst->ips_ill_g_lock); 707 return (ncec); 708 } 709 /* 710 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. 711 * If one is found, the refcnt on the ncec will be incremented. 
712 */ 713 ncec_t * 714 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) 715 { 716 ncec_t *ncec = NULL; 717 in6_addr_t addr6; 718 ip_stack_t *ipst = ill->ill_ipst; 719 720 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 721 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 722 723 /* Get head of v4 hash table */ 724 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); 725 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 726 ncec = ncec_lookup_illgrp(ill, &addr6, ncec); 727 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 728 rw_exit(&ipst->ips_ill_g_lock); 729 return (ncec); 730 } 731 732 /* 733 * Cache entry lookup. Try to find an ncec matching the parameters passed. 734 * If an ncec is found, increment the hold count on that ncec. 735 * The caller passes in the start of the appropriate hash table, and must 736 * be holding the appropriate global lock (ndp_g_lock). In addition, since 737 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock 738 * must be held as reader. 739 * 740 * This function always matches across the ipmp group. 741 */ 742 ncec_t * 743 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) 744 { 745 ndp_g_t *ndp; 746 ip_stack_t *ipst = ill->ill_ipst; 747 748 if (ill->ill_isv6) 749 ndp = ipst->ips_ndp6; 750 else 751 ndp = ipst->ips_ndp4; 752 753 ASSERT(ill != NULL); 754 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 755 if (IN6_IS_ADDR_UNSPECIFIED(addr)) 756 return (NULL); 757 for (; ncec != NULL; ncec = ncec->ncec_next) { 758 if (ncec->ncec_ill == ill || 759 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { 760 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 761 mutex_enter(&ncec->ncec_lock); 762 if (!NCE_ISCONDEMNED(ncec)) { 763 ncec_refhold_locked(ncec); 764 mutex_exit(&ncec->ncec_lock); 765 break; 766 } 767 mutex_exit(&ncec->ncec_lock); 768 } 769 } 770 } 771 return (ncec); 772 } 773 774 /* 775 * Find an nce_t on ill with nce_addr == addr. 
Lookup the nce_t
 * entries for ill only, i.e., when ill is part of an ipmp group,
 * nce_lookup_v4 will never try to match across the group.
 */
nce_t *
nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
{
	nce_t *nce;
	in6_addr_t addr6;
	ip_stack_t *ipst = ill->ill_ipst;

	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	/* v4 addresses are stored v4-mapped in the cache */
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	nce = nce_lookup_addr(ill, &addr6);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	return (nce);
}

/*
 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
 * entries for ill only, i.e., when ill is part of an ipmp group,
 * nce_lookup_v6 will never try to match across the group.
 */
nce_t *
nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
{
	nce_t *nce;
	ip_stack_t *ipst = ill->ill_ipst;

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr6);
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	return (nce);
}

/*
 * Common helper for the v4/v6 lookups above: performs the per-ill
 * nce_lookup() under ill_lock. The appropriate ndp_g_lock must already
 * be held by the caller (asserted under DEBUG).
 */
static nce_t *
nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
{
	nce_t *nce;

	ASSERT(ill != NULL);
#ifdef DEBUG
	if (ill->ill_isv6)
		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
	else
		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
#endif
	mutex_enter(&ill->ill_lock);
	nce = nce_lookup(ill, addr);
	mutex_exit(&ill->ill_lock);
	return (nce);
}


/*
 * Router turned to host. We need to make sure that cached copies of the ncec
 * are not used for forwarding packets if they were derived from the default
 * route, and that the default route itself is removed, as required by
 * section 7.2.5 of RFC 2461.
 *
 * Note that the ncec itself probably has valid link-layer information for the
 * nexthop, so that there is no reason to delete the ncec, as long as the
 * ISROUTER flag is turned off.
 */
static void
ncec_router_to_host(ncec_t *ncec)
{
	ire_t *ire;
	ip_stack_t *ipst = ncec->ncec_ipst;

	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_flags &= ~NCE_F_ISROUTER;
	mutex_exit(&ncec->ncec_lock);

	/* Find the default route whose gateway is this (ex-)router. */
	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
	if (ire != NULL) {
		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
		ire_delete(ire);
		ire_refrele(ire);
	}
}

/*
 * Process passed in parameters either from an incoming packet or via
 * user ioctl.
 */
void
nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
	ill_t *ill = ncec->ncec_ill;
	uint32_t hw_addr_len = ill->ill_phys_addr_length;
	boolean_t ll_updated = B_FALSE;
	boolean_t ll_changed;
	nce_t *nce;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	/*
	 * No updates of link layer address or the neighbor state is
	 * allowed, when the cache is in NONUD state. This still
	 * allows for responding to reachability solicitation.
	 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_INCOMPLETE) {
		if (hw_addr == NULL) {
			mutex_exit(&ncec->ncec_lock);
			return;
		}
		nce_set_ll(ncec, hw_addr);
		/*
		 * Update ncec state and send the queued packets
		 * back to ip this time ire will be added.
		 */
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		nce = nce_fastpath(ncec, B_TRUE, NULL);
		nce_resolv_ok(ncec);
		if (nce != NULL)
			nce_refrele(nce);
		return;
	}
	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
	if (!is_adv) {
		/* If this is a SOLICITATION request only */
		if (ll_changed)
			nce_update(ncec, ND_STALE, hw_addr);
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	}
	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
		/* If in any other state than REACHABLE, ignore */
		if (ncec->ncec_state == ND_REACHABLE) {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	} else {
		if (ll_changed) {
			nce_update(ncec, ND_UNCHANGED, hw_addr);
			ll_updated = B_TRUE;
		}
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			/* Unsolicited advert with a new lladdr: go STALE. */
			if (ll_updated) {
				nce_update(ncec, ND_STALE, NULL);
			}
		}
		mutex_exit(&ncec->ncec_lock);
		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
		    NCE_F_ISROUTER)) {
			ncec_router_to_host(ncec);
		} else {
			ncec_cb_dispatch(ncec);
		}
	}
}

/*
 * Pass arg1 to the cbf supplied, along with each ncec in existence.
 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 * walking the hash list.
 */
void
ncec_walk_common(ndp_g_t *ndp, ill_t *ill, ncec_walk_cb_t cbf,
    void *arg1, boolean_t trace)
{
	ncec_t *ncec;
	ncec_t *ncec1;
	ncec_t **ncep;
	ncec_t *free_nce_list = NULL;

	mutex_enter(&ndp->ndp_g_lock);
	/* Prevent ncec_delete from unlink and free of NCE */
	ndp->ndp_g_walker++;
	mutex_exit(&ndp->ndp_g_lock);
	for (ncep = ndp->nce_hash_tbl;
	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
			/* Save next pointer: cbf may delete this ncec. */
			ncec1 = ncec->ncec_next;
			if (ill == NULL || ncec->ncec_ill == ill) {
				if (trace) {
					ncec_refhold(ncec);
					(*cbf)(ncec, arg1);
					ncec_refrele(ncec);
				} else {
					ncec_refhold_notr(ncec);
					(*cbf)(ncec, arg1);
					ncec_refrele_notr(ncec);
				}
			}
		}
	}
	mutex_enter(&ndp->ndp_g_lock);
	ndp->ndp_g_walker--;
	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
		/* Time to delete condemned entries */
		for (ncep = ndp->nce_hash_tbl;
		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
			ncec = *ncep;
			if (ncec != NULL) {
				nce_remove(ndp, ncec, &free_nce_list);
			}
		}
		ndp->ndp_g_walker_cleanup = B_FALSE;
	}

	mutex_exit(&ndp->ndp_g_lock);

	if (free_nce_list != NULL) {
		nce_cleanup_list(free_nce_list);
	}
}

/*
 * Walk everything.
 * Note that ill can be NULL hence can't derive the ipst from it.
 */
void
ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
{
	ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
	ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
}

/*
 * For each interface an entry is added for the unspecified multicast group.
 * Here that mapping is used to form the multicast cache entry for a particular
 * multicast destination.
 */
static int
nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
	uchar_t *hw_addr;
	int err = 0;
	ip_stack_t *ipst = ill->ill_ipst;
	nce_t *nce;

	ASSERT(ill != NULL);
	ASSERT(ill->ill_isv6);
	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, dst);
	if (nce != NULL) {
		/* Entry already exists; return it (refheld by lookup). */
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		goto done;
	}
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated.
		 */
		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			return (ENOMEM);
		}
		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
	} else {
		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
		hw_addr = NULL;
	}
	ASSERT((flags & NCE_F_MCAST) != 0);
	ASSERT((flags & NCE_F_NONUD) != 0);
	/* nce_state will be computed by nce_add_common() */
	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
	    ND_UNCHANGED, &nce);
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	if (err == 0)
		err = nce_add_v6_postprocess(nce);
	/* nce_add_common copies the lladdr; free our temporary mapping. */
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_nd_lla_len);
	if (err != 0) {
		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
		return (err);
	}
done:
	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
	if (newnce != NULL)
		*newnce = nce;
	else
		nce_refrele(nce);
	return (0);
}

/*
 * Return the link layer address, and any flags of a ncec.
1071 */ 1072 int 1073 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1074 { 1075 ncec_t *ncec; 1076 in6_addr_t *addr; 1077 sin6_t *sin6; 1078 1079 ASSERT(ill != NULL && ill->ill_isv6); 1080 sin6 = (sin6_t *)&lnr->lnr_addr; 1081 addr = &sin6->sin6_addr; 1082 1083 /* 1084 * NOTE: if the ill is an IPMP interface, then match against the whole 1085 * illgrp. This e.g. allows in.ndpd to retrieve the link layer 1086 * addresses for the data addresses on an IPMP interface even though 1087 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. 1088 */ 1089 ncec = ncec_lookup_illgrp_v6(ill, addr); 1090 if (ncec == NULL) 1091 return (ESRCH); 1092 /* If no link layer address is available yet, return ESRCH */ 1093 if (!NCE_ISREACHABLE(ncec)) { 1094 ncec_refrele(ncec); 1095 return (ESRCH); 1096 } 1097 lnr->lnr_hdw_len = ill->ill_phys_addr_length; 1098 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, 1099 lnr->lnr_hdw_len); 1100 if (ncec->ncec_flags & NCE_F_ISROUTER) 1101 lnr->lnr_flags = NDF_ISROUTER_ON; 1102 if (ncec->ncec_flags & NCE_F_ANYCAST) 1103 lnr->lnr_flags |= NDF_ANYCAST_ON; 1104 if (ncec->ncec_flags & NCE_F_STATIC) 1105 lnr->lnr_flags |= NDF_STATIC; 1106 ncec_refrele(ncec); 1107 return (0); 1108 } 1109 1110 /* 1111 * Finish setting up the Enable/Disable multicast for the driver. 
1112 */ 1113 mblk_t * 1114 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1115 uint32_t hw_addr_offset, mblk_t *mp) 1116 { 1117 uchar_t *hw_addr; 1118 ipaddr_t v4group; 1119 uchar_t *addr; 1120 1121 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1122 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1123 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1124 1125 ASSERT(CLASSD(v4group)); 1126 ASSERT(!(ill->ill_isv6)); 1127 1128 addr = (uchar_t *)&v4group; 1129 } else { 1130 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1131 ASSERT(ill->ill_isv6); 1132 1133 addr = (uchar_t *)v6group; 1134 } 1135 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1136 if (hw_addr == NULL) { 1137 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1138 freemsg(mp); 1139 return (NULL); 1140 } 1141 1142 ip_mcast_mapping(ill, addr, hw_addr); 1143 return (mp); 1144 } 1145 1146 void 1147 ip_ndp_resolve(ncec_t *ncec) 1148 { 1149 in_addr_t sender4 = INADDR_ANY; 1150 in6_addr_t sender6 = ipv6_all_zeros; 1151 ill_t *src_ill; 1152 uint32_t ms; 1153 1154 src_ill = nce_resolve_src(ncec, &sender6); 1155 if (src_ill == NULL) { 1156 /* Make sure we try again later */ 1157 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1158 nce_restart_timer(ncec, (clock_t)ms); 1159 return; 1160 } 1161 if (ncec->ncec_ipversion == IPV4_VERSION) 1162 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1163 mutex_enter(&ncec->ncec_lock); 1164 if (ncec->ncec_ipversion == IPV6_VERSION) 1165 ms = ndp_solicit(ncec, sender6, src_ill); 1166 else 1167 ms = arp_request(ncec, sender4, src_ill); 1168 mutex_exit(&ncec->ncec_lock); 1169 if (ms == 0) { 1170 if (ncec->ncec_state != ND_REACHABLE) { 1171 if (ncec->ncec_ipversion == IPV6_VERSION) 1172 ndp_resolv_failed(ncec); 1173 else 1174 arp_resolv_failed(ncec); 1175 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1176 nce_make_unreachable(ncec); 1177 ncec_delete(ncec); 1178 } 1179 } else { 1180 nce_restart_timer(ncec, (clock_t)ms); 1181 } 1182 done: 1183 ill_refrele(src_ill); 1184 } 1185 
/*
 * Send an IPv6 neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
 * The optional source address is used as a hint for
 * which source to use in the packet.
 *
 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
 * the packet.
 */
uint32_t
ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
{
	in6_addr_t	dst;
	boolean_t	dropped = B_FALSE;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Retransmit budget exhausted: tell the caller to abort */
	if (ncec->ncec_rcnt == 0)
		return (0);

	dst = ncec->ncec_addr;
	/* Consume one retransmit credit before transmitting */
	ncec->ncec_rcnt--;
	/* Drop ncec_lock across the transmit, per the contract above */
	mutex_exit(&ncec->ncec_lock);
	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
	    ill->ill_phys_addr_length, &src, &dst, 0);
	mutex_enter(&ncec->ncec_lock);
	/* The packet never went out; give the credit back */
	if (dropped)
		ncec->ncec_rcnt++;
	return (ncec->ncec_ill->ill_reachable_retrans_time);
}

/*
 * Attempt to recover an address on an interface that's been marked as a
 * duplicate.  Because NCEs are destroyed when the interface goes down, there's
 * no easy way to just probe the address and have the right thing happen if
 * it's no longer in use.  Instead, we just bring it up normally and allow the
 * regular interface start-up logic to probe for a remaining duplicate and take
 * us back down if necessary.
 * Neither DHCP nor temporary addresses arrive here; they're excluded by
 * ip_ndp_excl.
 */
/* ARGSUSED */
void
ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	ipif_t	*ipif;
	/* The mblk carries either a v6 or a v4 address; see ipif_do_recovery */
	in6_addr_t	*addr6 = (in6_addr_t *)mp->b_rptr;
	in_addr_t	*addr4 = (in_addr_t *)mp->b_rptr;
	boolean_t	addr_equal;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		/*
		 * We do not support recovery of proxy ARP'd interfaces,
		 * because the system lacks a complete proxy ARP mechanism.
		 */
		if (ill->ill_isv6) {
			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    addr6);
		} else {
			addr_equal = (ipif->ipif_lcl_addr == *addr4);
		}

		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
			continue;

		/*
		 * If we have already recovered or if the interface is going
		 * away, then ignore.
		 */
		mutex_enter(&ill->ill_lock);
		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}

		/* Clear the duplicate mark and attempt to bring it back up */
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ill->ill_ipif_dup_count--;
		mutex_exit(&ill->ill_lock);
		/* Remember it was a duplicate so later failures stay quiet */
		ipif->ipif_was_dup = B_TRUE;

		if (ill->ill_isv6) {
			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
			(void) ipif_up_done_v6(ipif);
		} else {
			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
			    EINPROGRESS);
			(void) ipif_up_done(ipif);
		}
	}
	freeb(mp);
}

/*
 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
 * As long as someone else holds the address, the interface will stay down.
 * When that conflict goes away, the interface is brought back up.  This is
 * done so that accidental shutdowns of addresses aren't made permanent.  Your
 * server will recover from a failure.
 *
 * For DHCP and temporary addresses, recovery is not done in the kernel.
 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
 *
 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
 */
void
ipif_dup_recovery(void *arg)
{
	ipif_t *ipif = arg;

	/* Our timeout has fired; clear the outstanding timer ID */
	ipif->ipif_recovery_id = 0;
	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
		return;

	/*
	 * No lock, because this is just an optimization.
	 */
	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
		return;

	/* If the link is down, we'll retry this later */
	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
		return;

	ipif_do_recovery(ipif);
}

/*
 * Perform interface recovery by forcing the duplicate interfaces up and
 * allowing the system to determine which ones should stay up.
 *
 * Called both by recovery timer expiry and link-up notification.
 */
void
ipif_do_recovery(ipif_t *ipif)
{
	ill_t		*ill = ipif->ipif_ill;
	mblk_t		*mp;
	ip_stack_t	*ipst = ill->ill_ipst;
	size_t		mp_size;

	/* The mblk carries the address being recovered (v6 or v4) */
	if (ipif->ipif_isv6)
		mp_size = sizeof (ipif->ipif_v6lcl_addr);
	else
		mp_size = sizeof (ipif->ipif_lcl_addr);
	mp = allocb(mp_size, BPRI_MED);
	if (mp == NULL) {
		/*
		 * Allocation failed: reschedule ourselves so recovery is
		 * retried later, unless recovery is disabled, a timer is
		 * already pending, or the ipif is going away.
		 */
		mutex_enter(&ill->ill_lock);
		if (ipst->ips_ip_dup_recovery > 0 &&
		    ipif->ipif_recovery_id == 0 &&
		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
		}
		mutex_exit(&ill->ill_lock);
	} else {
		/*
		 * A recovery timer may still be running if we got here from
		 * ill_restart_dad(); cancel that timer.
		 */
		if (ipif->ipif_recovery_id != 0)
			(void) untimeout(ipif->ipif_recovery_id);
		ipif->ipif_recovery_id = 0;

		if (ipif->ipif_isv6) {
			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
			    sizeof (ipif->ipif_v6lcl_addr));
		} else {
			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
			    sizeof (ipif->ipif_lcl_addr));
		}
		/* Run ip_addr_recover as an exclusive operation on the ill */
		ill_refhold(ill);
		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
		    B_FALSE);
	}
}

/*
 * Find the MAC and IP addresses in an NA/NS message.
 */
static void
ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
    in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
{
	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
	uchar_t *addr;
	int alen;

	/* icmp_inbound_v6 ensures this */
	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);

	/* The link-layer source was recorded on receive */
	addr = ira->ira_l2src;
	alen = ill->ill_phys_addr_length;
	if (alen > 0) {
		*haddr = addr;
		*haddrlenp = alen;
	} else {
		*haddr = NULL;
		*haddrlenp = 0;
	}

	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
	*targp = ns->nd_ns_target;
}

/*
 * This is for exclusive changes due to NDP duplicate address detection
 * failure.
1399 */ 1400 /* ARGSUSED */ 1401 static void 1402 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1403 { 1404 ill_t *ill = rq->q_ptr; 1405 ipif_t *ipif; 1406 uchar_t *haddr; 1407 uint_t haddrlen; 1408 ip_stack_t *ipst = ill->ill_ipst; 1409 in6_addr_t targ; 1410 ip_recv_attr_t iras; 1411 mblk_t *attrmp; 1412 1413 attrmp = mp; 1414 mp = mp->b_cont; 1415 attrmp->b_cont = NULL; 1416 if (!ip_recv_attr_from_mblk(attrmp, &iras)) { 1417 /* The ill or ip_stack_t disappeared on us */ 1418 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1419 ip_drop_input("ip_recv_attr_from_mblk", mp, ill); 1420 freemsg(mp); 1421 ira_cleanup(&iras, B_TRUE); 1422 return; 1423 } 1424 1425 ASSERT(ill == iras.ira_rill); 1426 1427 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); 1428 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { 1429 /* 1430 * Ignore conflicts generated by misbehaving switches that 1431 * just reflect our own messages back to us. For IPMP, we may 1432 * see reflections across any ill in the illgrp. 1433 * 1434 * RFC2462 and revisions tried to detect both the case 1435 * when a statically configured IPv6 address is a duplicate, 1436 * and the case when the L2 address itself is a duplicate. The 1437 * later is important because, with stateles address autoconf, 1438 * if the L2 address is a duplicate, the resulting IPv6 1439 * address(es) would also be duplicates. We rely on DAD of the 1440 * IPv6 address itself to detect the latter case. 1441 */ 1442 /* For an under ill_grp can change under lock */ 1443 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1444 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || 1445 IS_UNDER_IPMP(ill) && 1446 ipmp_illgrp_find_ill(ill->ill_grp, haddr, 1447 haddrlen) != NULL) { 1448 rw_exit(&ipst->ips_ill_g_lock); 1449 goto ignore_conflict; 1450 } 1451 rw_exit(&ipst->ips_ill_g_lock); 1452 } 1453 1454 /* 1455 * Look up the appropriate ipif. 
1456 */ 1457 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); 1458 if (ipif == NULL) 1459 goto ignore_conflict; 1460 1461 /* Reload the ill to match the ipif */ 1462 ill = ipif->ipif_ill; 1463 1464 /* If it's already duplicate or ineligible, then don't do anything. */ 1465 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { 1466 ipif_refrele(ipif); 1467 goto ignore_conflict; 1468 } 1469 1470 /* 1471 * If this is a failure during duplicate recovery, then don't 1472 * complain. It may take a long time to recover. 1473 */ 1474 if (!ipif->ipif_was_dup) { 1475 char ibuf[LIFNAMSIZ]; 1476 char hbuf[MAC_STR_LEN]; 1477 char sbuf[INET6_ADDRSTRLEN]; 1478 1479 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 1480 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" 1481 " disabled", ibuf, 1482 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), 1483 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); 1484 } 1485 mutex_enter(&ill->ill_lock); 1486 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1487 ipif->ipif_flags |= IPIF_DUPLICATE; 1488 ill->ill_ipif_dup_count++; 1489 mutex_exit(&ill->ill_lock); 1490 (void) ipif_down(ipif, NULL, NULL); 1491 (void) ipif_down_tail(ipif); 1492 mutex_enter(&ill->ill_lock); 1493 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1494 ill->ill_net_type == IRE_IF_RESOLVER && 1495 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 1496 ipst->ips_ip_dup_recovery > 0) { 1497 ASSERT(ipif->ipif_recovery_id == 0); 1498 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1499 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1500 } 1501 mutex_exit(&ill->ill_lock); 1502 ipif_refrele(ipif); 1503 1504 ignore_conflict: 1505 freemsg(mp); 1506 ira_cleanup(&iras, B_TRUE); 1507 } 1508 1509 /* 1510 * Handle failure by tearing down the ipifs with the specified address. Note 1511 * that tearing down the ipif also means deleting the ncec through ipif_down, so 1512 * it's not possible to do recovery by just restarting the ncec timer. 
 * Instead, we start a timer on the ipif.
 * Caller has to free mp;
 */
static void
ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
{
	const uchar_t *haddr;
	ill_t	*ill = ira->ira_rill;

	/*
	 * Ignore conflicts generated by misbehaving switches that just
	 * reflect our own messages back to us.
	 */

	/* icmp_inbound_v6 ensures this */
	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
	haddr = ira->ira_l2src;
	if (haddr != NULL &&
	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
		return;
	}

	/*
	 * Copy the packet and its receive attributes, then dispatch the
	 * pair to ip_ndp_excl() as an exclusive (writer) operation on the
	 * ill.  The caller's mp is untouched (caller frees it).
	 */
	if ((mp = copymsg(mp)) != NULL) {
		mblk_t	*attrmp;

		attrmp = ip_recv_attr_to_mblk(ira);
		if (attrmp == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards", mp, ill);
			freemsg(mp);
		} else {
			ASSERT(attrmp->b_cont == NULL);
			attrmp->b_cont = mp;
			mp = attrmp;
			ill_refhold(ill);
			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
			    B_FALSE);
		}
	}
}

/*
 * Handle a discovered conflict: some other system is advertising that it owns
 * one of our IP addresses.  We need to defend ourselves, or just shut down
 * the interface.
 *
 * Handles both IPv4 and IPv6.  Returns B_TRUE if the caller must defend the
 * address by sending out an announcement, B_FALSE otherwise.
 */
boolean_t
ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
{
	ipif_t	*ipif;
	clock_t	now;
	uint_t	maxdefense;
	uint_t	defs;
	ill_t	*ill = ira->ira_ill;
	ip_stack_t *ipst = ill->ill_ipst;
	uint32_t elapsed;
	boolean_t isv6 = ill->ill_isv6;
	ipaddr_t ncec_addr;

	if (isv6) {
		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
		    ipst);
	} else {
		if (arp_no_defense) {
			/*
			 * Yes, there is a conflict, but no, we do not
			 * defend ourself.
			 */
			return (B_TRUE);
		}
		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
		    ipst);
	}
	if (ipif == NULL)
		return (B_FALSE);

	/*
	 * First, figure out if this address is disposable.
	 */
	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
		maxdefense = ipst->ips_ip_max_temp_defend;
	else
		maxdefense = ipst->ips_ip_max_defend;

	/*
	 * Now figure out how many times we've defended ourselves.  Ignore
	 * defenses that happened long in the past.
	 */
	now = ddi_get_lbolt();
	/* ticks -> microseconds -> seconds */
	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
	mutex_enter(&ncec->ncec_lock);
	if ((defs = ncec->ncec_defense_count) > 0 &&
	    elapsed > ipst->ips_ip_defend_interval) {
		/*
		 * ip_defend_interval has elapsed.
		 * reset the defense count.
		 */
		ncec->ncec_defense_count = defs = 0;
	}
	ncec->ncec_defense_count++;
	ncec->ncec_last_time_defended = now;
	mutex_exit(&ncec->ncec_lock);
	ipif_refrele(ipif);

	/*
	 * If we've defended ourselves too many times already, then give up and
	 * tear down the interface(s) using this address.
	 * Otherwise, caller has to defend by sending out an announce.
	 */
	if (defs >= maxdefense) {
		if (isv6)
			ndp_failure(mp, ira);
		else
			arp_failure(mp, ira);
	} else {
		return (B_TRUE);	/* caller must defend this address */
	}
	return (B_FALSE);
}

/*
 * Handle reception of Neighbor Solicitation messages.
 */
static void
ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill, *under_ill;
	nd_neighbor_solicit_t *ns;
	uint32_t	hlen = ill->ill_phys_addr_length;
	uchar_t		*haddr = NULL;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	ncec_t		*our_ncec = NULL;
	in6_addr_t	target;
	in6_addr_t	src;
	int		len;
	int		flag = 0;
	nd_opt_hdr_t	*opt = NULL;
	boolean_t	bad_solicit = B_FALSE;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
	boolean_t	need_ill_refrele = B_FALSE;

	ip6h = (ip6_t *)mp->b_rptr;
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	src = ip6h->ip6_src;
	ns = (nd_neighbor_solicit_t *)icmp_nd;
	target = ns->nd_ns_target;
	/* Reject martian targets: multicast, v4-mapped, loopback */
	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
	    IN6_IS_ADDR_LOOPBACK(&target)) {
		if (ip_debug > 2) {
			/* ip1dbg */
			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
			    AF_INET6, &target);
		}
		bad_solicit = B_TRUE;
		goto done;
	}
	if (len > sizeof (nd_neighbor_solicit_t)) {
		/* Options present */
		opt = (nd_opt_hdr_t *)&ns[1];
		len -= sizeof (nd_neighbor_solicit_t);
		if (!ndp_verify_optlen(opt, len)) {
			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
			bad_solicit = B_TRUE;
			goto done;
		}
	}
	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
		/* Check to see if this is a valid DAD solicitation */
		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("ndp_input_solicit: IPv6 "
				    "Destination is not solicited node "
				    "multicast %s\n", AF_INET6,
				    &ip6h->ip6_dst);
			}
			bad_solicit = B_TRUE;
			goto done;
		}
	}

	/*
	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
	 * received this packet if it's multicast) is not the ill tied to
	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
	 * to ensure we find the associated NCE.
	 */
	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
	/*
	 * If this is a valid Solicitation for an address we are publishing,
	 * then a PUBLISH entry should exist in the cache
	 */
	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
		    "ifname=%s ", ill->ill_name));
		if (ip_debug > 2) {
			/* ip1dbg */
			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
		}
		/* An NS for an address we don't own at all is bad */
		if (our_ncec == NULL)
			bad_solicit = B_TRUE;
		goto done;
	}

	/* At this point we should have a verified NS per spec */
	if (opt != NULL) {
		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
		if (opt != NULL) {
			haddr = (uchar_t *)&opt[1];
			/* The SLLA option must hold at least hlen bytes */
			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
			    hlen == 0) {
				ip1dbg(("ndp_input_advert: bad SLLA\n"));
				bad_solicit = B_TRUE;
				goto done;
			}
		}
	}

	/* If sending directly to peer, set the unicast flag */
	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
		flag |= NDP_UNICAST;

	/*
	 * Create/update the entry for the soliciting node on the ipmp_ill.
	 * or respond to outstanding queries, don't if
	 * the source is unspecified address.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
		int	err;
		nce_t	*nnce;

		ASSERT(ill->ill_isv6);
		/*
		 * Regular solicitations *must* include the Source Link-Layer
		 * Address option.  Ignore messages that do not.
		 */
		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
			ip1dbg(("ndp_input_solicit: source link-layer address "
			    "option missing with a specified source.\n"));
			bad_solicit = B_TRUE;
			goto done;
		}

		/*
		 * This is a regular solicitation.  If we're still in the
		 * process of verifying the address, then don't respond at all
		 * and don't keep track of the sender.
		 */
		if (our_ncec->ncec_state == ND_PROBE)
			goto done;

		/*
		 * If the solicitation doesn't have sender hardware address
		 * (legal for unicast solicitation), then process without
		 * installing the return NCE.  Either we already know it, or
		 * we'll be forced to look it up when (and if) we reply to the
		 * packet.
		 */
		if (haddr == NULL)
			goto no_source;

		/* For IPMP under-ills, install the entry on the IPMP ill */
		under_ill = ill;
		if (IS_UNDER_IPMP(under_ill)) {
			ill = ipmp_ill_hold_ipmp_ill(under_ill);
			if (ill == NULL)
				ill = under_ill;
			else
				need_ill_refrele = B_TRUE;
		}
		err = nce_lookup_then_add_v6(ill,
		    haddr, hlen,
		    &src,	/* Soliciting nodes address */
		    0,
		    ND_STALE,
		    &nnce);

		if (need_ill_refrele) {
			ill_refrele(ill);
			ill = under_ill;
			need_ill_refrele = B_FALSE;
		}
		switch (err) {
		case 0:
			/* done with this entry */
			nce_refrele(nnce);
			break;
		case EEXIST:
			/*
			 * B_FALSE indicates this is not an an advertisement.
			 */
			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
			nce_refrele(nnce);
			break;
		default:
			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
			    err));
			goto done;
		}
no_source:
		flag |= NDP_SOLICITED;
	} else {
		/*
		 * No source link layer address option should be present in a
		 * valid DAD request.
		 */
		if (haddr != NULL) {
			ip1dbg(("ndp_input_solicit: source link-layer address "
			    "option present with an unspecified source.\n"));
			bad_solicit = B_TRUE;
			goto done;
		}
		if (our_ncec->ncec_state == ND_PROBE) {
			/*
			 * Internally looped-back probes will have
			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
			 * transmissions.
			 */
			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
				/*
				 * If someone else is probing our address, then
				 * we've crossed wires.  Declare failure.
				 */
				ndp_failure(mp, ira);
			}
			goto done;
		}
		/*
		 * This is a DAD probe.  Multicast the advertisement to the
		 * all-nodes address.
		 */
		src = ipv6_all_hosts_mcast;
	}
	flag |= nce_advert_flags(our_ncec);
	(void) ndp_xmit(ill,
	    ND_NEIGHBOR_ADVERT,
	    our_ncec->ncec_lladdr,
	    our_ncec->ncec_lladdr_length,
	    &target,	/* Source and target of the advertisement pkt */
	    &src,	/* IP Destination (source of original pkt) */
	    flag);
done:
	if (bad_solicit)
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
	if (our_ncec != NULL)
		ncec_refrele(our_ncec);
}

/*
 * Handle reception of Neighbor Advertisement messages.
 */
void
ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	nd_neighbor_advert_t *na;
	uint32_t	hlen = ill->ill_phys_addr_length;
	uchar_t		*haddr = NULL;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	ncec_t		*dst_ncec = NULL;
	in6_addr_t	target;
	nd_opt_hdr_t	*opt = NULL;
	int		len;
	ip_stack_t	*ipst = ill->ill_ipst;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;

	ip6h = (ip6_t *)mp->b_rptr;
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	na = (nd_neighbor_advert_t *)icmp_nd;

	/* A solicited NA must not be sent to a multicast destination */
	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
		ip1dbg(("ndp_input_advert: Target is multicast but the "
		    "solicited flag is not zero\n"));
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
		return;
	}
	target = na->nd_na_target;
	/* Reject martian targets: multicast, v4-mapped, loopback */
	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
	    IN6_IS_ADDR_LOOPBACK(&target)) {
		if (ip_debug > 2) {
			/* ip1dbg */
			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
			    AF_INET6, &target);
		}
		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
		return;
	}
	if (len > sizeof (nd_neighbor_advert_t)) {
		opt = (nd_opt_hdr_t *)&na[1];
		if (!ndp_verify_optlen(opt,
		    len - sizeof (nd_neighbor_advert_t))) {
			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
			return;
		}
		/* At this point we have a verified NA per spec */
		len -= sizeof (nd_neighbor_advert_t);
		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
		if (opt != NULL) {
			haddr = (uchar_t *)&opt[1];
			/* The TLLA option must hold at least hlen bytes */
			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
			    hlen == 0) {
				ip1dbg(("ndp_input_advert: bad SLLA\n"));
				BUMP_MIB(mib,
				    ipv6IfIcmpInBadNeighborAdvertisements);
				return;
			}
		}
	}

	/*
	 * NOTE: we match across the illgrp since we need to do DAD for all of
	 * our local addresses, and those are spread across all the active
	 * ills in the group.
	 */
	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
		return;

	if (NCE_PUBLISH(dst_ncec)) {
		/*
		 * Someone just advertised an address that we publish.  First,
		 * check if it was us -- if so, we can safely ignore it.
		 * We don't get the haddr from the ira_l2src because, in the
		 * case that the packet originated from us, on an IPMP group,
		 * the ira_l2src may be the link-layer address of the
		 * cast_ill used to send the packet, which may not be the same
		 * as the dst_ncec->ncec_lladdr of the address.
		 */
		if (haddr != NULL) {
			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
				goto out;

			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
				goto out;	/* from us -- no conflict */

			/*
			 * If we're in an IPMP group, check if this is an echo
			 * from another ill in the group.  Use the double-
			 * checked locking pattern to avoid grabbing
			 * ill_g_lock in the non-IPMP case.
			 */
			if (IS_UNDER_IPMP(ill)) {
				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
				    ill->ill_grp, haddr, hlen) != NULL) {
					rw_exit(&ipst->ips_ill_g_lock);
					goto out;
				}
				rw_exit(&ipst->ips_ill_g_lock);
			}
		}

		/*
		 * This appears to be a real conflict.  If we're trying to
		 * configure this NCE (ND_PROBE), then shut it down.
		 * Otherwise, handle the discovered conflict.
		 */
		if (dst_ncec->ncec_state == ND_PROBE) {
			ndp_failure(mp, ira);
		} else {
			if (ip_nce_conflict(mp, ira, dst_ncec)) {
				char hbuf[MAC_STR_LEN];
				char sbuf[INET6_ADDRSTRLEN];

				cmn_err(CE_WARN,
				    "node '%s' is using %s on %s",
				    inet_ntop(AF_INET6, &target, sbuf,
				    sizeof (sbuf)),
				    haddr == NULL ? "<none>" :
				    mac_colon_addr(haddr, hlen, hbuf,
				    sizeof (hbuf)), ill->ill_name);
				/*
				 * RFC 4862, Section 5.4.4 does not mandate
				 * any specific behavior when an NA matches
				 * a non-tentative address assigned to the
				 * receiver.  We make the choice of defending
				 * our address, based on the assumption that
				 * the sender has not detected the Duplicate.
				 *
				 * ncec_last_time_defended has been adjusted
				 * in ip_nce_conflict()
				 */
				(void) ndp_announce(dst_ncec);
			}
		}
	} else {
		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
			dst_ncec->ncec_flags |= NCE_F_ISROUTER;

		/* B_TRUE indicates this an advertisement */
		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
	}
out:
	ncec_refrele(dst_ncec);
}

/*
 * Process NDP neighbor solicitation/advertisement messages.
 * The checksum has already been verified before reaching here.
 * Information about the datalink header is contained in ira_l2src, but
 * that should be ignored for loopback packets.
 */
void
ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_rill;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	int		len;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
	ill_t		*orig_ill = NULL;

	/*
	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
	 * and make it be the IPMP upper so avoid being confused by a packet
	 * addressed to a unicast address on a different ill.
	 */
	if (IS_UNDER_IPMP(ill)) {
		orig_ill = ill;
		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
		if (ill == NULL) {
			ill = orig_ill;
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
			    mp, ill);
			freemsg(mp);
			return;
		}
		ASSERT(ill != orig_ill);
		/* Swap ira_ill to the IPMP ill; restored at "done" below */
		orig_ill = ira->ira_ill;
		ira->ira_ill = ill;
		mib = ill->ill_icmp6_mib;
	}
	/* Pull the whole message into a single contiguous mblk */
	if (!pullupmsg(mp, -1)) {
		ip1dbg(("ndp_input: pullupmsg failed\n"));
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
		goto done;
	}
	ip6h = (ip6_t *)mp->b_rptr;
	/* ND packets must arrive with an undecremented hop limit of 255 */
	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
		goto done;
	}
	/*
	 * NDP does not accept any extension headers between the
	 * IP header and the ICMP header since e.g. a routing
	 * header could be dangerous.
	 * This assumes that any AH or ESP headers are removed
	 * by ip prior to passing the packet to ndp_input.
	 */
	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
		    ip6h->ip6_nxt));
		ip_drop_input("Wrong next header", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	/* Caller dispatches only NS/NA packets here */
	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
	if (icmp_nd->icmp6_code != 0) {
		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
		ip_drop_input("code non-zero", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	/*
	 * Make sure packet length is large enough for either
	 * a NS or a NA icmp packet.
	 */
	if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
		ip1dbg(("ndp_input: packet too short\n"));
		ip_drop_input("packet too short", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
		ndp_input_solicit(mp, ira);
	} else {
		ndp_input_advert(mp, ira);
	}
done:
	freemsg(mp);
	if (orig_ill != NULL) {
		/* Undo the IPMP ill swap and drop the hold taken above */
		ill_refrele(ill);
		ira->ira_ill = orig_ill;
	}
}

/*
 * ndp_xmit is called to form and transmit a ND solicitation or
 * advertisement ICMP packet.
 *
 * If the source address is unspecified and this isn't a probe (used for
 * duplicate address detection), an appropriate source address and link layer
 * address will be chosen here.  The link layer address option is included if
 * the source is specified (i.e., all non-probe packets), and omitted (per the
 * specification) otherwise.
 *
 * It returns B_FALSE only if it does a successful put() to the
 * corresponding ill's ill_wq otherwise returns B_TRUE.
 */
static boolean_t
ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
	uint32_t	len;
	icmp6_t		*icmp6;
	mblk_t		*mp;
	ip6_t		*ip6h;
	nd_opt_hdr_t	*opt;
	uint_t		plen;
	zoneid_t	zoneid = GLOBAL_ZONEID;
	ill_t		*hwaddr_ill = ill;
	ip_xmit_attr_t	ixas;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	need_refrele = B_FALSE;
	boolean_t	probe = B_FALSE;

	if (IS_UNDER_IPMP(ill)) {
		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
		/*
		 * We send non-probe packets on the upper IPMP interface.
		 * ip_output_simple() will use cast_ill for sending any
		 * multicast packets. Note that we can't follow the same
		 * logic for probe packets because all interfaces in the ipmp
		 * group may have failed, so that we really want to only try
		 * to send the ND packet on the ill corresponding to the src
		 * address.
		 */
		if (!probe) {
			ill = ipmp_ill_hold_ipmp_ill(ill);
			if (ill != NULL)
				need_refrele = B_TRUE;
			else
				ill = hwaddr_ill;
		}
	}

	/*
	 * If we have a unspecified source(sender) address, select a
	 * proper source address for the solicitation here itself so
	 * that we can initialize the h/w address correctly.
	 *
	 * If the sender is specified then we use this address in order
	 * to lookup the zoneid before calling ip_output_v6(). This is to
	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
	 * by IP (we cannot guarantee that the global zone has an interface
	 * route to the destination).
	 *
	 * Note that the NA never comes here with the unspecified source
	 * address.
	 */

	/*
	 * Probes will have unspec src at this point.
	 */
	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
		/*
		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
		 * ALL_ZONES if it cannot find a matching ipif for the address
		 * we are trying to use. In this case we err on the side of
		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = GLOBAL_ZONEID;
	}

	/*
	 * Round the link-layer address option up to a multiple of 8 octets,
	 * per the ND option length encoding.  NS and NA bodies are the same
	 * size, so nd_neighbor_advert_t is used for both.
	 */
	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
	mp = allocb(len, BPRI_LO);
	if (mp == NULL) {
		if (need_refrele)
			ill_refrele(ill);
		return (B_TRUE);
	}

	bzero((char *)mp->b_rptr, len);
	mp->b_wptr = mp->b_rptr + len;

	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;

	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
	ixas.ixa_ipst = ipst;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = NULL;
	ixas.ixa_zoneid = zoneid;

	ip6h = (ip6_t *)mp->b_rptr;
	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
	ip6h->ip6_nxt = IPPROTO_ICMPV6;
	ip6h->ip6_hops = IPV6_MAX_HOPS;
	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
	ip6h->ip6_dst = *target;
	icmp6 = (icmp6_t *)&ip6h[1];

	if (hw_addr_len != 0) {
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
		    sizeof (nd_neighbor_advert_t));
	} else {
		opt = NULL;
	}
	if (operation == ND_NEIGHBOR_SOLICIT) {
		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;

		if (opt != NULL && !(flag & NDP_PROBE)) {
			/*
			 * Note that we don't send out SLLA for ND probes
			 * per RFC 4862, even though we do send out the src
			 * haddr for IPv4 DAD probes, even though both IPv4
			 * and IPv6 go out with the unspecified/INADDR_ANY
			 * src IP addr.
			 */
			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
		}
		ip6h->ip6_src = *sender;
		ns->nd_ns_target = *target;
		if (!(flag & NDP_UNICAST)) {
			/* Form multicast address of the target */
			ip6h->ip6_dst = ipv6_solicited_node_mcast;
			ip6h->ip6_dst.s6_addr32[3] |=
			    ns->nd_ns_target.s6_addr32[3];
		}
	} else {
		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;

		ASSERT(!(flag & NDP_PROBE));
		if (opt != NULL)
			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		ip6h->ip6_src = *sender;
		na->nd_na_target = *sender;
		if (flag & NDP_ISROUTER)
			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
		if (flag & NDP_SOLICITED)
			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
		if (flag & NDP_ORIDE)
			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
	}

	if (!(flag & NDP_PROBE)) {
		if (hw_addr != NULL && opt != NULL) {
			/* Fill in link layer address and option len */
			opt->nd_opt_len = (uint8_t)plen;
			bcopy(hw_addr, &opt[1], hw_addr_len);
		}
	}
	if (opt != NULL && opt->nd_opt_type == 0) {
		/* If there's no link layer address option, then strip it. */
		len -= plen * 8;
		mp->b_wptr = mp->b_rptr + len;
		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
	}

	icmp6->icmp6_type = (uint8_t)operation;
	icmp6->icmp6_code = 0;
	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_output.c.
	 */
	icmp6->icmp6_cksum = ip6h->ip6_plen;

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
	if (need_refrele)
		ill_refrele(ill);
	return (B_FALSE);
}

/*
 * Used to set ND_UNREACHABLE before ncec_delete sets it NCE_F_CONDEMNED.
 * The datapath uses this as an indication that there
 * is a problem (as opposed to a NCE that was just
 * reclaimed due to lack of memory).
 * Note that static ARP entries never become unreachable.
 */
void
nce_make_unreachable(ncec_t *ncec)
{
	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_state = ND_UNREACHABLE;
	mutex_exit(&ncec->ncec_lock);
}

/*
 * NCE retransmit timer. Common to IPv4 and IPv6.
 * This timer goes off when:
 * a. It is time to retransmit a resolution for resolver.
 * b. It is time to send reachability probes.
 */
void
nce_timer(void *arg)
{
	ncec_t		*ncec = arg;
	ill_t		*ill = ncec->ncec_ill, *src_ill;
	char		addrbuf[INET6_ADDRSTRLEN];
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	in_addr_t	sender4 = INADDR_ANY;
	in6_addr_t	sender6 = ipv6_all_zeros;

	/*
	 * The timer has to be cancelled by ncec_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the ncec
	 */
	ASSERT(ncec != NULL);
	mutex_enter(&ncec->ncec_lock);
	ncec_refhold_locked(ncec);
	ncec->ncec_timeout_id = 0;
	mutex_exit(&ncec->ncec_lock);

	src_ill = nce_resolve_src(ncec, &sender6);
	/* if we could not find a sender address, retry later and return */
	if (src_ill == NULL) {
		if (!isv6) {
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
			    &sender4, addrbuf, sizeof (addrbuf))));
		} else {
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
		}
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		ncec_refrele(ncec);
		return;
	}
	if (!isv6)
		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);

	mutex_enter(&ncec->ncec_lock);
	/*
	 * Check the reachability state.  Each case below is responsible for
	 * dropping ncec_lock on every path out of the switch.
	 */
	switch (ncec->ncec_state) {
	case ND_DELAY:
		/* Send the first unicast solicitation and move to PROBE */
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		if (isv6) {
			mutex_exit(&ncec->ncec_lock);
			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
			    src_ill->ill_phys_addr,
			    src_ill->ill_phys_addr_length,
			    &sender6, &ncec->ncec_addr,
			    NDP_UNICAST);
		} else {
			dropped = (arp_request(ncec, sender4, src_ill) == 0);
			mutex_exit(&ncec->ncec_lock);
		}
		if (!dropped) {
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
		}
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("nce_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
		}
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		break;
	case ND_PROBE:
		/* must be retransmit timer */
		ASSERT(ncec->ncec_pcnt >= -1);
		if (ncec->ncec_pcnt > 0) {
			/*
			 * As per RFC2461, the ncec gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt,
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			if (NCE_PUBLISH(ncec)) {
				mutex_exit(&ncec->ncec_lock);
				/*
				 * send out a probe; note that src_ill
				 * is ignored by nce_dad() for all
				 * DAD message types other than IPv6
				 * unicast probes
				 */
				nce_dad(ncec, src_ill, B_TRUE);
			} else {
				ASSERT(src_ill != NULL);
				if (isv6) {
					mutex_exit(&ncec->ncec_lock);
					dropped = ndp_xmit(src_ill,
					    ND_NEIGHBOR_SOLICIT,
					    src_ill->ill_phys_addr,
					    src_ill->ill_phys_addr_length,
					    &sender6, &ncec->ncec_addr,
					    NDP_UNICAST);
				} else {
					/*
					 * since the nce is REACHABLE,
					 * the ARP request will be sent out
					 * as a link-layer unicast.
					 */
					dropped = (arp_request(ncec, sender4,
					    src_ill) == 0);
					mutex_exit(&ncec->ncec_lock);
				}
				if (!dropped) {
					mutex_enter(&ncec->ncec_lock);
					ncec->ncec_pcnt--;
					mutex_exit(&ncec->ncec_lock);
				}
				nce_restart_timer(ncec,
				    ill->ill_reachable_retrans_time);
			}
		} else if (ncec->ncec_pcnt < 0) {
			/* No hope, delete the ncec */
			/* Tell datapath it went bad */
			ncec->ncec_state = ND_UNREACHABLE;
			mutex_exit(&ncec->ncec_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("nce_timer: Delete NCE for"
				    " dst %s\n", (isv6 ? AF_INET6 : AF_INET),
				    &ncec->ncec_addr);
			}
			/* if static ARP can't delete. */
			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
				ncec_delete(ncec);

		} else if (!NCE_PUBLISH(ncec)) {
			/*
			 * Probe count is 0 for a dynamic entry (one that we
			 * ourselves are not publishing). We should never get
			 * here if NONUD was requested, hence the ASSERT below.
			 */
			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
			/* Wait one interval before killing */
			nce_restart_timer(ncec,
			    ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;
			ipaddr_t ncec_addr;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable.  Let IP know that it's ok to
			 * use.
			 */
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
			mutex_exit(&ncec->ncec_lock);
			if (isv6) {
				ipif = ipif_lookup_addr_exact_v6(
				    &ncec->ncec_addr, ill, ipst);
			} else {
				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
				    ncec_addr);
				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
				    ipst);
			}
			if (ipif != NULL) {
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ];
					char sbuf[INET6_ADDRSTRLEN];

					ipif->ipif_was_dup = B_FALSE;
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					ipif_get_name(ipif, ibuf,
					    sizeof (ibuf));
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready)
					ipif_up_notify(ipif);
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			if (!isv6 && arp_no_defense)
				break;
			/* Begin defending our new address */
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				if (isv6) {
					dropped = ndp_announce(ncec);
				} else {
					dropped = arp_announce(ncec);
				}

				if (dropped)
					ncec->ncec_unsolicit_count++;
				else
					ncec->ncec_last_time_defended =
					    ddi_get_lbolt();
			}
			if (ncec->ncec_unsolicit_count > 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else if (DEFENSE_INTERVAL(isv6) != 0) {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down.  Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			ncec->ncec_state = ND_REACHABLE;
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	case ND_INCOMPLETE: {
		mblk_t	*mp, *nextmp;
		mblk_t	**prevmpp;

		/*
		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
		 * for any IPMP probe packets, and toss them.  IPMP probe
		 * packets will always be at the head of ncec_qd_mp, so that
		 * we can stop at the first queued ND packet that is
		 * not a probe packet.
		 */
		prevmpp = &ncec->ncec_qd_mp;
		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
			nextmp = mp->b_next;

			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
				inet_freemsg(mp);
				ncec->ncec_nprobes--;
				*prevmpp = nextmp;
			} else {
				prevmpp = &mp->b_next;
			}
		}

		/*
		 * Must be resolver's retransmit timer.
		 */
		mutex_exit(&ncec->ncec_lock);
		ip_ndp_resolve(ncec);
		break;
	}
	case ND_REACHABLE:
		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
		    ncec->ncec_unsolicit_count != 0) ||
		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				mutex_exit(&ncec->ncec_lock);
				/*
				 * When we get to zero announcements left,
				 * switch to address defense
				 */
			} else {
				boolean_t rate_limit;

				mutex_exit(&ncec->ncec_lock);
				rate_limit = ill_defend_rate_limit(ill, ncec);
				if (rate_limit) {
					nce_restart_timer(ncec,
					    DEFENSE_INTERVAL(isv6));
					break;
				}
			}
			if (isv6) {
				dropped = ndp_announce(ncec);
			} else {
				dropped = arp_announce(ncec);
			}
			mutex_enter(&ncec->ncec_lock);
			if (dropped) {
				ncec->ncec_unsolicit_count++;
			} else {
				ncec->ncec_last_time_defended =
				    ddi_get_lbolt();
			}
			mutex_exit(&ncec->ncec_lock);
			if (ncec->ncec_unsolicit_count != 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	default:
		mutex_exit(&ncec->ncec_lock);
		break;
	}
done:
	ncec_refrele(ncec);
	ill_refrele(src_ill);
}

/*
 * Set a link layer address from the ll_addr passed in.
 * Copy SAP from ill.
 */
static void
nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
{
	ill_t *ill = ncec->ncec_ill;

	ASSERT(ll_addr != NULL);
	if (ill->ill_phys_addr_length > 0) {
		/*
		 * The bcopy() below used to be called for the physical address
		 * length rather than the link layer address length. For
		 * ethernet and many other media, the phys_addr and lla are
		 * identical.
		 *
		 * The phys_addr and lla may not be the same for devices that
		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
		 * no known instances of these.
		 *
		 * For PPP or other interfaces with a zero length
		 * physical address, don't do anything here.
		 * The bcopy() with a zero phys_addr length was previously
		 * a no-op for interfaces with a zero-length physical address.
		 * Using the lla for them would change the way they operate.
		 * Doing nothing in such cases preserves expected behavior.
		 */
		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
	}
}

/*
 * Compare ll_addr against the link layer address of ncec.  Returns B_TRUE
 * if they differ in the first ll_addr_len bytes; B_FALSE if they match or
 * if ll_addr is NULL.
 */
boolean_t
nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
    uint32_t ll_addr_len)
{
	ASSERT(ncec->ncec_lladdr != NULL);
	if (ll_addr == NULL)
		return (B_FALSE);
	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
		return (B_TRUE);
	return (B_FALSE);
}

/*
 * Updates the link layer address or the reachability state of
 * a cache entry.  Reset probe counter if needed.
 * Caller must hold ncec_lock; the lock is dropped and reacquired internally
 * around the untimeout() and fastpath refresh.
 */
void
nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
{
	ill_t		*ill = ncec->ncec_ill;
	boolean_t	need_stop_timer = B_FALSE;
	boolean_t	need_fastpath_update = B_FALSE;
	nce_t		*nce = NULL;
	timeout_id_t	tid;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	/*
	 * If this interface does not do NUD, there is no point
	 * in allowing an update to the cache entry.  Although
	 * we will respond to NS.
	 * The only time we accept an update for a resolver when
	 * NUD is turned off is when it has just been created.
	 * Non-Resolvers will always be created as REACHABLE.
	 */
	if (new_state != ND_UNCHANGED) {
		if ((ncec->ncec_flags & NCE_F_NONUD) &&
		    (ncec->ncec_state != ND_INCOMPLETE))
			return;
		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
		need_stop_timer = B_TRUE;
		if (new_state == ND_REACHABLE)
			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
		else {
			/* We force NUD in this case */
			ncec->ncec_last = 0;
		}
		ncec->ncec_state = new_state;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
		    new_state == ND_INCOMPLETE);
	}
	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
		tid = ncec->ncec_timeout_id;
		ncec->ncec_timeout_id = 0;
	}
	/*
	 * Re-trigger fastpath probe and
	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
	 * whatever packets that happens to be transmitting at the time.
	 */
	if (new_ll_addr != NULL) {
		bcopy(new_ll_addr, ncec->ncec_lladdr,
		    ill->ill_phys_addr_length);
		need_fastpath_update = B_TRUE;
	}
	mutex_exit(&ncec->ncec_lock);
	/*
	 * NOTE(review): this condition is re-evaluated after ncec_lock was
	 * dropped; if NCE_F_STATIC could be set concurrently between the two
	 * checks, tid would be read uninitialized here -- confirm that
	 * ncec_flags cannot change in that window.
	 */
	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
		if (tid != 0)
			(void) untimeout(tid);
	}
	if (need_fastpath_update) {
		/*
		 * Delete any existing existing dlur_mp and fp_mp information.
		 * For IPMP interfaces, all underlying ill's must be checked
		 * and purged.
		 */
		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
		/*
		 * add the new dlur_mp and fp_mp
		 */
		nce = nce_fastpath(ncec, B_TRUE, NULL);
		if (nce != NULL)
			nce_refrele(nce);
	}
	mutex_enter(&ncec->ncec_lock);
}

/*
 * Common queueing code for nce_queue_mp(): append mp to ncec_qd_mp (or
 * prepend it for IPMP probes), bounding the queue at ill_max_buf entries
 * by dropping from the head.
 */
static void
nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
	uint_t	count = 0;
	mblk_t	**mpp, *tmp;

	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
		if (++count > ncec->ncec_ill->ill_max_buf) {
			tmp = ncec->ncec_qd_mp->b_next;
			ncec->ncec_qd_mp->b_next = NULL;
			/*
			 * if we never create data addrs on the under_ill
			 * does this matter?
			 */
			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
			    ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
			    ncec->ncec_ill);
			freemsg(ncec->ncec_qd_mp);
			ncec->ncec_qd_mp = tmp;
		}
	}

	if (head_insert) {
		ncec->ncec_nprobes++;
		mp->b_next = ncec->ncec_qd_mp;
		ncec->ncec_qd_mp = mp;
	} else {
		*mpp = mp;
	}
}

/*
 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
 * queued at the head or tail of the queue based on the input argument
 * 'head_insert'. The caller should specify this argument as B_TRUE if this
 * packet is an IPMP probe packet, in which case the following happens:
 *
 *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
 *	(non-ipmp_probe) load-speading case where the source address of the ND
 *	packet is not tied to ncec_ill. If the ill bound to the source address
 *	cannot receive, the response to the ND packet will not be received.
 *	However, if ND packets for ncec_ill's probes are queued	behind that ND
 *	packet, those probes will also fail to be sent, and thus in.mpathd will
 *	erroneously conclude that ncec_ill has also failed.
 *
 *   2. Drop the ipmp_probe packet in ndp_timer() if the ND did	not succeed on
 *	the first attempt.  This ensures that ND problems do not manifest as
 *	probe RTT spikes.
 *
 * We achieve this by inserting ipmp_probe() packets at the head of the
 * nce_queue.
 *
 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
 */
void
nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	nce_queue_mp_common(ncec, mp, head_insert);
}

/*
 * Called when address resolution failed due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 */
void
ndp_resolv_failed(ncec_t *ncec)
{
	mblk_t		*mp, *nxt_mp;
	char		buf[INET6_ADDRSTRLEN];
	ill_t		*ill = ncec->ncec_ill;
	ip_recv_attr_t	iras;

	bzero(&iras, sizeof (iras));
	iras.ira_flags = 0;
	/*
	 * we are setting the ira_rill to the ipmp_ill (instead of
	 * the actual ill on which the packet was received), but this
	 * is ok because we don't actually need the real ira_rill
	 * to send the icmp unreachable to the sender.
	 */
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	ip1dbg(("ndp_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
	/* Detach the entire queued chain under the lock, then walk it */
	mutex_enter(&ncec->ncec_lock);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);
	while (mp != NULL) {
		nxt_mp = mp->b_next;
		mp->b_next = NULL;

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
		    mp, ill);
		icmp_unreachable_v6(mp,
		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* finish off waiting callbacks */
}

/*
 * Handle the completion of NDP and ARP resolution.
 * Drains the queued packets for ncec and transmits each through the
 * freshly-resolved nce via ip_xmit().
 */
void
nce_resolv_ok(ncec_t *ncec)
{
	mblk_t		*mp;
	uint_t		pkt_len;
	iaflags_t	ixaflags = IXAF_NO_TRACE;
	nce_t		*nce;
	ill_t		*ill = ncec->ncec_ill;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ip_stack_t	*ipst = ill->ill_ipst;

	if (IS_IPMP(ncec->ncec_ill)) {
		nce_resolv_ipmp_ok(ncec);
		return;
	}
	/* non IPMP case */

	mutex_enter(&ncec->ncec_lock);
	ASSERT(ncec->ncec_nprobes == 0);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		/* Recover the packet length from the IP header for ip_xmit */
		if (ill->ill_isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
		}
		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		/*
		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
		 * longer available, but it's ok to drop this flag because TCP
		 * has its own flow-control in effect, so TCP packets
		 * are not likely to get here when flow-control is in effect.
		 */
		mutex_enter(&ill->ill_lock);
		nce = nce_lookup(ill, &ncec->ncec_addr);
		mutex_exit(&ill->ill_lock);

		if (nce == NULL) {
			if (isv6) {
				BUMP_MIB(&ipst->ips_ip6_mib,
				    ipIfStatsOutDiscards);
			} else {
				BUMP_MIB(&ipst->ips_ip_mib,
				    ipIfStatsOutDiscards);
			}
			ip_drop_output("ipIfStatsOutDiscards - no nce",
			    mp, NULL);
			freemsg(mp);
		} else {
			/*
			 * We don't know the zoneid, but
			 * ip_xmit does not care since IXAF_NO_TRACE
			 * is set. (We traced the packet the first
			 * time through ip_xmit.)
			 */
			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(nce);
		}
		mp = nxt_mp;
	}

	ncec_cb_dispatch(ncec);	/* complete callbacks */
}

/*
 * Called by SIOCSNDP* ioctl to add/change an ncec entry
 * and the corresponding attributes.
 * Disallow states other than ND_REACHABLE or ND_STALE.
 */
int
ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
{
	sin6_t		*sin6;
	in6_addr_t	*addr;
	ncec_t		*ncec;
	nce_t		*nce;
	int		err = 0;
	uint16_t	new_flags = 0;
	uint16_t	old_flags = 0;
	int		inflags = lnr->lnr_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	do_postprocess = B_FALSE;

	ASSERT(ill->ill_isv6);
	if ((lnr->lnr_state_create != ND_REACHABLE) &&
	    (lnr->lnr_state_create != ND_STALE))
		return (EINVAL);

	sin6 = (sin6_t *)&lnr->lnr_addr;
	addr = &sin6->sin6_addr;

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	ASSERT(!IS_UNDER_IPMP(ill));
	nce = nce_lookup_addr(ill, addr);
	if (nce != NULL)
		new_flags = nce->nce_common->ncec_flags;

	/* ON and OFF for the same attribute at once is rejected as EINVAL */
	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
	case NDF_ISROUTER_ON:
		new_flags |= NCE_F_ISROUTER;
		break;
	case NDF_ISROUTER_OFF:
		new_flags &= ~NCE_F_ISROUTER;
		break;
	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (nce != NULL)
			nce_refrele(nce);
		return (EINVAL);
	}
	if (inflags & NDF_STATIC)
		new_flags |= NCE_F_STATIC;

	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
	case NDF_ANYCAST_ON:
		new_flags |= NCE_F_ANYCAST;
		break;
	case NDF_ANYCAST_OFF:
		new_flags &= ~NCE_F_ANYCAST;
		break;
	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (nce != NULL)
			nce_refrele(nce);
		return (EINVAL);
	}

	if (nce == NULL) {
		err = nce_add_v6(ill,
		    (uchar_t *)lnr->lnr_hdw_addr,
		    ill->ill_phys_addr_length,
		    addr,
		    new_flags,
		    lnr->lnr_state_create,
		    &nce);
		if (err != 0) {
			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
			return (err);
		} else {
			do_postprocess = B_TRUE;
		}
	}
	ncec = nce->nce_common;
	old_flags = ncec->ncec_flags;
	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
		/*
		 * Demote router to host and return early.
		 * NOTE(review): err from nce_add_v6_postprocess() is
		 * discarded on this path; presumably for the same reason as
		 * the ASSERT(err == 0) below -- confirm.
		 */
		ncec_router_to_host(ncec);
		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
		if (do_postprocess)
			err = nce_add_v6_postprocess(nce);
		nce_refrele(nce);
		return (0);
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);

	if (do_postprocess)
		err = nce_add_v6_postprocess(nce);
	/*
	 * err cannot be anything other than 0 because we don't support
	 * proxy arp of static addresses.
	 */
	ASSERT(err == 0);

	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_flags = new_flags;
	mutex_exit(&ncec->ncec_lock);
	/*
	 * Note that we ignore the state at this point, which
	 * should be either STALE or REACHABLE.  Instead we let
	 * the link layer address passed in to determine the state
	 * much like incoming packets.
	 */
	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
	nce_refrele(nce);
	return (0);
}

/*
 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
 * be held to ensure that they are in the same group.
 */
static nce_t *
nce_fastpath_create(ill_t *ill, ncec_t *ncec)
{
	nce_t *nce;

	nce = nce_ill_lookup_then_add(ill, ncec);

	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
		return (nce);

	/*
	 * hold the ncec_lock to synchronize with nce_update() so that,
	 * at the end of this function, the contents of nce_dlur_mp are
	 * consistent with ncec->ncec_lladdr, even though some intermediate
	 * packet may have been sent out with a mangled address, which would
	 * only be a transient condition.
	 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_lladdr != NULL) {
		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
	} else {
		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
		    ill->ill_sap_length);
	}
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}

/*
 * we make nce_fp_mp to have an M_DATA prepend.
 * The caller ensures there is hold on ncec for this function.
 * Note that since ill_fastpath_probe() copies the mblk there is
 * no need to hold the nce or ncec beyond this function.
 *
 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
 * and will be returned back by this function, so that no extra nce_refrele
 * is required for the caller. The calls from nce_add_common() use this
 * method. All other callers (that pass in NULL ncec_nce) will have to do a
 * nce_refrele of the returned nce (when it is non-null).
 */
nce_t *
nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
	nce_t	*nce;
	ill_t	*ill = ncec->ncec_ill;

	ASSERT(ill != NULL);

	/* For IPMP, refresh the underlying ills instead of probing here */
	if (IS_IPMP(ill) && trigger_fp_req) {
		trigger_fp_req = B_FALSE;
		ipmp_ncec_refresh_nce(ncec);
	}

	/*
	 * If the caller already has the nce corresponding to the ill, use
	 * that one.  Otherwise we have to lookup/add the nce. Calls from
	 * nce_add_common() fall in the former category, and have just done
	 * the nce lookup/add that can be reused.
	 */
	if (ncec_nce == NULL)
		nce = nce_fastpath_create(ill, ncec);
	else
		nce = ncec_nce;

	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
		return (nce);

	if (trigger_fp_req)
		nce_fastpath_trigger(nce);
	return (nce);
}

/*
 * Trigger fastpath on nce. No locks may be held.
 */
static void
nce_fastpath_trigger(nce_t *nce)
{
	int		res;
	ill_t		*ill = nce->nce_ill;
	ncec_t		*ncec = nce->nce_common;

	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
	/*
	 * EAGAIN is an indication of a transient error
	 * i.e. allocation failure etc. leave the ncec in the list it
	 * will be updated when another probe happens for another ire
	 * if not it will be taken out of the list when the ire is
	 * deleted.
	 */
	if (res != 0 && res != EAGAIN && res != ENOTSUP)
		nce_fastpath_list_delete(ill, ncec, NULL);
}

/*
 * Add ncec to the nce fastpath list on ill.
 */
static nce_t *
nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
{
	nce_t *nce = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	/*
	 * Atomically ensure that the ill is not CONDEMNED and is not going
	 * down, before adding the NCE.
	 */
	if (ill->ill_state_flags & ILL_CONDEMNED)
		return (NULL);
	mutex_enter(&ncec->ncec_lock);
	/*
	 * if ncec has not been deleted and
	 * is not already in the list add it.
	 */
	if (!NCE_ISCONDEMNED(ncec)) {
		nce = nce_lookup(ill, &ncec->ncec_addr);
		if (nce != NULL)
			goto done;
		nce = nce_add(ill, ncec);
	}
done:
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}

/*
 * Locking wrapper: takes ill_lock around
 * nce_ill_lookup_then_add_locked().
 */
nce_t *
nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
{
	nce_t	*nce;

	mutex_enter(&ill->ill_lock);
	nce = nce_ill_lookup_then_add_locked(ill, ncec);
	mutex_exit(&ill->ill_lock);
	return (nce);
}


/*
 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
 * nce is added to the 'dead' list, and the caller must nce_refrele() the
 * entry after all locks have been dropped.
 */
void
nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
{
	nce_t *nce;

	ASSERT(ill != NULL);

	/* delete any nces referencing the ncec from underlying ills */
	if (IS_IPMP(ill))
		ipmp_ncec_delete_nce(ncec);

	/* now the ill itself */
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (nce->nce_common == ncec) {
			nce_refhold(nce);
			nce_delete(nce);
			break;
		}
	}
	mutex_exit(&ill->ill_lock);
	if (nce != NULL) {
		if (dead == NULL)
			nce_refrele(nce);
		else
			list_insert_tail(dead, nce);
	}
}

/*
 * when the fastpath response does not fit in the datab
 * associated with the existing nce_fp_mp, we delete and
 * add the nce to retrigger fastpath based on the information
 * in the ncec_t.
3240 */ 3241 static nce_t * 3242 nce_delete_then_add(nce_t *nce) 3243 { 3244 ill_t *ill = nce->nce_ill; 3245 nce_t *newnce = NULL; 3246 3247 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3248 (void *)nce, ill->ill_name)); 3249 mutex_enter(&ill->ill_lock); 3250 mutex_enter(&nce->nce_common->ncec_lock); 3251 nce_delete(nce); 3252 /* 3253 * Make sure that ncec is not condemned before adding. We hold the 3254 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3255 * ipmp_ncec_delete_nce() 3256 */ 3257 if (!NCE_ISCONDEMNED(nce->nce_common)) 3258 newnce = nce_add(ill, nce->nce_common); 3259 mutex_exit(&nce->nce_common->ncec_lock); 3260 mutex_exit(&ill->ill_lock); 3261 nce_refrele(nce); 3262 return (newnce); /* could be null if nomem */ 3263 } 3264 3265 typedef struct nce_fp_match_s { 3266 nce_t *nce_fp_match_res; 3267 mblk_t *nce_fp_match_ack_mp; 3268 } nce_fp_match_t; 3269 3270 /* ARGSUSED */ 3271 static int 3272 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3273 { 3274 nce_fp_match_t *nce_fp_marg = arg; 3275 ncec_t *ncec = nce->nce_common; 3276 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3277 uchar_t *mp_rptr, *ud_mp_rptr; 3278 mblk_t *ud_mp = nce->nce_dlur_mp; 3279 ptrdiff_t cmplen; 3280 3281 /* 3282 * mp is the mp associated with the fastpath ack. 3283 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3284 * under consideration. If the contents match, then the 3285 * fastpath ack is used to update the nce. 3286 */ 3287 if (ud_mp == NULL) 3288 return (0); 3289 mp_rptr = mp->b_rptr; 3290 cmplen = mp->b_wptr - mp_rptr; 3291 ASSERT(cmplen >= 0); 3292 3293 ud_mp_rptr = ud_mp->b_rptr; 3294 /* 3295 * The ncec is locked here to prevent any other threads from accessing 3296 * and changing nce_dlur_mp when the address becomes resolved to an 3297 * lla while we're in the middle of looking at and comparing the 3298 * hardware address (lla). 
It is also locked to prevent multiple 3299 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3300 * time. 3301 */ 3302 mutex_enter(&ncec->ncec_lock); 3303 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3304 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3305 nce_fp_marg->nce_fp_match_res = nce; 3306 mutex_exit(&ncec->ncec_lock); 3307 nce_refhold(nce); 3308 return (1); 3309 } 3310 mutex_exit(&ncec->ncec_lock); 3311 return (0); 3312 } 3313 3314 /* 3315 * Update all NCE's that are not in fastpath mode and 3316 * have an nce_fp_mp that matches mp. mp->b_cont contains 3317 * the fastpath header. 3318 * 3319 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3320 */ 3321 void 3322 nce_fastpath_update(ill_t *ill, mblk_t *mp) 3323 { 3324 nce_fp_match_t nce_fp_marg; 3325 nce_t *nce; 3326 mblk_t *nce_fp_mp, *fp_mp; 3327 3328 nce_fp_marg.nce_fp_match_res = NULL; 3329 nce_fp_marg.nce_fp_match_ack_mp = mp; 3330 3331 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); 3332 3333 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) 3334 return; 3335 3336 mutex_enter(&nce->nce_lock); 3337 nce_fp_mp = nce->nce_fp_mp; 3338 3339 if (nce_fp_mp != NULL) { 3340 fp_mp = mp->b_cont; 3341 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > 3342 nce_fp_mp->b_datap->db_lim) { 3343 mutex_exit(&nce->nce_lock); 3344 nce = nce_delete_then_add(nce); 3345 if (nce == NULL) { 3346 return; 3347 } 3348 mutex_enter(&nce->nce_lock); 3349 nce_fp_mp = nce->nce_fp_mp; 3350 } 3351 } 3352 3353 /* Matched - install mp as the fastpath mp */ 3354 if (nce_fp_mp == NULL) { 3355 fp_mp = dupb(mp->b_cont); 3356 nce->nce_fp_mp = fp_mp; 3357 } else { 3358 fp_mp = mp->b_cont; 3359 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); 3360 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr 3361 + MBLKL(fp_mp); 3362 } 3363 mutex_exit(&nce->nce_lock); 3364 nce_refrele(nce); 3365 } 3366 3367 /* 3368 * Return a pointer to a given option in the packet. 
 * Assumes that the option part of the packet has already been validated
 * (see ndp_verify_optlen()), so nd_opt_len values are trusted to be > 0.
 */
nd_opt_hdr_t *
ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
{
	while (optlen > 0) {
		if (opt->nd_opt_type == opt_type)
			return (opt);
		/* nd_opt_len is in units of 8 octets */
		optlen -= 8 * opt->nd_opt_len;
		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
	}
	return (NULL);
}

/*
 * Verify all option lengths present are > 0, also check to see
 * if the option lengths and packet length are consistent.
 */
boolean_t
ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
{
	ASSERT(opt != NULL);
	while (optlen > 0) {
		/* a zero-length option would make the walk loop forever */
		if (opt->nd_opt_len == 0)
			return (B_FALSE);
		optlen -= 8 * opt->nd_opt_len;
		/* option ran past the end of the packet */
		if (optlen < 0)
			return (B_FALSE);
		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
	}
	return (B_TRUE);
}

/*
 * ncec_walk function.
 * Free a fraction of the NCE cache entries.
 *
 * A possible optimization here would be to use ncec_last where possible, and
 * delete the least-frequently used entry, which would require more complex
 * computation as we walk through the ncec's (e.g., track ncec entries by
 * order of ncec_last and/or maintain state)
 */
static void
ncec_cache_reclaim(ncec_t *ncec, void *arg)
{
	ip_stack_t	*ipst = ncec->ncec_ipst;
	uint_t		fraction = *(uint_t *)arg;
	uint_t		rand;

	/* never reclaim local, static, or broadcast entries */
	if ((ncec->ncec_flags &
	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
		return;
	}

	/* pseudo-randomly delete roughly one in `fraction' entries */
	rand = (uint_t)ddi_get_lbolt() +
	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
	if ((rand/fraction)*fraction == rand) {
		IP_STAT(ipst, ip_nce_reclaim_deleted);
		ncec_delete(ncec);
	}
}

/*
 * kmem_cache callback to free up memory.
 *
 * For now we just delete a fixed fraction.
 */
static void
ip_nce_reclaim_stack(ip_stack_t *ipst)
{
	/* ips_ip_nce_reclaim_fraction: delete ~1/fraction of the cache */
	uint_t fraction = ipst->ips_ip_nce_reclaim_fraction;

	IP_STAT(ipst, ip_nce_reclaim_calls);

	ncec_walk(NULL, ncec_cache_reclaim, &fraction, ipst);

	/*
	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_nce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	/* reclaim from every IP stack instance on the system */
	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		ip_nce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

#ifdef DEBUG
/* Record a refcount trace entry; disable tracing for ncec on failure. */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;

	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
		ncec->ncec_trace_disable = B_TRUE;
		ncec_trace_cleanup(ncec);
	}
}

/* Remove a refcount trace entry (no-op once tracing is disabled). */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (!ncec->ncec_trace_disable)
		th_trace_unref(ncec);
}

/* Discard all accumulated trace state for ncec. */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif

/*
 * Called when address resolution fails due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 */
void
arp_resolv_failed(ncec_t *ncec)
{
	mblk_t		*mp, *nxt_mp;
	char		buf[INET6_ADDRSTRLEN];
	struct in_addr	ipv4addr;
	ill_t		*ill = ncec->ncec_ill;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	ip_recv_attr_t	iras;

	bzero(&iras, sizeof (iras));
	iras.ira_flags = IRAF_IS_IPV4;
	/*
	 * we are setting the ira_rill to the ipmp_ill (instead of
	 * the actual ill on which the packet was received), but this
	 * is ok because we don't actually need the real ira_rill
	 * to send the icmp unreachable to the sender.
	 */
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
	ip3dbg(("arp_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
	/* detach the queued-packet chain under the lock, then drain unlocked */
	mutex_enter(&ncec->ncec_lock);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);
	while (mp != NULL) {
		nxt_mp = mp->b_next;
		mp->b_next = NULL;

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
		    mp, ill);
		if (ipst->ips_ip_arp_icmp_error) {
			ip3dbg(("arp_resolv_failed: "
			    "Calling icmp_unreachable\n"));
			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
		} else {
			freemsg(mp);
		}
		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* finish off waiting callbacks */
}

/*
 * if ill is an under_ill, translate it to the ipmp_ill and add the
 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
 * one on the underlying in_ill) will be created for the
 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3571 */ 3572 int 3573 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3574 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3575 { 3576 int err; 3577 in6_addr_t addr6; 3578 ip_stack_t *ipst = ill->ill_ipst; 3579 nce_t *nce, *upper_nce = NULL; 3580 ill_t *in_ill = ill, *under = NULL; 3581 boolean_t need_ill_refrele = B_FALSE; 3582 3583 if (flags & NCE_F_MCAST) { 3584 /* 3585 * hw_addr will be figured out in nce_set_multicast_v4; 3586 * caller needs to pass in the cast_ill for ipmp 3587 */ 3588 ASSERT(hw_addr == NULL); 3589 ASSERT(!IS_IPMP(ill)); 3590 err = nce_set_multicast_v4(ill, addr, flags, newnce); 3591 return (err); 3592 } 3593 3594 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 3595 ill = ipmp_ill_hold_ipmp_ill(ill); 3596 if (ill == NULL) 3597 return (ENXIO); 3598 need_ill_refrele = B_TRUE; 3599 } 3600 if ((flags & NCE_F_BCAST) != 0) { 3601 /* 3602 * IPv4 broadcast ncec: compute the hwaddr. 3603 */ 3604 if (IS_IPMP(ill)) { 3605 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE); 3606 if (under == NULL) { 3607 if (need_ill_refrele) 3608 ill_refrele(ill); 3609 return (ENETDOWN); 3610 } 3611 hw_addr = under->ill_bcast_mp->b_rptr + 3612 NCE_LL_ADDR_OFFSET(under); 3613 hw_addr_len = under->ill_phys_addr_length; 3614 } else { 3615 hw_addr = ill->ill_bcast_mp->b_rptr + 3616 NCE_LL_ADDR_OFFSET(ill), 3617 hw_addr_len = ill->ill_phys_addr_length; 3618 } 3619 } 3620 3621 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3622 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3623 nce = nce_lookup_addr(ill, &addr6); 3624 if (nce == NULL) { 3625 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, 3626 state, &nce); 3627 } else { 3628 err = EEXIST; 3629 } 3630 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3631 if (err == 0) 3632 err = nce_add_v4_postprocess(nce); 3633 3634 if (in_ill != ill && nce != NULL) { 3635 nce_t *under_nce = NULL; 3636 3637 /* 3638 * in_ill was the under_ill. Try to create the under_nce. 
3639 * Hold the ill_g_lock to prevent changes to group membership 3640 * until we are done. 3641 */ 3642 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3643 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 3644 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 3645 ill_t *, ill); 3646 rw_exit(&ipst->ips_ill_g_lock); 3647 err = ENXIO; 3648 nce_refrele(nce); 3649 nce = NULL; 3650 goto bail; 3651 } 3652 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 3653 if (under_nce == NULL) { 3654 rw_exit(&ipst->ips_ill_g_lock); 3655 err = EINVAL; 3656 nce_refrele(nce); 3657 nce = NULL; 3658 goto bail; 3659 } 3660 rw_exit(&ipst->ips_ill_g_lock); 3661 upper_nce = nce; 3662 nce = under_nce; /* will be returned to caller */ 3663 if (NCE_ISREACHABLE(nce->nce_common)) 3664 nce_fastpath_trigger(under_nce); 3665 } 3666 if (nce != NULL) { 3667 if (newnce != NULL) 3668 *newnce = nce; 3669 else 3670 nce_refrele(nce); 3671 } 3672 bail: 3673 if (under != NULL) 3674 ill_refrele(under); 3675 if (upper_nce != NULL) 3676 nce_refrele(upper_nce); 3677 if (need_ill_refrele) 3678 ill_refrele(ill); 3679 3680 return (err); 3681 } 3682 3683 /* 3684 * NDP Cache Entry creation routine for IPv4. 3685 * This routine must always be called with ndp4->ndp_g_lock held. 3686 * Prior to return, ncec_refcnt is incremented. 3687 * 3688 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 3689 * are always added pointing at the ipmp_ill. Thus, when the ill passed 3690 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 3691 * entries will be created, both pointing at the same ncec_t. The nce_t 3692 * entries will have their nce_ill set to the ipmp_ill and the under_ill 3693 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 3694 * Local addresses are always created on the ill passed to nce_add_v4. 
3695 */ 3696 int 3697 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3698 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3699 { 3700 int err; 3701 boolean_t is_multicast = (flags & NCE_F_MCAST); 3702 struct in6_addr addr6; 3703 nce_t *nce; 3704 3705 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 3706 ASSERT(!ill->ill_isv6); 3707 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); 3708 3709 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3710 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, 3711 &nce); 3712 ASSERT(newnce != NULL); 3713 *newnce = nce; 3714 return (err); 3715 } 3716 3717 /* 3718 * Post-processing routine to be executed after nce_add_v4(). This function 3719 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 3720 * and must be called without any locks held. 3721 * 3722 * Always returns 0, but we return an int to keep this symmetric with the 3723 * IPv6 counter-part. 3724 */ 3725 int 3726 nce_add_v4_postprocess(nce_t *nce) 3727 { 3728 ncec_t *ncec = nce->nce_common; 3729 uint16_t flags = ncec->ncec_flags; 3730 boolean_t ndp_need_dad = B_FALSE; 3731 boolean_t dropped; 3732 clock_t delay; 3733 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; 3734 uchar_t *hw_addr = ncec->ncec_lladdr; 3735 boolean_t trigger_fastpath = B_TRUE; 3736 3737 /* 3738 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 3739 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
3740 * We call nce_fastpath from nce_update if the link layer address of 3741 * the peer changes from nce_update 3742 */ 3743 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && 3744 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) 3745 trigger_fastpath = B_FALSE; 3746 3747 if (trigger_fastpath) 3748 nce_fastpath_trigger(nce); 3749 3750 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 3751 /* 3752 * Either the caller (by passing in ND_PROBE) 3753 * or nce_add_common() (by the internally computed state 3754 * based on ncec_addr and ill_net_type) has determined 3755 * that this unicast entry needs DAD. Trigger DAD. 3756 */ 3757 ndp_need_dad = B_TRUE; 3758 } else if (flags & NCE_F_UNSOL_ADV) { 3759 /* 3760 * We account for the transmit below by assigning one 3761 * less than the ndd variable. Subsequent decrements 3762 * are done in nce_timer. 3763 */ 3764 mutex_enter(&ncec->ncec_lock); 3765 ncec->ncec_unsolicit_count = 3766 ipst->ips_ip_arp_publish_count - 1; 3767 mutex_exit(&ncec->ncec_lock); 3768 dropped = arp_announce(ncec); 3769 mutex_enter(&ncec->ncec_lock); 3770 if (dropped) 3771 ncec->ncec_unsolicit_count++; 3772 else 3773 ncec->ncec_last_time_defended = ddi_get_lbolt(); 3774 if (ncec->ncec_unsolicit_count != 0) { 3775 nce_start_timer(ncec, 3776 ipst->ips_ip_arp_publish_interval); 3777 } 3778 mutex_exit(&ncec->ncec_lock); 3779 } 3780 3781 /* 3782 * If ncec_xmit_interval is 0, user has configured us to send the first 3783 * probe right away. Do so, and set up for the subsequent probes. 3784 */ 3785 if (ndp_need_dad) { 3786 mutex_enter(&ncec->ncec_lock); 3787 if (ncec->ncec_pcnt == 0) { 3788 /* 3789 * DAD probes and announce can be 3790 * administratively disabled by setting the 3791 * probe_count to zero. Restart the timer in 3792 * this case to mark the ipif as ready. 
3793 */ 3794 ncec->ncec_unsolicit_count = 0; 3795 mutex_exit(&ncec->ncec_lock); 3796 nce_restart_timer(ncec, 0); 3797 } else { 3798 mutex_exit(&ncec->ncec_lock); 3799 delay = ((ncec->ncec_flags & NCE_F_FAST) ? 3800 ipst->ips_arp_probe_delay : 3801 ipst->ips_arp_fastprobe_delay); 3802 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); 3803 } 3804 } 3805 return (0); 3806 } 3807 3808 /* 3809 * ncec_walk routine to update all entries that have a given destination or 3810 * gateway address and cached link layer (MAC) address. This is used when ARP 3811 * informs us that a network-to-link-layer mapping may have changed. 3812 */ 3813 void 3814 nce_update_hw_changed(ncec_t *ncec, void *arg) 3815 { 3816 nce_hw_map_t *hwm = arg; 3817 ipaddr_t ncec_addr; 3818 3819 if (ncec->ncec_state != ND_REACHABLE) 3820 return; 3821 3822 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 3823 if (ncec_addr != hwm->hwm_addr) 3824 return; 3825 3826 mutex_enter(&ncec->ncec_lock); 3827 if (hwm->hwm_flags != 0) 3828 ncec->ncec_flags = hwm->hwm_flags; 3829 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); 3830 mutex_exit(&ncec->ncec_lock); 3831 } 3832 3833 void 3834 ncec_refhold(ncec_t *ncec) 3835 { 3836 mutex_enter(&(ncec)->ncec_lock); 3837 (ncec)->ncec_refcnt++; 3838 ASSERT((ncec)->ncec_refcnt != 0); 3839 #ifdef DEBUG 3840 ncec_trace_ref(ncec); 3841 #endif 3842 mutex_exit(&(ncec)->ncec_lock); 3843 } 3844 3845 void 3846 ncec_refhold_notr(ncec_t *ncec) 3847 { 3848 mutex_enter(&(ncec)->ncec_lock); 3849 (ncec)->ncec_refcnt++; 3850 ASSERT((ncec)->ncec_refcnt != 0); 3851 mutex_exit(&(ncec)->ncec_lock); 3852 } 3853 3854 static void 3855 ncec_refhold_locked(ncec_t *ncec) 3856 { 3857 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); 3858 (ncec)->ncec_refcnt++; 3859 #ifdef DEBUG 3860 ncec_trace_ref(ncec); 3861 #endif 3862 } 3863 3864 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */ 3865 void 3866 ncec_refrele(ncec_t *ncec) 3867 { 3868 mutex_enter(&(ncec)->ncec_lock); 3869 #ifdef DEBUG 3870 
ncec_untrace_ref(ncec); 3871 #endif 3872 ASSERT((ncec)->ncec_refcnt != 0); 3873 if (--(ncec)->ncec_refcnt == 0) { 3874 ncec_inactive(ncec); 3875 } else { 3876 mutex_exit(&(ncec)->ncec_lock); 3877 } 3878 } 3879 3880 void 3881 ncec_refrele_notr(ncec_t *ncec) 3882 { 3883 mutex_enter(&(ncec)->ncec_lock); 3884 ASSERT((ncec)->ncec_refcnt != 0); 3885 if (--(ncec)->ncec_refcnt == 0) { 3886 ncec_inactive(ncec); 3887 } else { 3888 mutex_exit(&(ncec)->ncec_lock); 3889 } 3890 } 3891 3892 /* 3893 * Common to IPv4 and IPv6. 3894 */ 3895 void 3896 nce_restart_timer(ncec_t *ncec, uint_t ms) 3897 { 3898 timeout_id_t tid; 3899 3900 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock)); 3901 3902 /* First cancel any running timer */ 3903 mutex_enter(&ncec->ncec_lock); 3904 tid = ncec->ncec_timeout_id; 3905 ncec->ncec_timeout_id = 0; 3906 if (tid != 0) { 3907 mutex_exit(&ncec->ncec_lock); 3908 (void) untimeout(tid); 3909 mutex_enter(&ncec->ncec_lock); 3910 } 3911 3912 /* Restart timer */ 3913 nce_start_timer(ncec, ms); 3914 mutex_exit(&ncec->ncec_lock); 3915 } 3916 3917 static void 3918 nce_start_timer(ncec_t *ncec, uint_t ms) 3919 { 3920 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3921 /* 3922 * Don't start the timer if the ncec has been deleted, or if the timer 3923 * is already running 3924 */ 3925 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) { 3926 ncec->ncec_timeout_id = timeout(nce_timer, ncec, 3927 MSEC_TO_TICK(ms) == 0 ? 
1 : MSEC_TO_TICK(ms)); 3928 } 3929 } 3930 3931 int 3932 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 3933 uint16_t flags, nce_t **newnce) 3934 { 3935 uchar_t *hw_addr; 3936 int err = 0; 3937 ip_stack_t *ipst = ill->ill_ipst; 3938 in6_addr_t dst6; 3939 nce_t *nce; 3940 3941 ASSERT(!ill->ill_isv6); 3942 3943 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6); 3944 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3945 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) { 3946 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3947 goto done; 3948 } 3949 if (ill->ill_net_type == IRE_IF_RESOLVER) { 3950 /* 3951 * For IRE_IF_RESOLVER a hardware mapping can be 3952 * generated, for IRE_IF_NORESOLVER, resolution cookie 3953 * in the ill is copied in nce_add_v4(). 3954 */ 3955 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP); 3956 if (hw_addr == NULL) { 3957 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3958 return (ENOMEM); 3959 } 3960 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 3961 } else { 3962 /* 3963 * IRE_IF_NORESOLVER type simply copies the resolution 3964 * cookie passed in. So no hw_addr is needed. 3965 */ 3966 hw_addr = NULL; 3967 } 3968 ASSERT(flags & NCE_F_MCAST); 3969 ASSERT(flags & NCE_F_NONUD); 3970 /* nce_state will be computed by nce_add_common() */ 3971 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 3972 ND_UNCHANGED, &nce); 3973 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3974 if (err == 0) 3975 err = nce_add_v4_postprocess(nce); 3976 if (hw_addr != NULL) 3977 kmem_free(hw_addr, ill->ill_phys_addr_length); 3978 if (err != 0) { 3979 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); 3980 return (err); 3981 } 3982 done: 3983 if (newnce != NULL) 3984 *newnce = nce; 3985 else 3986 nce_refrele(nce); 3987 return (0); 3988 } 3989 3990 /* 3991 * This is used when scanning for "old" (least recently broadcast) NCEs. We 3992 * don't want to have to walk the list for every single one, so we gather up 3993 * batches at a time. 
 */
#define	NCE_RESCHED_LIST_LEN	8

typedef struct {
	ill_t	*ncert_ill;		/* ill whose nces are being scanned */
	uint_t	ncert_num;		/* number of entries gathered */
	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];	/* oldest entries */
} nce_resched_t;

/*
 * Pick the longest waiting NCEs for defense.
 */
/* ARGSUSED */
static int
ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
{
	nce_resched_t	*ncert = arg;
	ncec_t		**ncecs;
	ncec_t		**ncec_max;
	ncec_t		*ncec_temp;
	ncec_t		*ncec = nce->nce_common;

	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
	/*
	 * Only reachable entries that are ready for announcement are eligible.
	 */
	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
		return (0);
	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
		/* room left in the batch: hold the entry and append it */
		ncec_refhold(ncec);
		ncert->ncert_nces[ncert->ncert_num++] = ncec;
	} else {
		/*
		 * Batch is full: keep the oldest (smallest
		 * ncec_last_time_defended) entries by swapping the newest
		 * candidate out of the array and releasing its hold.
		 */
		ncecs = ncert->ncert_nces;
		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
		ncec_refhold(ncec);
		for (; ncecs < ncec_max; ncecs++) {
			ASSERT(ncec != NULL);
			if ((*ncecs)->ncec_last_time_defended >
			    ncec->ncec_last_time_defended) {
				ncec_temp = *ncecs;
				*ncecs = ncec;
				ncec = ncec_temp;
			}
		}
		ncec_refrele(ncec);
	}
	return (0);
}

/*
 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
 * doesn't happen very often (if at all), and thus it needn't be highly
 * optimized. (Note, though, that it's actually O(N) complexity, because the
 * outer loop is bounded by a constant rather than by the length of the list.)
 */
static void
nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
{
	ncec_t		*ncec;
	ip_stack_t	*ipst = ill->ill_ipst;
	uint_t		i, defend_rate;

	i = ill->ill_defend_count;
	ill->ill_defend_count = 0;
	if (ill->ill_isv6)
		defend_rate = ipst->ips_ndp_defend_rate;
	else
		defend_rate = ipst->ips_arp_defend_rate;
	/* If none could be sitting around, then don't reschedule */
	if (i < defend_rate) {
		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
		return;
	}
	ncert->ncert_ill = ill;
	while (ill->ill_defend_count < defend_rate) {
		nce_walk_common(ill, ncec_reschedule, ncert);
		for (i = 0; i < ncert->ncert_num; i++) {

			ncec = ncert->ncert_nces[i];
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_flags |= NCE_F_DELAYED;
			mutex_exit(&ncec->ncec_lock);
			/*
			 * we plan to schedule this ncec, so incr the
			 * defend_count in anticipation.
			 */
			if (++ill->ill_defend_count >= defend_rate)
				break;
		}
		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
			break;
	}
}

/*
 * Check if the current rate-limiting parameters permit the sending
 * of another address defense announcement for both IPv4 and IPv6.
 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
 * determines how many address defense announcements are permitted
 * in any `defend_period' interval.
 */
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
	clock_t		now = ddi_get_lbolt();
	ip_stack_t	*ipst = ill->ill_ipst;
	clock_t		start = ill->ill_defend_start;
	uint32_t	elapsed, defend_period, defend_rate;
	nce_resched_t	ncert;
	boolean_t	ret;
	int		i;

	if (ill->ill_isv6) {
		defend_period = ipst->ips_ndp_defend_period;
		defend_rate = ipst->ips_ndp_defend_rate;
	} else {
		defend_period = ipst->ips_arp_defend_period;
		defend_rate = ipst->ips_arp_defend_rate;
	}
	/* a rate of zero means defense announcements are disabled */
	if (defend_rate == 0)
		return (B_TRUE);
	bzero(&ncert, sizeof (ncert));
	mutex_enter(&ill->ill_lock);
	if (start > 0) {
		elapsed = now - start;
		if (elapsed > SEC_TO_TICK(defend_period)) {
			/* a new defend_period begins; reset the window */
			ill->ill_defend_start = now;
			/*
			 * nce_ill_reschedule will attempt to
			 * prevent starvation by reschduling the
			 * oldest entries, which are marked with
			 * the NCE_F_DELAYED flag.
			 */
			nce_ill_reschedule(ill, &ncert);
		}
	} else {
		ill->ill_defend_start = now;
	}
	ASSERT(ill->ill_defend_count <= defend_rate);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_flags & NCE_F_DELAYED) {
		/*
		 * This ncec was rescheduled as one of the really old
		 * entries needing on-going defense. The
		 * ill_defend_count was already incremented in
		 * nce_ill_reschedule. Go ahead and send the announce.
		 */
		ncec->ncec_flags &= ~NCE_F_DELAYED;
		mutex_exit(&ncec->ncec_lock);
		ret = B_FALSE;
		goto done;
	}
	mutex_exit(&ncec->ncec_lock);
	if (ill->ill_defend_count < defend_rate)
		ill->ill_defend_count++;
	if (ill->ill_defend_count == defend_rate) {
		/*
		 * we are no longer allowed to send unbidden defense
		 * messages. Wait for rescheduling.
		 */
		ret = B_TRUE;
	} else {
		ret = B_FALSE;
	}
done:
	mutex_exit(&ill->ill_lock);
	/*
	 * After all the locks have been dropped we can restart nce timer,
	 * and refrele the delayed ncecs
	 */
	for (i = 0; i < ncert.ncert_num; i++) {
		clock_t	xmit_interval;
		ncec_t	*tmp;

		tmp = ncert.ncert_nces[i];
		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
		    B_FALSE);
		nce_restart_timer(tmp, xmit_interval);
		ncec_refrele(tmp);
	}
	return (ret);
}

/* Send an unsolicited Neighbor Advertisement for our own address. */
boolean_t
ndp_announce(ncec_t *ncec)
{
	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
	    nce_advert_flags(ncec)));
}

/*
 * Pick a source address (and the ill it resides on) for an ARP/NS probe to
 * the neighbor described by ncec.  On success *src is filled in and the
 * returned ill is held; returns NULL when no usable source exists, or when
 * a candidate ipif is still undergoing DAD (transmission is postponed).
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t		*mp;
	in6_addr_t	src6;
	ipaddr_t	src4;
	ill_t		*ill = ncec->ncec_ill;
	ill_t		*src_ill = NULL;
	ipif_t		*ipif = NULL;
	boolean_t	is_myaddr = NCE_MYADDR(ncec);
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* DAD/announce for a local address: probe from that address */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t	*ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses. Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6] below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface. We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		if (ipif == NULL && IS_IPMP(ill)) {
			/* retry source selection on the active xmit ill */
			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}

/*
 * Handle a link-layer address update for `addr'.  If `ipif' is non-NULL
 * the update is applied to the matching nce on its ill; otherwise all
 * ncecs matching `addr' are updated via nce_update_hw_changed().
 */
void
ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
    uchar_t *hwaddr, int hwaddr_len, int flags)
{
	ill_t	*ill;
	ncec_t	*ncec;
	nce_t	*nce;
	uint16_t new_state;

	ill = (ipif ? ipif->ipif_ill : NULL);
	if (ill != NULL) {
		/*
		 * only one ncec is possible
		 */
		nce = nce_lookup_v4(ill, addr);
		if (nce != NULL) {
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			if (NCE_ISREACHABLE(ncec))
				new_state = ND_UNCHANGED;
			else
				new_state = ND_STALE;
			ncec->ncec_flags = flags;
			nce_update(ncec, new_state, hwaddr);
			mutex_exit(&ncec->ncec_lock);
			nce_refrele(nce);
			return;
		}
	} else {
		/*
		 * ill is wildcard; clean up all ncec's and ire's
		 * that match on addr.
		 */
		nce_hw_map_t hwm;

		hwm.hwm_addr = *addr;
		hwm.hwm_hwlen = hwaddr_len;
		hwm.hwm_hwaddr = hwaddr;
		hwm.hwm_flags = flags;

		ncec_walk_common(ipst->ips_ndp4, NULL,
		    nce_update_hw_changed, &hwm, B_TRUE);
	}
}

/*
 * Common function to add ncec entries.
 * we always add the ncec with ncec_ill == ill, and always create
 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
 * ncec is !reachable.
 *
 * When the caller passes in an nce_state of ND_UNCHANGED,
 * nce_add_common() will determine the state of the created nce based
 * on the ill_net_type and nce_flags used. Otherwise, the nce will
 * be created with state set to the passed in nce_state.
4384 */ 4385 static int 4386 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 4387 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) 4388 { 4389 static ncec_t nce_nil; 4390 uchar_t *template = NULL; 4391 int err; 4392 ncec_t *ncec; 4393 ncec_t **ncep; 4394 ip_stack_t *ipst = ill->ill_ipst; 4395 uint16_t state; 4396 boolean_t fastprobe = B_FALSE; 4397 struct ndp_g_s *ndp; 4398 nce_t *nce = NULL; 4399 mblk_t *dlur_mp = NULL; 4400 4401 if (ill->ill_isv6) 4402 ndp = ill->ill_ipst->ips_ndp6; 4403 else 4404 ndp = ill->ill_ipst->ips_ndp4; 4405 4406 *retnce = NULL; 4407 4408 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 4409 4410 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 4411 ip0dbg(("nce_add_common: no addr\n")); 4412 return (EINVAL); 4413 } 4414 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 4415 ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); 4416 return (EINVAL); 4417 } 4418 4419 if (ill->ill_isv6) { 4420 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 4421 } else { 4422 ipaddr_t v4addr; 4423 4424 IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 4425 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); 4426 } 4427 4428 /* 4429 * The caller has ensured that there is no nce on ill, but there could 4430 * still be an nce_common_t for the address, so that we find exisiting 4431 * ncec_t strucutures first, and atomically add a new nce_t if 4432 * one is found. The ndp_g_lock ensures that we don't cross threads 4433 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not 4434 * compare for matches across the illgrp because this function is 4435 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, 4436 * with the nce_lookup_then_add_v* passing in the ipmp_ill where 4437 * appropriate. 
4438 */ 4439 ncec = *ncep; 4440 for (; ncec != NULL; ncec = ncec->ncec_next) { 4441 if (ncec->ncec_ill == ill) { 4442 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 4443 /* 4444 * We should never find *retnce to be 4445 * MYADDR, since the caller may then 4446 * incorrectly restart a DAD timer that's 4447 * already running. However, if we are in 4448 * forwarding mode, and the interface is 4449 * moving in/out of groups, the data 4450 * path ire lookup (e.g., ire_revalidate_nce) 4451 * may have determined that some destination 4452 * is offlink while the control path is adding 4453 * that address as a local address. 4454 * Recover from this case by failing the 4455 * lookup 4456 */ 4457 if (NCE_MYADDR(ncec)) 4458 return (ENXIO); 4459 *retnce = nce_ill_lookup_then_add(ill, ncec); 4460 if (*retnce != NULL) 4461 break; 4462 } 4463 } 4464 } 4465 if (*retnce != NULL) /* caller must trigger fastpath on nce */ 4466 return (0); 4467 4468 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); 4469 if (ncec == NULL) 4470 return (ENOMEM); 4471 *ncec = nce_nil; 4472 ncec->ncec_ill = ill; 4473 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 4474 ncec->ncec_flags = flags; 4475 ncec->ncec_ipst = ipst; /* No netstack_hold */ 4476 4477 if (!ill->ill_isv6) { 4478 ipaddr_t addr4; 4479 4480 /* 4481 * DAD probe interval and probe count are set based on 4482 * fast/slow probe settings. If the underlying link doesn't 4483 * have reliably up/down notifications or if we're working 4484 * with IPv4 169.254.0.0/16 Link Local Address space, then 4485 * don't use the fast timers. Otherwise, use them. 
4486 */ 4487 ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 4488 IN6_V4MAPPED_TO_IPADDR(addr, addr4); 4489 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) { 4490 fastprobe = B_TRUE; 4491 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) && 4492 !IS_IPV4_LL_SPACE(&addr4)) { 4493 ill_t *hwaddr_ill; 4494 4495 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr, 4496 hw_addr_len); 4497 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link) 4498 fastprobe = B_TRUE; 4499 } 4500 if (fastprobe) { 4501 ncec->ncec_xmit_interval = 4502 ipst->ips_arp_fastprobe_interval; 4503 ncec->ncec_pcnt = 4504 ipst->ips_arp_fastprobe_count; 4505 ncec->ncec_flags |= NCE_F_FAST; 4506 } else { 4507 ncec->ncec_xmit_interval = 4508 ipst->ips_arp_probe_interval; 4509 ncec->ncec_pcnt = 4510 ipst->ips_arp_probe_count; 4511 } 4512 if (NCE_PUBLISH(ncec)) { 4513 ncec->ncec_unsolicit_count = 4514 ipst->ips_ip_arp_publish_count; 4515 } 4516 } else { 4517 /* 4518 * probe interval is constant: ILL_PROBE_INTERVAL 4519 * probe count is constant: ND_MAX_UNICAST_SOLICIT 4520 */ 4521 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 4522 if (NCE_PUBLISH(ncec)) { 4523 ncec->ncec_unsolicit_count = 4524 ipst->ips_ip_ndp_unsolicit_count; 4525 } 4526 } 4527 ncec->ncec_rcnt = ill->ill_xmit_count; 4528 ncec->ncec_addr = *addr; 4529 ncec->ncec_qd_mp = NULL; 4530 ncec->ncec_refcnt = 1; /* for ncec getting created */ 4531 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); 4532 ncec->ncec_trace_disable = B_FALSE; 4533 4534 /* 4535 * ncec_lladdr holds link layer address 4536 */ 4537 if (hw_addr_len > 0) { 4538 template = kmem_alloc(hw_addr_len, KM_NOSLEEP); 4539 if (template == NULL) { 4540 err = ENOMEM; 4541 goto err_ret; 4542 } 4543 ncec->ncec_lladdr = template; 4544 ncec->ncec_lladdr_length = hw_addr_len; 4545 bzero(ncec->ncec_lladdr, hw_addr_len); 4546 } 4547 if ((flags & NCE_F_BCAST) != 0) { 4548 state = ND_REACHABLE; 4549 ASSERT(hw_addr_len > 0); 4550 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 4551 state = ND_INITIAL; 
4552 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 4553 /* 4554 * NORESOLVER entries are always created in the REACHABLE 4555 * state. 4556 */ 4557 state = ND_REACHABLE; 4558 if (ill->ill_phys_addr_length == IP_ADDR_LEN && 4559 ill->ill_mactype != DL_IPV4 && 4560 ill->ill_mactype != DL_6TO4) { 4561 /* 4562 * We create a nce_res_mp with the IP nexthop address 4563 * as the destination address if the physical length 4564 * is exactly 4 bytes for point-to-multipoint links 4565 * that do their own resolution from IP to link-layer 4566 * address (e.g. IP over X.25). 4567 */ 4568 bcopy((uchar_t *)addr, 4569 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4570 } 4571 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && 4572 ill->ill_mactype != DL_IPV6) { 4573 /* 4574 * We create a nce_res_mp with the IP nexthop address 4575 * as the destination address if the physical legnth 4576 * is exactly 16 bytes for point-to-multipoint links 4577 * that do their own resolution from IP to link-layer 4578 * address. 4579 */ 4580 bcopy((uchar_t *)addr, 4581 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4582 } 4583 /* 4584 * Since NUD is not part of the base IPv4 protocol definition, 4585 * IPv4 neighbor entries on NORESOLVER interfaces will never 4586 * age, and are marked NCE_F_NONUD. 4587 */ 4588 if (!ill->ill_isv6) 4589 ncec->ncec_flags |= NCE_F_NONUD; 4590 } else if (ill->ill_net_type == IRE_LOOPBACK) { 4591 state = ND_REACHABLE; 4592 } 4593 4594 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { 4595 /* 4596 * We are adding an ncec with a deterministic hw_addr, 4597 * so the state can only be one of {REACHABLE, STALE, PROBE}. 4598 * 4599 * if we are adding a unicast ncec for the local address 4600 * it would be REACHABLE; we would be adding a ND_STALE entry 4601 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own 4602 * addresses are added in PROBE to trigger DAD. 
4603 */ 4604 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || 4605 ill->ill_net_type == IRE_IF_NORESOLVER) 4606 state = ND_REACHABLE; 4607 else if (!NCE_PUBLISH(ncec)) 4608 state = ND_STALE; 4609 else 4610 state = ND_PROBE; 4611 if (hw_addr != NULL) 4612 nce_set_ll(ncec, hw_addr); 4613 } 4614 /* caller overrides internally computed state */ 4615 if (nce_state != ND_UNCHANGED) 4616 state = nce_state; 4617 4618 if (state == ND_PROBE) 4619 ncec->ncec_flags |= NCE_F_UNVERIFIED; 4620 4621 ncec->ncec_state = state; 4622 4623 if (state == ND_REACHABLE) { 4624 ncec->ncec_last = ncec->ncec_init_time = 4625 TICK_TO_MSEC(ddi_get_lbolt64()); 4626 } else { 4627 ncec->ncec_last = 0; 4628 if (state == ND_INITIAL) 4629 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64()); 4630 } 4631 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), 4632 offsetof(ncec_cb_t, ncec_cb_node)); 4633 /* 4634 * have all the memory allocations out of the way before taking locks 4635 * and adding the nce. 4636 */ 4637 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4638 if (nce == NULL) { 4639 err = ENOMEM; 4640 goto err_ret; 4641 } 4642 if (ncec->ncec_lladdr != NULL || 4643 ill->ill_net_type == IRE_IF_NORESOLVER) { 4644 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4645 ill->ill_phys_addr_length, ill->ill_sap, 4646 ill->ill_sap_length); 4647 if (dlur_mp == NULL) { 4648 err = ENOMEM; 4649 goto err_ret; 4650 } 4651 } 4652 4653 /* 4654 * Atomically ensure that the ill is not CONDEMNED, before 4655 * adding the NCE. 
4656 */ 4657 mutex_enter(&ill->ill_lock); 4658 if (ill->ill_state_flags & ILL_CONDEMNED) { 4659 mutex_exit(&ill->ill_lock); 4660 err = EINVAL; 4661 goto err_ret; 4662 } 4663 if (!NCE_MYADDR(ncec) && 4664 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { 4665 mutex_exit(&ill->ill_lock); 4666 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); 4667 err = EINVAL; 4668 goto err_ret; 4669 } 4670 /* 4671 * Acquire the ncec_lock even before adding the ncec to the list 4672 * so that it cannot get deleted after the ncec is added, but 4673 * before we add the nce. 4674 */ 4675 mutex_enter(&ncec->ncec_lock); 4676 if ((ncec->ncec_next = *ncep) != NULL) 4677 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; 4678 *ncep = ncec; 4679 ncec->ncec_ptpn = ncep; 4680 4681 /* Bump up the number of ncec's referencing this ill */ 4682 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4683 (char *), "ncec", (void *), ncec); 4684 ill->ill_ncec_cnt++; 4685 /* 4686 * Since we hold the ncec_lock at this time, the ncec cannot be 4687 * condemned, and we can safely add the nce. 4688 */ 4689 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); 4690 mutex_exit(&ncec->ncec_lock); 4691 mutex_exit(&ill->ill_lock); 4692 4693 /* caller must trigger fastpath on *retnce */ 4694 return (0); 4695 4696 err_ret: 4697 if (ncec != NULL) 4698 kmem_cache_free(ncec_cache, ncec); 4699 if (nce != NULL) 4700 kmem_cache_free(nce_cache, nce); 4701 freemsg(dlur_mp); 4702 if (template != NULL) 4703 kmem_free(template, ill->ill_phys_addr_length); 4704 return (err); 4705 } 4706 4707 /* 4708 * take a ref on the nce 4709 */ 4710 void 4711 nce_refhold(nce_t *nce) 4712 { 4713 mutex_enter(&nce->nce_lock); 4714 nce->nce_refcnt++; 4715 ASSERT((nce)->nce_refcnt != 0); 4716 mutex_exit(&nce->nce_lock); 4717 } 4718 4719 /* 4720 * release a ref on the nce; In general, this 4721 * cannot be called with locks held because nce_inactive 4722 * may result in nce_inactive which will take the ill_lock, 4723 * do ipif_ill_refrele_tail etc. 
 * Thus the one exception
 * where this can be called with locks held is when the caller
 * is certain that the nce_refcnt is sufficient to prevent
 * the invocation of nce_inactive.
 */
void
nce_refrele(nce_t *nce)
{
	ASSERT((nce)->nce_refcnt != 0);
	mutex_enter(&nce->nce_lock);
	if (--nce->nce_refcnt == 0)
		nce_inactive(nce);	/* destroys the mutex */
	else
		mutex_exit(&nce->nce_lock);
}

/*
 * free the nce after all refs have gone away.
 */
static void
nce_inactive(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;

	ASSERT(nce->nce_refcnt == 0);

	/* drop the hold on the shared ncec and free any queued mblks */
	ncec_refrele_notr(nce->nce_common);
	nce->nce_common = NULL;
	freemsg(nce->nce_fp_mp);
	freemsg(nce->nce_dlur_mp);

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt--;
	nce->nce_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&nce->nce_lock);
	kmem_cache_free(nce_cache, nce);
}

/*
 * Add an nce to the ill_nce list.
 */
static nce_t *
nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
{
	bzero(nce, sizeof (*nce));
	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
	nce->nce_common = ncec;
	nce->nce_addr = ncec->ncec_addr;
	nce->nce_ill = ill;
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt++;

	nce->nce_refcnt = 1;	/* for the thread */
	ncec->ncec_refcnt++;	/* want ncec_refhold_locked_notr(ncec) */
	nce->nce_dlur_mp = dlur_mp;

	/* add nce to the ill's fastpath list. */
	nce->nce_refcnt++;	/* for the list */
	list_insert_head(&ill->ill_nce, nce);
	return (nce);
}

/*
 * Allocate an nce_t for ncec and link it onto ill's ill_nce list via
 * nce_add_impl().  Returns NULL on allocation failure.  Caller must hold
 * both the ill_lock and the ncec_lock (asserted below).
 */
static nce_t *
nce_add(ill_t *ill, ncec_t *ncec)
{
	nce_t	*nce;
	mblk_t	*dlur_mp = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL)
		return (NULL);
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
	}
	return (nce_add_impl(ill, ncec, nce, dlur_mp));
}

/*
 * remove the nce from the ill_fastpath list
 */
void
nce_delete(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	mutex_enter(&nce->nce_lock);
	if (nce->nce_is_condemned) {
		/*
		 * some other thread has removed this nce from the ill_nce list
		 */
		mutex_exit(&nce->nce_lock);
		return;
	}
	nce->nce_is_condemned = B_TRUE;
	mutex_exit(&nce->nce_lock);

	list_remove(&ill->ill_nce, nce);
	/*
	 * even though we are holding the ill_lock, it is ok to
	 * call nce_refrele here because we know that we should have
	 * at least 2 refs on the nce: one for the thread, and one
	 * for the list. The refrele below will release the one for
	 * the list.
	 */
	nce_refrele(nce);
}

/*
 * Return a held nce on ill whose address equals *addr, or NULL if there is
 * none.  Caller must hold the ill_lock; the hold guarantees the nce found
 * cannot be condemned concurrently (asserted below).
 */
nce_t *
nce_lookup(ill_t *ill, const in6_addr_t *addr)
{
	nce_t *nce = NULL;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
			break;
	}

	/*
	 * if we found the nce on the ill_nce list while holding
	 * the ill_lock, then it cannot be condemned yet.
	 */
	if (nce != NULL) {
		ASSERT(!nce->nce_is_condemned);
		nce_refhold(nce);
	}
	return (nce);
}

/*
 * Walk the ill_nce list on ill. The callback function func() cannot perform
 * any destructive actions.  A non-zero return from func() terminates the
 * walk early.
 */
static void
nce_walk_common(ill_t *ill, pfi_t func, void *arg)
{
	nce_t	*nce = NULL, *nce_next;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		/* pick up the next pointer first in case func() removes nce */
		nce_next = list_next(&ill->ill_nce, nce);
		if (func(ill, nce, arg) != 0)
			break;
		nce = nce_next;
	}
}

/*
 * Locked wrapper around nce_walk_common().
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}

/*
 * Delete nces on ill's ill_nce list.  When flushall is false, published
 * (NCE_PUBLISH) entries are preserved; when true, everything goes.
 */
void
nce_flush(ill_t *ill, boolean_t flushall)
{
	nce_t	*nce, *nce_next;
	list_t	dead;

	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		nce_next = list_next(&ill->ill_nce, nce);
		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
			nce = nce_next;
			continue;
		}
		/*
		 * nce_delete requires that the caller should either not
		 * be holding locks, or should hold a ref to ensure that
		 * we won't hit ncec_inactive. So take a ref and clean up
		 * after the list is flushed.
		 */
		nce_refhold(nce);
		nce_delete(nce);
		list_insert_tail(&dead, nce);
		nce = nce_next;
	}
	mutex_exit(&ill->ill_lock);
	/* drop the deferred refs now that the ill_lock has been released */
	while ((nce = list_head(&dead)) != NULL) {
		list_remove(&dead, nce);
		nce_refrele(nce);
	}
	ASSERT(list_is_empty(&dead));
	list_destroy(&dead);
}

/* Return an interval that is anywhere in the [1 .. intv] range */
static clock_t
nce_fuzz_interval(clock_t intv, boolean_t initial_time)
{
	clock_t rnd, frac;

	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
	/* Note that clock_t is signed; must chop off bits */
	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
	if (initial_time) {
		if (intv <= 0)
			intv = 1;
		else
			intv = (rnd % intv) + 1;
	} else {
		/* Compute 'frac' as 20% of the configured interval */
		if ((frac = intv / 5) <= 1)
			frac = 2;
		/* Set intv randomly in the range [intv-frac .. intv+frac] */
		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
			intv = 1;
	}
	return (intv);
}

/*
 * Address resolution has completed for an ncec on an IPMP meta-interface:
 * drain the packets queued on the ncec, choosing an underlying send ill
 * for each (from the packet's source ipif, or the IPMP xmit rotor for
 * forwarded packets) and transmitting via an under_nce created on it.
 * Packets for which no under ill/nce can be found are dropped and counted.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t *mp;
	uint_t pkt_len;
	iaflags_t ixaflags = IXAF_NO_TRACE;
	nce_t *under_nce;
	ill_t *ill = ncec->ncec_ill;
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t *src_ipif = NULL;
	ip_stack_t *ipst = ill->ill_ipst;
	ill_t *send_ill;
	uint_t nprobes;

	ASSERT(IS_IPMP(ill));

	/* detach the queued chain and probe count under the ncec_lock */
	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We identify the former case by looking for the
			 * ncec_nprobes: the first ncec_nprobes packets are
			 * probes;
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* complete callbacks */
}