1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #include <sys/stropts.h> 29 #include <sys/strsun.h> 30 #include <sys/sysmacros.h> 31 #include <sys/errno.h> 32 #include <sys/dlpi.h> 33 #include <sys/socket.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/debug.h> 38 #include <sys/vtrace.h> 39 #include <sys/kmem.h> 40 #include <sys/zone.h> 41 #include <sys/ethernet.h> 42 #include <sys/sdt.h> 43 #include <sys/mac.h> 44 45 #include <net/if.h> 46 #include <net/if_types.h> 47 #include <net/if_dl.h> 48 #include <net/route.h> 49 #include <netinet/in.h> 50 #include <netinet/ip6.h> 51 #include <netinet/icmp6.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/nd.h> 57 #include <inet/ip.h> 58 #include <inet/ip_impl.h> 59 #include <inet/ipclassifier.h> 60 #include <inet/ip_if.h> 61 #include <inet/ip_ire.h> 62 #include <inet/ip_rts.h> 63 #include <inet/ip6.h> 64 #include <inet/ip_ndp.h> 65 #include <inet/sctp_ip.h> 66 
#include <inet/ip_arp.h>
#include <inet/ip2mac_impl.h>

/*
 * Interval between unsolicited announcements of a local address:
 * unsolicited NA transmissions for IPv6, gratuitous ARP for IPv4.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	(isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/* Interval between address-defense transmissions (v6 NDP vs. v4 ARP). */
#define	DEFENSE_INTERVAL(isv6) \
	(isv6 ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 * (169.254/16, per RFC 3927.)
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)ptr)[0] == 169 && \
	((uchar_t *)ptr)[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK \
	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)

/*
 * Lock ordering:
 *
 *	ndp_g_lock -> ill_lock -> ncec_lock
 *
 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 * ncec_refcnt).
 */

/* Forward declarations for the static helpers defined later in this file. */
static	void	nce_cleanup_list(ncec_t *ncec);
static	void	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
    ncec_t *);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
    uint16_t ncec_flags, nce_t **newnce);
static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t ncec_flags, nce_t **newnce);
static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
    uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static	void	ncec_refhold_locked(ncec_t *);
static	boolean_t	ill_defend_rate_limit(ill_t *, ncec_t *);
static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static	nce_t	*nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
static	nce_t	*nce_add(ill_t *, ncec_t *);
static	void	nce_inactive(nce_t *);
extern	nce_t	*nce_lookup(ill_t *, const in6_addr_t *);
static	nce_t	*nce_ill_lookup_then_add(ill_t *, ncec_t *);
static	int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static	int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
    uint16_t, uint16_t, nce_t **);
static	int	nce_add_v6_postprocess(nce_t *);
static	int	nce_add_v4_postprocess(nce_t *);
static	ill_t	*nce_resolve_src(ncec_t *, in6_addr_t *);
static	clock_t	nce_fuzz_interval(clock_t, boolean_t);
static	void	nce_resolv_ipmp_ok(ncec_t *);
static	void	nce_walk_common(ill_t *, pfi_t, void *);
static	void	nce_start_timer(ncec_t *, uint_t);
static	nce_t	*nce_fastpath_create(ill_t *, ncec_t *);
static	void	nce_fastpath_trigger(nce_t *);
static	nce_t	*nce_fastpath(ncec_t *, boolean_t, nce_t *);

#ifdef DEBUG
static void	ncec_trace_cleanup(const ncec_t *);
#endif

/* Hash-bucket selectors for the v4 and v6 NCE hash tables. */
#define	NCE_HASH_PTR_V4(ipst, addr) \
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, \
	NCE_TABLE_SIZE)]))

#define	NCE_HASH_PTR_V6(ipst, addr) \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
	NCE_TABLE_SIZE)]))

/* Caches defined elsewhere in ip; used for ncec_t/nce_t allocation. */
extern kmem_cache_t *ncec_cache;
extern kmem_cache_t *nce_cache;

/*
 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
 * If src_ill is not null, the ncec_addr is bound to src_ill. The
 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
 * the probe is sent on the ncec_ill (in the non-IPMP case) or the
 * IPMP cast_ill (in the IPMP case).
 *
 * Note that the probe interval is based on the src_ill for IPv6, and
 * the ncec_xmit_interval for IPv4.
 */
static void
nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
{
	boolean_t dropped;
	uint32_t probe_interval;

	/* DAD probes are only sent for unicast entries. */
	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
	if (ncec->ncec_ipversion == IPV6_VERSION) {
		/* v6: NS probe with the unspecified source address */
		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
		probe_interval = ILL_PROBE_INTERVAL(src_ill);
	} else {
		/* IPv4 DAD delay the initial probe. */
		if (send_probe)
			dropped = arp_probe(ncec);
		else
			dropped = B_TRUE;
		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
		    !send_probe);
	}
	if (!dropped) {
		/* Probe actually went out; charge it against the count. */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_pcnt--;
		mutex_exit(&ncec->ncec_lock);
	}
	/* Re-arm the timer whether or not the probe was dropped. */
	nce_restart_timer(ncec, probe_interval);
}

/*
 * Compute default flags to use for an advertisement of this ncec's address.
 */
static int
nce_advert_flags(const ncec_t *ncec)
{
	int flag = 0;

	if (ncec->ncec_flags & NCE_F_ISROUTER)
		flag |= NDP_ISROUTER;
	/* Anycast addresses must not set the Override bit (RFC 4861). */
	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
		flag |= NDP_ORIDE;

	return (flag);
}

/*
 * NDP Cache Entry creation routine.
 * This routine must always be called with ndp6->ndp_g_lock held.
 * On success, *newnce holds the new nce_t (caller owns the reference).
 */
int
nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err;
	nce_t		*nce;

	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
	ASSERT(ill != NULL && ill->ill_isv6);

	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
	    &nce);
	if (err != 0)
		return (err);
	ASSERT(newnce != NULL);
	*newnce = nce;
	return (err);
}

/*
 * Post-processing routine to be executed after nce_add_v6(). This function
 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 * and must be called without any locks held.
 * Returns EINPROGRESS when DAD has been started on a published address.
 */
int
nce_add_v6_postprocess(nce_t *nce)
{
	ncec_t		*ncec = nce->nce_common;
	boolean_t	dropped = B_FALSE;
	uchar_t		*hw_addr = ncec->ncec_lladdr;
	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
	ill_t		*ill = ncec->ncec_ill;
	int		err = 0;
	uint16_t	flags = ncec->ncec_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	trigger_fastpath = B_TRUE;

	/*
	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
	 * We call nce_fastpath from nce_update if the link layer address of
	 * the peer changes from nce_update
	 */
	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
		trigger_fastpath = B_FALSE;

	if (trigger_fastpath)
		nce_fastpath_trigger(nce);
	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
		ill_t *hwaddr_ill;
		/*
		 * Unicast entry that needs DAD.
		 */
		if (IS_IPMP(ill)) {
			/* Probe must go out the ill owning this hw address */
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    hw_addr, hw_addr_len);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		err = EINPROGRESS;
	} else if (flags & NCE_F_UNSOL_ADV) {
		/*
		 * We account for the transmit below by assigning one
		 * less than the ndd variable. Subsequent decrements
		 * are done in nce_timer.
		 */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_unsolicit_count =
		    ipst->ips_ip_ndp_unsolicit_count - 1;
		mutex_exit(&ncec->ncec_lock);
		dropped = ndp_xmit(ill,
		    ND_NEIGHBOR_ADVERT,
		    hw_addr,
		    hw_addr_len,
		    &ncec->ncec_addr,	/* Source and target of the adv */
		    &ipv6_all_hosts_mcast, /* Destination of the packet */
		    nce_advert_flags(ncec));
		mutex_enter(&ncec->ncec_lock);
		if (dropped)
			ncec->ncec_unsolicit_count++;
		else
			ncec->ncec_last_time_defended = ddi_get_lbolt();
		if (ncec->ncec_unsolicit_count != 0) {
			nce_start_timer(ncec,
			    ipst->ips_ip_ndp_unsolicit_interval);
		}
		mutex_exit(&ncec->ncec_lock);
	}
	return (err);
}

/*
 * Atomically lookup and add (if needed) Neighbor Cache information for
 * an address.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t.
 * The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * Local addresses are always created on the ill passed to nce_add_v6.
 */
int
nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	nce_t		*nce, *upper_nce = NULL;
	ill_t		*in_ill = ill;
	boolean_t	need_ill_refrele = B_FALSE;

	if (flags & NCE_F_MCAST) {
		/*
		 * hw_addr will be figured out in nce_set_multicast_v6;
		 * caller has to select the cast_ill
		 */
		ASSERT(hw_addr == NULL);
		ASSERT(!IS_IPMP(ill));
		err = nce_set_multicast_v6(ill, addr, flags, newnce);
		return (err);
	}
	ASSERT(ill->ill_isv6);
	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
		/* Non-local addresses hang off the ipmp_ill (see above). */
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill == NULL)
			return (ENXIO);
		need_ill_refrele = B_TRUE;
	}

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr);
	if (nce == NULL) {
		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
		    &nce);
	} else {
		err = EEXIST;
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	if (err == 0)
		err = nce_add_v6_postprocess(nce);
	if (in_ill != ill && nce != NULL) {
		nce_t *under_nce = NULL;

		/*
		 * in_ill was the under_ill. Try to create the under_nce.
		 * Hold the ill_g_lock to prevent changes to group membership
		 * until we are done.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
			    ill_t *, ill);
			rw_exit(&ipst->ips_ill_g_lock);
			err = ENXIO;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
		if (under_nce == NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			err = EINVAL;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		upper_nce = nce;
		nce = under_nce;	/* will be returned to caller */
		if (NCE_ISREACHABLE(nce->nce_common))
			nce_fastpath_trigger(under_nce);
	}
	/* nce_refrele is deferred until the lock is dropped */
	if (nce != NULL) {
		if (newnce != NULL)
			*newnce = nce;	/* caller now owns the reference */
		else
			nce_refrele(nce);
	}
bail:
	if (upper_nce != NULL)
		nce_refrele(upper_nce);
	if (need_ill_refrele)
		ill_refrele(ill);
	return (err);
}

/*
 * Remove all the CONDEMNED nces from the appropriate hash table.
 * We create a private list of NCEs, these may have ires pointing
 * to them, so the list will be passed through to clean up dependent
 * ires and only then we can do ncec_refrele() which can make NCE inactive.
 */
static void
nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
{
	ncec_t *ncec1;
	ncec_t **ptpn;

	/* Only the last walker, holding the global lock, may unlink. */
	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
	ASSERT(ndp->ndp_g_walker == 0);
	for (; ncec; ncec = ncec1) {
		ncec1 = ncec->ncec_next;
		mutex_enter(&ncec->ncec_lock);
		if (NCE_ISCONDEMNED(ncec)) {
			/* Unlink from the hash chain ... */
			ptpn = ncec->ncec_ptpn;
			ncec1 = ncec->ncec_next;
			if (ncec1 != NULL)
				ncec1->ncec_ptpn = ptpn;
			*ptpn = ncec1;
			ncec->ncec_ptpn = NULL;
			ncec->ncec_next = NULL;
			/* ... and push onto the caller's private free list. */
			ncec->ncec_next = *free_nce_list;
			*free_nce_list = ncec;
		}
		mutex_exit(&ncec->ncec_lock);
	}
}

/*
 * 1. Mark the entry CONDEMNED.
 *    This ensures that no new nce_lookup()
 *    will return this NCE.  Also no new timeouts will
 *    be started (See nce_restart_timer).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return.  The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 */
void
ncec_delete(ncec_t *ncec)
{
	ncec_t	**ptpn;
	ncec_t	*ncec1;
	int	ipversion = ncec->ncec_ipversion;
	ndp_g_t	*ndp;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&ncec->ncec_lock);
	if (NCE_ISCONDEMNED(ncec)) {
		/* Some other thread is doing the delete */
		mutex_exit(&ncec->ncec_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	ncec->ncec_flags |= NCE_F_CONDEMNED;
	mutex_exit(&ncec->ncec_lock);

	/* Count how many condemned ires for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_nce_condemned, 1);
	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

	/* Complete any waiting callbacks */
	ncec_cb_dispatch(ncec);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (ncec->ncec_timeout_id != 0) {
		(void) untimeout(ncec->ncec_timeout_id);
		ncec->ncec_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (ncec->ncec_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this ncec from
		 * the list after we marked the ncec CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Now remove the ncec from the list. nce_restart_timer won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = ncec->ncec_ptpn;
	ncec1 = ncec->ncec_next;
	if (ncec1 != NULL)
		ncec1->ncec_ptpn = ptpn;
	*ptpn = ncec1;
	ncec->ncec_ptpn = NULL;
	ncec->ncec_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/* Removed from ncec_ptpn/ncec_next list */
	ncec_refrele_notr(ncec);
}

/*
 * Final teardown of an ncec whose last reference has been dropped.
 * Called with ncec_lock held and ncec_refcnt == 0; frees queued
 * packets, the lladdr buffer, and the ncec itself.
 */
void
ncec_inactive(ncec_t *ncec)
{
	mblk_t		**mpp;
	ill_t		*ill = ncec->ncec_ill;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	ASSERT(ncec->ncec_refcnt == 0);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Count how many condemned nces for kmem_cache callback */
	if (NCE_ISCONDEMNED(ncec))
		atomic_add_32(&ipst->ips_num_nce_condemned, -1);

	/* Free all allocated messages */
	mpp = &ncec->ncec_qd_mp;
	while (*mpp != NULL) {
		mblk_t	*mp;

		mp = *mpp;
		*mpp = mp->b_next;

		inet_freemsg(mp);
	}
	/*
	 * must have been cleaned up in ncec_delete
	 */
	ASSERT(list_is_empty(&ncec->ncec_cb));
	list_destroy(&ncec->ncec_cb);
	/*
	 * free the ncec_lladdr if one was allocated in nce_add_common()
	 */
	if (ncec->ncec_lladdr_length > 0)
		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);

#ifdef DEBUG
	ncec_trace_cleanup(ncec);
#endif

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "ncec", (void *), ncec);
	ill->ill_ncec_cnt--;
	ncec->ncec_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&ncec->ncec_lock);
	kmem_cache_free(ncec_cache, ncec);
}

/*
 * ncec_walk routine.  Delete the ncec if it is associated with the ill
 * that is going away.  Always called as a writer.
 */
void
ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
{
	if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
		ncec_delete(ncec);
	}
}

/*
 * Neighbor Cache cleanup logic for a list of ncec_t entries.
 * Entries were unlinked by nce_remove(); here each one's deferred
 * teardown (fastpath removal, timer cancel, list refrele) is finished.
 */
static void
nce_cleanup_list(ncec_t *ncec)
{
	ncec_t *ncec_next;

	ASSERT(ncec != NULL);
	while (ncec != NULL) {
		ncec_next = ncec->ncec_next;
		ncec->ncec_next = NULL;

		/*
		 * It is possible for the last ndp walker (this thread)
		 * to come here after ncec_delete has marked the ncec CONDEMNED
		 * and before it has removed the ncec from the fastpath list
		 * or called untimeout. So we need to do it here. It is safe
		 * for both ncec_delete and this thread to do it twice or
		 * even simultaneously since each of the threads has a
		 * reference on the ncec.
		 */
		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
		/*
		 * Cancel any running timer. Timeout can't be restarted
		 * since CONDEMNED is set. The ncec_lock can't be
		 * held across untimeout though passing invalid timeout
		 * id is fine.
		 */
		if (ncec->ncec_timeout_id != 0) {
			(void) untimeout(ncec->ncec_timeout_id);
			ncec->ncec_timeout_id = 0;
		}
		/* Removed from ncec_ptpn/ncec_next list */
		ncec_refrele_notr(ncec);
		ncec = ncec_next;
	}
}

/*
 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
 */
boolean_t
nce_restart_dad(ncec_t *ncec)
{
	boolean_t	started;
	ill_t		*ill, *hwaddr_ill;

	if (ncec == NULL)
		return (B_FALSE);
	ill = ncec->ncec_ill;
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_PROBE) {
		/* DAD is already in progress; nothing to do. */
		mutex_exit(&ncec->ncec_lock);
		started = B_TRUE;
	} else if (ncec->ncec_state == ND_REACHABLE) {
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		/*
		 * Slight cheat here: we don't use the initial probe delay
		 * for IPv4 in this obscure case.
		 */
		mutex_exit(&ncec->ncec_lock);
		if (IS_IPMP(ill)) {
			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
		} else {
			hwaddr_ill = ill;
		}
		nce_dad(ncec, hwaddr_ill, B_TRUE);
		started = B_TRUE;
	} else {
		mutex_exit(&ncec->ncec_lock);
		started = B_FALSE;
	}
	return (started);
}

/*
 * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters
 * passed.  If one is found, the refcnt on the ncec will be incremented.
 */
ncec_t *
ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
{
	ncec_t		*ncec;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* ill_g_lock (reader) keeps illgrp membership stable for the match */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);

	/* Get head of v6 hash table */
	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
	ncec = ncec_lookup_illgrp(ill, addr, ncec);
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	rw_exit(&ipst->ips_ill_g_lock);
	return (ncec);
}

/*
 * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters
 * passed.  If one is found, the refcnt on the ncec will be incremented.
 * The v4 address is matched as a v4-mapped v6 address.
 */
ncec_t *
ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
{
	ncec_t		*ncec = NULL;
	in6_addr_t	addr6;
	ip_stack_t	*ipst = ill->ill_ipst;

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);

	/* Get head of v4 hash table */
	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	rw_exit(&ipst->ips_ill_g_lock);
	return (ncec);
}

/*
 * Cache entry lookup.  Try to find an ncec matching the parameters passed.
 * If an ncec is found, increment the hold count on that ncec.
 * The caller passes in the start of the appropriate hash table, and must
 * be holding the appropriate global lock (ndp_g_lock).  In addition, since
 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
 * must be held as reader.
 *
 * This function always matches across the ipmp group.
 */
ncec_t *
ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
{
	ndp_g_t		*ndp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * NOTE(review): ill is dereferenced above before the ASSERT below
	 * checks it for NULL; the ASSERT is only a debug-build backstop.
	 */
	if (ill->ill_isv6)
		ndp = ipst->ips_ndp6;
	else
		ndp = ipst->ips_ndp4;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
	if (IN6_IS_ADDR_UNSPECIFIED(addr))
		return (NULL);
	for (; ncec != NULL; ncec = ncec->ncec_next) {
		if (ncec->ncec_ill == ill ||
		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
				/* Skip entries already being deleted. */
				mutex_enter(&ncec->ncec_lock);
				if (!NCE_ISCONDEMNED(ncec)) {
					ncec_refhold_locked(ncec);
					mutex_exit(&ncec->ncec_lock);
					break;
				}
				mutex_exit(&ncec->ncec_lock);
			}
		}
	}
	return (ncec);
}

/*
 * Find an nce_t on ill with nce_addr == addr.  Lookup the nce_t
 * entries for ill only, i.e., when ill is part of an ipmp group,
 * nce_lookup_v4 will never try to match across the group.
 */
nce_t *
nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
{
	nce_t		*nce;
	in6_addr_t	addr6;
	ip_stack_t	*ipst = ill->ill_ipst;

	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	nce = nce_lookup_addr(ill, &addr6);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	return (nce);
}

/*
 * Find an nce_t on ill with nce_addr == addr.  Lookup the nce_t
 * entries for ill only, i.e., when ill is part of an ipmp group,
 * nce_lookup_v6 will never try to match across the group.
 */
nce_t *
nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
{
	nce_t		*nce;
	ip_stack_t	*ipst = ill->ill_ipst;

	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr6);
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	return (nce);
}

/*
 * Common helper for the per-ill lookups above: must be called with
 * the appropriate ndp_g_lock held; takes ill_lock around nce_lookup().
 */
static nce_t *
nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
{
	nce_t *nce;

	ASSERT(ill != NULL);
#ifdef DEBUG
	if (ill->ill_isv6)
		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
	else
		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
#endif
	mutex_enter(&ill->ill_lock);
	nce = nce_lookup(ill, addr);
	mutex_exit(&ill->ill_lock);
	return (nce);
}


/*
 * Router turned to host.  We need to make sure that cached copies of the ncec
 * are not used for forwarding packets if they were derived from the default
 * route, and that the default route itself is removed, as required by
 * section 7.2.5 of RFC 2461.
 *
 * Note that the ncec itself probably has valid link-layer information for the
 * nexthop, so that there is no reason to delete the ncec, as long as the
 * ISROUTER flag is turned off.
 */
static void
ncec_router_to_host(ncec_t *ncec)
{
	ire_t		*ire;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_flags &= ~NCE_F_ISROUTER;
	mutex_exit(&ncec->ncec_lock);

	/* Find and remove the default route through this (former) router. */
	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
	if (ire != NULL) {
		/* Notify routing sockets before deleting the route. */
		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
		ire_delete(ire);
		ire_refrele(ire);
	}
}

/*
 * Process passed in parameters either from an incoming packet or via
 * user ioctl.
 */
void
nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
	ill_t	*ill = ncec->ncec_ill;
	uint32_t hw_addr_len = ill->ill_phys_addr_length;
	boolean_t ll_updated = B_FALSE;
	boolean_t ll_changed;
	nce_t	*nce;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	/*
	 * No updates of link layer address or the neighbor state is
	 * allowed, when the cache is in NONUD state.  This still
	 * allows for responding to reachability solicitation.
	 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_state == ND_INCOMPLETE) {
		if (hw_addr == NULL) {
			/* Nothing to resolve with; leave INCOMPLETE. */
			mutex_exit(&ncec->ncec_lock);
			return;
		}
		nce_set_ll(ncec, hw_addr);
		/*
		 * Update ncec state and send the queued packets
		 * back to ip this time ire will be added.
		 */
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		nce = nce_fastpath(ncec, B_TRUE, NULL);
		nce_resolv_ok(ncec);
		if (nce != NULL)
			nce_refrele(nce);
		return;
	}
	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
	if (!is_adv) {
		/* If this is a SOLICITATION request only */
		if (ll_changed)
			nce_update(ncec, ND_STALE, hw_addr);
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	}
	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
		/* If in any other state than REACHABLE, ignore */
		if (ncec->ncec_state == ND_REACHABLE) {
			nce_update(ncec, ND_STALE, NULL);
		}
		mutex_exit(&ncec->ncec_lock);
		ncec_cb_dispatch(ncec);
		return;
	} else {
		if (ll_changed) {
			nce_update(ncec, ND_UNCHANGED, hw_addr);
			ll_updated = B_TRUE;
		}
		if (flag & ND_NA_FLAG_SOLICITED) {
			nce_update(ncec, ND_REACHABLE, NULL);
		} else {
			if (ll_updated) {
				nce_update(ncec, ND_STALE, NULL);
			}
		}
		mutex_exit(&ncec->ncec_lock);
		/* Advert without the Router flag: neighbor ceased routing. */
		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
		    NCE_F_ISROUTER)) {
			ncec_router_to_host(ncec);
		} else {
			ncec_cb_dispatch(ncec);
		}
	}
}

/*
 * Pass arg1 to the pfi supplied, along with each ncec in existence.
 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 * walking the hash list.
 */
void
ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
    boolean_t trace)
{
	ncec_t	*ncec;
	ncec_t	*ncec1;
	ncec_t	**ncep;
	ncec_t	*free_nce_list = NULL;

	mutex_enter(&ndp->ndp_g_lock);
	/* Prevent ncec_delete from unlink and free of NCE */
	ndp->ndp_g_walker++;
	mutex_exit(&ndp->ndp_g_lock);
	for (ncep = ndp->nce_hash_tbl;
	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
			ncec1 = ncec->ncec_next;
			if (ill == NULL || ncec->ncec_ill == ill) {
				if (trace) {
					ncec_refhold(ncec);
					(*pfi)(ncec, arg1);
					ncec_refrele(ncec);
				} else {
					ncec_refhold_notr(ncec);
					(*pfi)(ncec, arg1);
					ncec_refrele_notr(ncec);
				}
			}
		}
	}
	mutex_enter(&ndp->ndp_g_lock);
	ndp->ndp_g_walker--;
	/* Last walker out performs any cleanup deferred by ncec_delete. */
	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
		/* Time to delete condemned entries */
		for (ncep = ndp->nce_hash_tbl;
		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
			ncec = *ncep;
			if (ncec != NULL) {
				nce_remove(ndp, ncec, &free_nce_list);
			}
		}
		ndp->ndp_g_walker_cleanup = B_FALSE;
	}

	mutex_exit(&ndp->ndp_g_lock);

	if (free_nce_list != NULL) {
		nce_cleanup_list(free_nce_list);
	}
}

/*
 * Walk everything.
 * Note that ill can be NULL hence can't derive the ipst from it.
 */
void
ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
{
	ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
	ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
}

/*
 * For each interface an entry is added for the unspecified multicast group.
 * Here that mapping is used to form the multicast cache entry for a particular
 * multicast destination.
1008 */ 1009 static int 1010 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, 1011 uint16_t flags, nce_t **newnce) 1012 { 1013 uchar_t *hw_addr; 1014 int err = 0; 1015 ip_stack_t *ipst = ill->ill_ipst; 1016 nce_t *nce; 1017 1018 ASSERT(ill != NULL); 1019 ASSERT(ill->ill_isv6); 1020 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); 1021 1022 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1023 nce = nce_lookup_addr(ill, dst); 1024 if (nce != NULL) { 1025 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1026 goto done; 1027 } 1028 if (ill->ill_net_type == IRE_IF_RESOLVER) { 1029 /* 1030 * For IRE_IF_RESOLVER a hardware mapping can be 1031 * generated. 1032 */ 1033 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); 1034 if (hw_addr == NULL) { 1035 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1036 return (ENOMEM); 1037 } 1038 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 1039 } else { 1040 /* No hw_addr is needed for IRE_IF_NORESOLVER. */ 1041 hw_addr = NULL; 1042 } 1043 ASSERT((flags & NCE_F_MCAST) != 0); 1044 ASSERT((flags & NCE_F_NONUD) != 0); 1045 /* nce_state will be computed by nce_add_common() */ 1046 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 1047 ND_UNCHANGED, &nce); 1048 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1049 if (err == 0) 1050 err = nce_add_v6_postprocess(nce); 1051 if (hw_addr != NULL) 1052 kmem_free(hw_addr, ill->ill_nd_lla_len); 1053 if (err != 0) { 1054 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); 1055 return (err); 1056 } 1057 done: 1058 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); 1059 if (newnce != NULL) 1060 *newnce = nce; 1061 else 1062 nce_refrele(nce); 1063 return (0); 1064 } 1065 1066 /* 1067 * Return the link layer address, and any flags of a ncec. 
1068 */ 1069 int 1070 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1071 { 1072 ncec_t *ncec; 1073 in6_addr_t *addr; 1074 sin6_t *sin6; 1075 1076 ASSERT(ill != NULL && ill->ill_isv6); 1077 sin6 = (sin6_t *)&lnr->lnr_addr; 1078 addr = &sin6->sin6_addr; 1079 1080 /* 1081 * NOTE: if the ill is an IPMP interface, then match against the whole 1082 * illgrp. This e.g. allows in.ndpd to retrieve the link layer 1083 * addresses for the data addresses on an IPMP interface even though 1084 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. 1085 */ 1086 ncec = ncec_lookup_illgrp_v6(ill, addr); 1087 if (ncec == NULL) 1088 return (ESRCH); 1089 /* If no link layer address is available yet, return ESRCH */ 1090 if (!NCE_ISREACHABLE(ncec)) { 1091 ncec_refrele(ncec); 1092 return (ESRCH); 1093 } 1094 lnr->lnr_hdw_len = ill->ill_phys_addr_length; 1095 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, 1096 lnr->lnr_hdw_len); 1097 if (ncec->ncec_flags & NCE_F_ISROUTER) 1098 lnr->lnr_flags = NDF_ISROUTER_ON; 1099 if (ncec->ncec_flags & NCE_F_ANYCAST) 1100 lnr->lnr_flags |= NDF_ANYCAST_ON; 1101 ncec_refrele(ncec); 1102 return (0); 1103 } 1104 1105 /* 1106 * Finish setting up the Enable/Disable multicast for the driver. 
1107 */ 1108 mblk_t * 1109 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1110 uint32_t hw_addr_offset, mblk_t *mp) 1111 { 1112 uchar_t *hw_addr; 1113 ipaddr_t v4group; 1114 uchar_t *addr; 1115 1116 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1117 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1118 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1119 1120 ASSERT(CLASSD(v4group)); 1121 ASSERT(!(ill->ill_isv6)); 1122 1123 addr = (uchar_t *)&v4group; 1124 } else { 1125 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1126 ASSERT(ill->ill_isv6); 1127 1128 addr = (uchar_t *)v6group; 1129 } 1130 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1131 if (hw_addr == NULL) { 1132 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1133 freemsg(mp); 1134 return (NULL); 1135 } 1136 1137 ip_mcast_mapping(ill, addr, hw_addr); 1138 return (mp); 1139 } 1140 1141 void 1142 ip_ndp_resolve(ncec_t *ncec) 1143 { 1144 in_addr_t sender4 = INADDR_ANY; 1145 in6_addr_t sender6 = ipv6_all_zeros; 1146 ill_t *src_ill; 1147 uint32_t ms; 1148 1149 src_ill = nce_resolve_src(ncec, &sender6); 1150 if (src_ill == NULL) { 1151 /* Make sure we try again later */ 1152 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1153 nce_restart_timer(ncec, (clock_t)ms); 1154 return; 1155 } 1156 if (ncec->ncec_ipversion == IPV4_VERSION) 1157 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1158 mutex_enter(&ncec->ncec_lock); 1159 if (ncec->ncec_ipversion == IPV6_VERSION) 1160 ms = ndp_solicit(ncec, sender6, src_ill); 1161 else 1162 ms = arp_request(ncec, sender4, src_ill); 1163 mutex_exit(&ncec->ncec_lock); 1164 if (ms == 0) { 1165 if (ncec->ncec_state != ND_REACHABLE) { 1166 if (ncec->ncec_ipversion == IPV6_VERSION) 1167 ndp_resolv_failed(ncec); 1168 else 1169 arp_resolv_failed(ncec); 1170 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1171 nce_make_unreachable(ncec); 1172 ncec_delete(ncec); 1173 } 1174 } else { 1175 nce_restart_timer(ncec, (clock_t)ms); 1176 } 1177 done: 1178 ill_refrele(src_ill); 1179 } 1180 
/*
 * Send an IPv6 neighbor solicitation.
 * Returns number of milliseconds after which we should either rexmit or abort.
 * Return of zero means we should abort.
 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
 * The optional source address is used as a hint to ndp_solicit for
 * which source to use in the packet.
 *
 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
 * the packet.
 */
uint32_t
ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
{
	in6_addr_t	dst;
	boolean_t	dropped = B_FALSE;

	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	/* Retransmit budget exhausted: tell the caller to abort */
	if (ncec->ncec_rcnt == 0)
		return (0);

	/* Copy what we need before dropping ncec_lock for the send */
	dst = ncec->ncec_addr;
	ncec->ncec_rcnt--;
	mutex_exit(&ncec->ncec_lock);
	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
	    ill->ill_phys_addr_length, &src, &dst, 0);
	mutex_enter(&ncec->ncec_lock);
	/* The solicit never made it out; give the retry back */
	if (dropped)
		ncec->ncec_rcnt++;
	return (ncec->ncec_ill->ill_reachable_retrans_time);
}

/*
 * Attempt to recover an address on an interface that's been marked as a
 * duplicate.  Because NCEs are destroyed when the interface goes down, there's
 * no easy way to just probe the address and have the right thing happen if
 * it's no longer in use.  Instead, we just bring it up normally and allow the
 * regular interface start-up logic to probe for a remaining duplicate and take
 * us back down if necessary.
 * Neither DHCP nor temporary addresses arrive here; they're excluded by
 * ip_ndp_excl.
 */
/* ARGSUSED */
void
ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	ipif_t	*ipif;
	/* mp carries either a v6 or a v4 address; view it both ways */
	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
	boolean_t addr_equal;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		/*
		 * We do not support recovery of proxy ARP'd interfaces,
		 * because the system lacks a complete proxy ARP mechanism.
		 */
		if (ill->ill_isv6) {
			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    addr6);
		} else {
			addr_equal = (ipif->ipif_lcl_addr == *addr4);
		}

		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
			continue;

		/*
		 * If we have already recovered or if the interface is going
		 * away, then ignore.
		 */
		mutex_enter(&ill->ill_lock);
		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}

		/* Clear the duplicate mark under ill_lock ... */
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ill->ill_ipif_dup_count--;
		mutex_exit(&ill->ill_lock);
		/* ... and remember it was a duplicate (quiets ip_ndp_excl) */
		ipif->ipif_was_dup = B_TRUE;

		/*
		 * Bring the address back up; the normal start-up DAD/probe
		 * logic will take it down again if the conflict persists.
		 */
		if (ill->ill_isv6) {
			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
			(void) ipif_up_done_v6(ipif);
		} else {
			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
			    EINPROGRESS);
			(void) ipif_up_done(ipif);
		}
	}
	freeb(mp);
}

/*
 * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
 * As long as someone else holds the address, the interface will stay down.
 * When that conflict goes away, the interface is brought back up.  This is
 * done so that accidental shutdowns of addresses aren't made permanent.  Your
 * server will recover from a failure.
 *
 * For DHCP and temporary addresses, recovery is not done in the kernel.
 * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
 *
 * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
 */
void
ipif_dup_recovery(void *arg)
{
	ipif_t *ipif = arg;

	/* Our timeout has fired; forget its ID */
	ipif->ipif_recovery_id = 0;
	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
		return;

	/*
	 * No lock, because this is just an optimization.
	 */
	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
		return;

	/* If the link is down, we'll retry this later */
	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
		return;

	ipif_do_recovery(ipif);
}

/*
 * Perform interface recovery by forcing the duplicate interfaces up and
 * allowing the system to determine which ones should stay up.
 *
 * Called both by recovery timer expiry and link-up notification.
 */
void
ipif_do_recovery(ipif_t *ipif)
{
	ill_t		*ill = ipif->ipif_ill;
	mblk_t		*mp;
	ip_stack_t	*ipst = ill->ill_ipst;
	size_t		mp_size;

	/* The message carries just the raw (v4 or v6) local address */
	if (ipif->ipif_isv6)
		mp_size = sizeof (ipif->ipif_v6lcl_addr);
	else
		mp_size = sizeof (ipif->ipif_lcl_addr);
	mp = allocb(mp_size, BPRI_MED);
	if (mp == NULL) {
		/* Allocation failed: rearm the timer and try again later */
		mutex_enter(&ill->ill_lock);
		if (ipst->ips_ip_dup_recovery > 0 &&
		    ipif->ipif_recovery_id == 0 &&
		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
		}
		mutex_exit(&ill->ill_lock);
	} else {
		/*
		 * A recovery timer may still be running if we got here from
		 * ill_restart_dad(); cancel that timer.
		 */
		if (ipif->ipif_recovery_id != 0)
			(void) untimeout(ipif->ipif_recovery_id);
		ipif->ipif_recovery_id = 0;

		if (ipif->ipif_isv6) {
			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
			    sizeof (ipif->ipif_v6lcl_addr));
		} else {
			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
			    sizeof (ipif->ipif_lcl_addr));
		}
		/* Hand off to ip_addr_recover as an exclusive (writer) op */
		ill_refhold(ill);
		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
		    B_FALSE);
	}
}

/*
 * Find the MAC and IP addresses in an NA/NS message.
 */
static void
ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
    in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
{
	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
	uchar_t *addr;
	int alen;

	/* icmp_inbound_v6 ensures this */
	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);

	/* The sender's link-layer address was recorded on receive */
	addr = ira->ira_l2src;
	alen = ill->ill_phys_addr_length;
	if (alen > 0) {
		*haddr = addr;
		*haddrlenp = alen;
	} else {
		*haddr = NULL;
		*haddrlenp = 0;
	}

	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
	*targp = ns->nd_ns_target;
}

/*
 * This is for exclusive changes due to NDP duplicate address detection
 * failure.
 */
/* ARGSUSED */
static void
ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
	ill_t	*ill = rq->q_ptr;
	ipif_t	*ipif;
	uchar_t	*haddr;
	uint_t	haddrlen;
	ip_stack_t *ipst = ill->ill_ipst;
	in6_addr_t targ;
	ip_recv_attr_t iras;
	mblk_t	*attrmp;

	/* ndp_failure queued attributes + packet as attrmp->b_cont */
	attrmp = mp;
	mp = mp->b_cont;
	attrmp->b_cont = NULL;
	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
		/* The ill or ip_stack_t disappeared on us */
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
		freemsg(mp);
		ira_cleanup(&iras, B_TRUE);
		return;
	}

	ASSERT(ill == iras.ira_rill);

	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
		/*
		 * Ignore conflicts generated by misbehaving switches that
		 * just reflect our own messages back to us.  For IPMP, we may
		 * see reflections across any ill in the illgrp.
		 *
		 * RFC2462 and revisions tried to detect both the case
		 * when a statically configured IPv6 address is a duplicate,
		 * and the case when the L2 address itself is a duplicate.  The
		 * latter is important because, with stateless address
		 * autoconf, if the L2 address is a duplicate, the resulting
		 * IPv6 address(es) would also be duplicates.  We rely on DAD
		 * of the IPv6 address itself to detect the latter case.
		 */
		/*
		 * For an under ill, ill_grp can change; hold ill_g_lock as
		 * reader across the illgrp check.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
		    IS_UNDER_IPMP(ill) &&
		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
		    haddrlen) != NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			goto ignore_conflict;
		}
		rw_exit(&ipst->ips_ill_g_lock);
	}

	/*
	 * Look up the appropriate ipif.
	 */
	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
	if (ipif == NULL)
		goto ignore_conflict;

	/* Reload the ill to match the ipif */
	ill = ipif->ipif_ill;

	/* If it's already duplicate or ineligible, then don't do anything. */
	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
		ipif_refrele(ipif);
		goto ignore_conflict;
	}

	/*
	 * If this is a failure during duplicate recovery, then don't
	 * complain.  It may take a long time to recover.
	 */
	if (!ipif->ipif_was_dup) {
		char ibuf[LIFNAMSIZ];
		char hbuf[MAC_STR_LEN];
		char sbuf[INET6_ADDRSTRLEN];

		ipif_get_name(ipif, ibuf, sizeof (ibuf));
		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
		    " disabled", ibuf,
		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
	}
	/* Mark the ipif as a duplicate and take it down */
	mutex_enter(&ill->ill_lock);
	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
	ipif->ipif_flags |= IPIF_DUPLICATE;
	ill->ill_ipif_dup_count++;
	mutex_exit(&ill->ill_lock);
	(void) ipif_down(ipif, NULL, NULL);
	(void) ipif_down_tail(ipif);
	mutex_enter(&ill->ill_lock);
	/*
	 * Arm the recovery timer, but only for addresses the kernel is
	 * responsible for (not DHCP/temporary ones), on resolver links,
	 * and only if recovery is enabled (ips_ip_dup_recovery > 0).
	 */
	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
	    ill->ill_net_type == IRE_IF_RESOLVER &&
	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
	    ipst->ips_ip_dup_recovery > 0) {
		ASSERT(ipif->ipif_recovery_id == 0);
		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
	}
	mutex_exit(&ill->ill_lock);
	ipif_refrele(ipif);

ignore_conflict:
	freemsg(mp);
	ira_cleanup(&iras, B_TRUE);
}

/*
 * Handle failure by tearing down the ipifs with the specified address.  Note
 * that tearing down the ipif also means deleting the ncec through ipif_down,
 * so it's not possible to do recovery by just restarting the ncec timer.
 * Instead, we start a timer on the ipif.
 * Caller has to free mp;
 */
static void
ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
{
	const uchar_t	*haddr;
	ill_t		*ill = ira->ira_rill;

	/*
	 * Ignore conflicts generated by misbehaving switches that just
	 * reflect our own messages back to us.
	 */

	/* icmp_inbound_v6 ensures this */
	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
	haddr = ira->ira_l2src;
	if (haddr != NULL &&
	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
		return;
	}

	/* Copy the packet (caller still owns mp) and queue the writer op */
	if ((mp = copymsg(mp)) != NULL) {
		mblk_t	*attrmp;

		attrmp = ip_recv_attr_to_mblk(ira);
		if (attrmp == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards", mp, ill);
			freemsg(mp);
		} else {
			/* Chain: attributes first, packet as b_cont */
			ASSERT(attrmp->b_cont == NULL);
			attrmp->b_cont = mp;
			mp = attrmp;
			ill_refhold(ill);
			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
			    B_FALSE);
		}
	}
}

/*
 * Handle a discovered conflict: some other system is advertising that it owns
 * one of our IP addresses.  We need to defend ourselves, or just shut down the
 * interface.
 *
 * Handles both IPv4 and IPv6
 */
boolean_t
ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
{
	ipif_t	*ipif;
	clock_t	now;
	uint_t	maxdefense;
	uint_t	defs;
	ill_t	*ill = ira->ira_ill;
	ip_stack_t *ipst = ill->ill_ipst;
	uint32_t elapsed;
	boolean_t isv6 = ill->ill_isv6;
	ipaddr_t ncec_addr;

	if (isv6) {
		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
		    ipst);
	} else {
		if (arp_no_defense) {
			/*
			 * Yes, there is a conflict, but no, we do not
			 * defend ourself.
			 */
			return (B_TRUE);
		}
		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
		    ipst);
	}
	/* Address not (or no longer) ours: nothing to defend */
	if (ipif == NULL)
		return (B_FALSE);

	/*
	 * First, figure out if this address is disposable.
	 */
	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
		maxdefense = ipst->ips_ip_max_temp_defend;
	else
		maxdefense = ipst->ips_ip_max_defend;

	/*
	 * Now figure out how many times we've defended ourselves.  Ignore
	 * defenses that happened long in the past.
	 */
	now = ddi_get_lbolt();
	/* Seconds since the last defense of this address */
	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
	mutex_enter(&ncec->ncec_lock);
	if ((defs = ncec->ncec_defense_count) > 0 &&
	    elapsed > ipst->ips_ip_defend_interval) {
		/*
		 * ip_defend_interval has elapsed.
		 * reset the defense count.
		 */
		ncec->ncec_defense_count = defs = 0;
	}
	ncec->ncec_defense_count++;
	ncec->ncec_last_time_defended = now;
	mutex_exit(&ncec->ncec_lock);
	ipif_refrele(ipif);

	/*
	 * If we've defended ourselves too many times already, then give up and
	 * tear down the interface(s) using this address.
	 * Otherwise, caller has to defend by sending out an announce.
	 */
	if (defs >= maxdefense) {
		if (isv6)
			ndp_failure(mp, ira);
		else
			arp_failure(mp, ira);
	} else {
		return (B_TRUE); /* caller must defend this address */
	}
	return (B_FALSE);
}

/*
 * Handle reception of Neighbor Solicitation messages.
1633 */ 1634 static void 1635 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) 1636 { 1637 ill_t *ill = ira->ira_ill, *under_ill; 1638 nd_neighbor_solicit_t *ns; 1639 uint32_t hlen = ill->ill_phys_addr_length; 1640 uchar_t *haddr = NULL; 1641 icmp6_t *icmp_nd; 1642 ip6_t *ip6h; 1643 ncec_t *our_ncec = NULL; 1644 in6_addr_t target; 1645 in6_addr_t src; 1646 int len; 1647 int flag = 0; 1648 nd_opt_hdr_t *opt = NULL; 1649 boolean_t bad_solicit = B_FALSE; 1650 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1651 boolean_t need_ill_refrele = B_FALSE; 1652 1653 ip6h = (ip6_t *)mp->b_rptr; 1654 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1655 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1656 src = ip6h->ip6_src; 1657 ns = (nd_neighbor_solicit_t *)icmp_nd; 1658 target = ns->nd_ns_target; 1659 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 1660 IN6_IS_ADDR_LOOPBACK(&target)) { 1661 if (ip_debug > 2) { 1662 /* ip1dbg */ 1663 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 1664 AF_INET6, &target); 1665 } 1666 bad_solicit = B_TRUE; 1667 goto done; 1668 } 1669 if (len > sizeof (nd_neighbor_solicit_t)) { 1670 /* Options present */ 1671 opt = (nd_opt_hdr_t *)&ns[1]; 1672 len -= sizeof (nd_neighbor_solicit_t); 1673 if (!ndp_verify_optlen(opt, len)) { 1674 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1675 bad_solicit = B_TRUE; 1676 goto done; 1677 } 1678 } 1679 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1680 /* Check to see if this is a valid DAD solicitation */ 1681 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1682 if (ip_debug > 2) { 1683 /* ip1dbg */ 1684 pr_addr_dbg("ndp_input_solicit: IPv6 " 1685 "Destination is not solicited node " 1686 "multicast %s\n", AF_INET6, 1687 &ip6h->ip6_dst); 1688 } 1689 bad_solicit = B_TRUE; 1690 goto done; 1691 } 1692 } 1693 1694 /* 1695 * NOTE: with IPMP, it's possible the nominated multicast ill (which 1696 * received this packet if it's multicast) is not the ill tied to 1697 * e.g. 
the IPMP ill's data link-local. So we match across the illgrp 1698 * to ensure we find the associated NCE. 1699 */ 1700 our_ncec = ncec_lookup_illgrp_v6(ill, &target); 1701 /* 1702 * If this is a valid Solicitation for an address we are publishing, 1703 * then a PUBLISH entry should exist in the cache 1704 */ 1705 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { 1706 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1707 "ifname=%s ", ill->ill_name)); 1708 if (ip_debug > 2) { 1709 /* ip1dbg */ 1710 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1711 } 1712 if (our_ncec == NULL) 1713 bad_solicit = B_TRUE; 1714 goto done; 1715 } 1716 1717 /* At this point we should have a verified NS per spec */ 1718 if (opt != NULL) { 1719 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1720 if (opt != NULL) { 1721 haddr = (uchar_t *)&opt[1]; 1722 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1723 hlen == 0) { 1724 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1725 bad_solicit = B_TRUE; 1726 goto done; 1727 } 1728 } 1729 } 1730 1731 /* If sending directly to peer, set the unicast flag */ 1732 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1733 flag |= NDP_UNICAST; 1734 1735 /* 1736 * Create/update the entry for the soliciting node on the ipmp_ill. 1737 * or respond to outstanding queries, don't if 1738 * the source is unspecified address. 1739 */ 1740 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1741 int err; 1742 nce_t *nnce; 1743 1744 ASSERT(ill->ill_isv6); 1745 /* 1746 * Regular solicitations *must* include the Source Link-Layer 1747 * Address option. Ignore messages that do not. 1748 */ 1749 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1750 ip1dbg(("ndp_input_solicit: source link-layer address " 1751 "option missing with a specified source.\n")); 1752 bad_solicit = B_TRUE; 1753 goto done; 1754 } 1755 1756 /* 1757 * This is a regular solicitation. 
If we're still in the 1758 * process of verifying the address, then don't respond at all 1759 * and don't keep track of the sender. 1760 */ 1761 if (our_ncec->ncec_state == ND_PROBE) 1762 goto done; 1763 1764 /* 1765 * If the solicitation doesn't have sender hardware address 1766 * (legal for unicast solicitation), then process without 1767 * installing the return NCE. Either we already know it, or 1768 * we'll be forced to look it up when (and if) we reply to the 1769 * packet. 1770 */ 1771 if (haddr == NULL) 1772 goto no_source; 1773 1774 under_ill = ill; 1775 if (IS_UNDER_IPMP(under_ill)) { 1776 ill = ipmp_ill_hold_ipmp_ill(under_ill); 1777 if (ill == NULL) 1778 ill = under_ill; 1779 else 1780 need_ill_refrele = B_TRUE; 1781 } 1782 err = nce_lookup_then_add_v6(ill, 1783 haddr, hlen, 1784 &src, /* Soliciting nodes address */ 1785 0, 1786 ND_STALE, 1787 &nnce); 1788 1789 if (need_ill_refrele) { 1790 ill_refrele(ill); 1791 ill = under_ill; 1792 need_ill_refrele = B_FALSE; 1793 } 1794 switch (err) { 1795 case 0: 1796 /* done with this entry */ 1797 nce_refrele(nnce); 1798 break; 1799 case EEXIST: 1800 /* 1801 * B_FALSE indicates this is not an an advertisement. 1802 */ 1803 nce_process(nnce->nce_common, haddr, 0, B_FALSE); 1804 nce_refrele(nnce); 1805 break; 1806 default: 1807 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 1808 err)); 1809 goto done; 1810 } 1811 no_source: 1812 flag |= NDP_SOLICITED; 1813 } else { 1814 /* 1815 * No source link layer address option should be present in a 1816 * valid DAD request. 1817 */ 1818 if (haddr != NULL) { 1819 ip1dbg(("ndp_input_solicit: source link-layer address " 1820 "option present with an unspecified source.\n")); 1821 bad_solicit = B_TRUE; 1822 goto done; 1823 } 1824 if (our_ncec->ncec_state == ND_PROBE) { 1825 /* 1826 * Internally looped-back probes will have 1827 * IRAF_L2SRC_LOOPBACK set so we can ignore our own 1828 * transmissions. 
1829 */ 1830 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { 1831 /* 1832 * If someone else is probing our address, then 1833 * we've crossed wires. Declare failure. 1834 */ 1835 ndp_failure(mp, ira); 1836 } 1837 goto done; 1838 } 1839 /* 1840 * This is a DAD probe. Multicast the advertisement to the 1841 * all-nodes address. 1842 */ 1843 src = ipv6_all_hosts_mcast; 1844 } 1845 flag |= nce_advert_flags(our_ncec); 1846 (void) ndp_xmit(ill, 1847 ND_NEIGHBOR_ADVERT, 1848 our_ncec->ncec_lladdr, 1849 our_ncec->ncec_lladdr_length, 1850 &target, /* Source and target of the advertisement pkt */ 1851 &src, /* IP Destination (source of original pkt) */ 1852 flag); 1853 done: 1854 if (bad_solicit) 1855 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); 1856 if (our_ncec != NULL) 1857 ncec_refrele(our_ncec); 1858 } 1859 1860 /* 1861 * Handle reception of Neighbor Solicitation messages 1862 */ 1863 void 1864 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) 1865 { 1866 ill_t *ill = ira->ira_ill; 1867 nd_neighbor_advert_t *na; 1868 uint32_t hlen = ill->ill_phys_addr_length; 1869 uchar_t *haddr = NULL; 1870 icmp6_t *icmp_nd; 1871 ip6_t *ip6h; 1872 ncec_t *dst_ncec = NULL; 1873 in6_addr_t target; 1874 nd_opt_hdr_t *opt = NULL; 1875 int len; 1876 ip_stack_t *ipst = ill->ill_ipst; 1877 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1878 1879 ip6h = (ip6_t *)mp->b_rptr; 1880 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1881 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1882 na = (nd_neighbor_advert_t *)icmp_nd; 1883 1884 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && 1885 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { 1886 ip1dbg(("ndp_input_advert: Target is multicast but the " 1887 "solicited flag is not zero\n")); 1888 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1889 return; 1890 } 1891 target = na->nd_na_target; 1892 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 1893 IN6_IS_ADDR_LOOPBACK(&target)) { 1894 if (ip_debug > 2) { 1895 /* 
ip1dbg */ 1896 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 1897 AF_INET6, &target); 1898 } 1899 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1900 return; 1901 } 1902 if (len > sizeof (nd_neighbor_advert_t)) { 1903 opt = (nd_opt_hdr_t *)&na[1]; 1904 if (!ndp_verify_optlen(opt, 1905 len - sizeof (nd_neighbor_advert_t))) { 1906 ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); 1907 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1908 return; 1909 } 1910 /* At this point we have a verified NA per spec */ 1911 len -= sizeof (nd_neighbor_advert_t); 1912 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); 1913 if (opt != NULL) { 1914 haddr = (uchar_t *)&opt[1]; 1915 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1916 hlen == 0) { 1917 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1918 BUMP_MIB(mib, 1919 ipv6IfIcmpInBadNeighborAdvertisements); 1920 return; 1921 } 1922 } 1923 } 1924 1925 /* 1926 * NOTE: we match across the illgrp since we need to do DAD for all of 1927 * our local addresses, and those are spread across all the active 1928 * ills in the group. 1929 */ 1930 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) 1931 return; 1932 1933 if (NCE_PUBLISH(dst_ncec)) { 1934 /* 1935 * Someone just advertised an addresses that we publish. First, 1936 * check it it was us -- if so, we can safely ignore it. 1937 * We don't get the haddr from the ira_l2src because, in the 1938 * case that the packet originated from us, on an IPMP group, 1939 * the ira_l2src may would be the link-layer address of the 1940 * cast_ill used to send the packet, which may not be the same 1941 * as the dst_ncec->ncec_lladdr of the address. 1942 */ 1943 if (haddr != NULL) { 1944 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) 1945 goto out; 1946 1947 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) 1948 goto out; /* from us -- no conflict */ 1949 1950 /* 1951 * If we're in an IPMP group, check if this is an echo 1952 * from another ill in the group. 
Use the double- 1953 * checked locking pattern to avoid grabbing 1954 * ill_g_lock in the non-IPMP case. 1955 */ 1956 if (IS_UNDER_IPMP(ill)) { 1957 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1958 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( 1959 ill->ill_grp, haddr, hlen) != NULL) { 1960 rw_exit(&ipst->ips_ill_g_lock); 1961 goto out; 1962 } 1963 rw_exit(&ipst->ips_ill_g_lock); 1964 } 1965 } 1966 1967 /* 1968 * This appears to be a real conflict. If we're trying to 1969 * configure this NCE (ND_PROBE), then shut it down. 1970 * Otherwise, handle the discovered conflict. 1971 */ 1972 if (dst_ncec->ncec_state == ND_PROBE) { 1973 ndp_failure(mp, ira); 1974 } else { 1975 if (ip_nce_conflict(mp, ira, dst_ncec)) { 1976 char hbuf[MAC_STR_LEN]; 1977 char sbuf[INET6_ADDRSTRLEN]; 1978 1979 cmn_err(CE_WARN, 1980 "node '%s' is using %s on %s", 1981 inet_ntop(AF_INET6, &target, sbuf, 1982 sizeof (sbuf)), 1983 haddr == NULL ? "<none>" : 1984 mac_colon_addr(haddr, hlen, hbuf, 1985 sizeof (hbuf)), ill->ill_name); 1986 /* 1987 * RFC 4862, Section 5.4.4 does not mandate 1988 * any specific behavior when an NA matches 1989 * a non-tentative address assigned to the 1990 * receiver. We make the choice of defending 1991 * our address, based on the assumption that 1992 * the sender has not detected the Duplicate. 1993 * 1994 * ncec_last_time_defended has been adjusted 1995 * in ip_nce_conflict() 1996 */ 1997 (void) ndp_announce(dst_ncec); 1998 } 1999 } 2000 } else { 2001 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) 2002 dst_ncec->ncec_flags |= NCE_F_ISROUTER; 2003 2004 /* B_TRUE indicates this an advertisement */ 2005 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); 2006 } 2007 out: 2008 ncec_refrele(dst_ncec); 2009 } 2010 2011 /* 2012 * Process NDP neighbor solicitation/advertisement messages. 2013 * The checksum has already checked o.k before reaching here. 
 * Information about the datalink header is contained in ira_l2src, but
 * that should be ignored for loopback packets.
 */
void
ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_rill;
	icmp6_t		*icmp_nd;
	ip6_t		*ip6h;
	int		len;
	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
	ill_t		*orig_ill = NULL;

	/*
	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
	 * and make it be the IPMP upper to avoid being confused by a packet
	 * addressed to a unicast address on a different ill.
	 */
	if (IS_UNDER_IPMP(ill)) {
		orig_ill = ill;
		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
		if (ill == NULL) {
			ill = orig_ill;
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
			    mp, ill);
			freemsg(mp);
			return;
		}
		ASSERT(ill != orig_ill);
		/*
		 * Temporarily point ira_ill at the IPMP ill; remember the
		 * original so it can be restored on the way out.
		 */
		orig_ill = ira->ira_ill;
		ira->ira_ill = ill;
		mib = ill->ill_icmp6_mib;
	}
	/* Linearize the message so header fields can be read directly */
	if (!pullupmsg(mp, -1)) {
		ip1dbg(("ndp_input: pullupmsg failed\n"));
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
		goto done;
	}
	ip6h = (ip6_t *)mp->b_rptr;
	/* Hop limit must be 255: proves the packet was not forwarded */
	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
		goto done;
	}
	/*
	 * NDP does not accept any extension headers between the
	 * IP header and the ICMP header since e.g. a routing
	 * header could be dangerous.
	 * This assumes that any AH or ESP headers are removed
	 * by ip prior to passing the packet to ndp_input.
	 */
	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
		    ip6h->ip6_nxt));
		ip_drop_input("Wrong next header", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
	if (icmp_nd->icmp6_code != 0) {
		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
		ip_drop_input("code non-zero", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
	/*
	 * Make sure packet length is large enough for either
	 * a NS or a NA icmp packet.
	 */
	if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
		ip1dbg(("ndp_input: packet too short\n"));
		ip_drop_input("packet too short", mp, ill);
		BUMP_MIB(mib, ipv6IfIcmpInErrors);
		goto done;
	}
	/* Dispatch to the NS or NA handler */
	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
		ndp_input_solicit(mp, ira);
	} else {
		ndp_input_advert(mp, ira);
	}
done:
	freemsg(mp);
	/* Drop the IPMP ill hold and restore the caller's ira_ill */
	if (orig_ill != NULL) {
		ill_refrele(ill);
		ira->ira_ill = orig_ill;
	}
}

/*
 * ndp_xmit is called to form and transmit a ND solicitation or
 * advertisement ICMP packet.
 *
 * If the source address is unspecified and this isn't a probe (used for
 * duplicate address detection), an appropriate source address and link layer
 * address will be chosen here.  The link layer address option is included if
 * the source is specified (i.e., all non-probe packets), and omitted (per the
 * specification) otherwise.
 *
 * It returns B_FALSE only if it does a successful put() to the
 * corresponding ill's ill_wq otherwise returns B_TRUE.
2120 */ 2121 static boolean_t 2122 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, 2123 const in6_addr_t *sender, const in6_addr_t *target, int flag) 2124 { 2125 uint32_t len; 2126 icmp6_t *icmp6; 2127 mblk_t *mp; 2128 ip6_t *ip6h; 2129 nd_opt_hdr_t *opt; 2130 uint_t plen; 2131 zoneid_t zoneid = GLOBAL_ZONEID; 2132 ill_t *hwaddr_ill = ill; 2133 ip_xmit_attr_t ixas; 2134 ip_stack_t *ipst = ill->ill_ipst; 2135 boolean_t need_refrele = B_FALSE; 2136 boolean_t probe = B_FALSE; 2137 2138 if (IS_UNDER_IPMP(ill)) { 2139 probe = ipif_lookup_testaddr_v6(ill, sender, NULL); 2140 /* 2141 * We send non-probe packets on the upper IPMP interface. 2142 * ip_output_simple() will use cast_ill for sending any 2143 * multicast packets. Note that we can't follow the same 2144 * logic for probe packets because all interfaces in the ipmp 2145 * group may have failed, so that we really want to only try 2146 * to send the ND packet on the ill corresponding to the src 2147 * address. 2148 */ 2149 if (!probe) { 2150 ill = ipmp_ill_hold_ipmp_ill(ill); 2151 if (ill != NULL) 2152 need_refrele = B_TRUE; 2153 else 2154 ill = hwaddr_ill; 2155 } 2156 } 2157 2158 /* 2159 * If we have a unspecified source(sender) address, select a 2160 * proper source address for the solicitation here itself so 2161 * that we can initialize the h/w address correctly. 2162 * 2163 * If the sender is specified then we use this address in order 2164 * to lookup the zoneid before calling ip_output_v6(). This is to 2165 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly 2166 * by IP (we cannot guarantee that the global zone has an interface 2167 * route to the destination). 2168 * 2169 * Note that the NA never comes here with the unspecified source 2170 * address. 2171 */ 2172 2173 /* 2174 * Probes will have unspec src at this point. 
2175 */ 2176 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { 2177 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); 2178 /* 2179 * It's possible for ipif_lookup_addr_zoneid_v6() to return 2180 * ALL_ZONES if it cannot find a matching ipif for the address 2181 * we are trying to use. In this case we err on the side of 2182 * trying to send the packet by defaulting to the GLOBAL_ZONEID. 2183 */ 2184 if (zoneid == ALL_ZONES) 2185 zoneid = GLOBAL_ZONEID; 2186 } 2187 2188 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; 2189 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; 2190 mp = allocb(len, BPRI_LO); 2191 if (mp == NULL) { 2192 if (need_refrele) 2193 ill_refrele(ill); 2194 return (B_TRUE); 2195 } 2196 2197 bzero((char *)mp->b_rptr, len); 2198 mp->b_wptr = mp->b_rptr + len; 2199 2200 bzero(&ixas, sizeof (ixas)); 2201 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM; 2202 2203 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 2204 ixas.ixa_ipst = ipst; 2205 ixas.ixa_cred = kcred; 2206 ixas.ixa_cpid = NOPID; 2207 ixas.ixa_tsl = NULL; 2208 ixas.ixa_zoneid = zoneid; 2209 2210 ip6h = (ip6_t *)mp->b_rptr; 2211 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2212 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2213 ip6h->ip6_nxt = IPPROTO_ICMPV6; 2214 ip6h->ip6_hops = IPV6_MAX_HOPS; 2215 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 2216 ip6h->ip6_dst = *target; 2217 icmp6 = (icmp6_t *)&ip6h[1]; 2218 2219 if (hw_addr_len != 0) { 2220 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + 2221 sizeof (nd_neighbor_advert_t)); 2222 } else { 2223 opt = NULL; 2224 } 2225 if (operation == ND_NEIGHBOR_SOLICIT) { 2226 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 2227 2228 if (opt != NULL && !(flag & NDP_PROBE)) { 2229 /* 2230 * Note that we don't send out SLLA for ND probes 2231 * per RFC 4862, even though we do send out the src 2232 * haddr for IPv4 DAD probes, even though both IPv4 2233 * and IPv6 go out with the unspecified/INADDR_ANY 2234 * src IP 
addr. 2235 */ 2236 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; 2237 } 2238 ip6h->ip6_src = *sender; 2239 ns->nd_ns_target = *target; 2240 if (!(flag & NDP_UNICAST)) { 2241 /* Form multicast address of the target */ 2242 ip6h->ip6_dst = ipv6_solicited_node_mcast; 2243 ip6h->ip6_dst.s6_addr32[3] |= 2244 ns->nd_ns_target.s6_addr32[3]; 2245 } 2246 } else { 2247 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; 2248 2249 ASSERT(!(flag & NDP_PROBE)); 2250 if (opt != NULL) 2251 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; 2252 ip6h->ip6_src = *sender; 2253 na->nd_na_target = *sender; 2254 if (flag & NDP_ISROUTER) 2255 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; 2256 if (flag & NDP_SOLICITED) 2257 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; 2258 if (flag & NDP_ORIDE) 2259 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; 2260 } 2261 2262 if (!(flag & NDP_PROBE)) { 2263 if (hw_addr != NULL && opt != NULL) { 2264 /* Fill in link layer address and option len */ 2265 opt->nd_opt_len = (uint8_t)plen; 2266 bcopy(hw_addr, &opt[1], hw_addr_len); 2267 } 2268 } 2269 if (opt != NULL && opt->nd_opt_type == 0) { 2270 /* If there's no link layer address option, then strip it. */ 2271 len -= plen * 8; 2272 mp->b_wptr = mp->b_rptr + len; 2273 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2274 } 2275 2276 icmp6->icmp6_type = (uint8_t)operation; 2277 icmp6->icmp6_code = 0; 2278 /* 2279 * Prepare for checksum by putting icmp length in the icmp 2280 * checksum field. The checksum is calculated in ip_output.c. 2281 */ 2282 icmp6->icmp6_cksum = ip6h->ip6_plen; 2283 2284 (void) ip_output_simple(mp, &ixas); 2285 ixa_cleanup(&ixas); 2286 if (need_refrele) 2287 ill_refrele(ill); 2288 return (B_FALSE); 2289 } 2290 2291 /* 2292 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 2293 * The datapath uses this as an indication that there 2294 * is a problem (as opposed to a NCE that was just 2295 * reclaimed due to lack of memory. 
2296 * Note that static ARP entries never become unreachable. 2297 */ 2298 void 2299 nce_make_unreachable(ncec_t *ncec) 2300 { 2301 mutex_enter(&ncec->ncec_lock); 2302 ncec->ncec_state = ND_UNREACHABLE; 2303 mutex_exit(&ncec->ncec_lock); 2304 } 2305 2306 /* 2307 * NCE retransmit timer. Common to IPv4 and IPv6. 2308 * This timer goes off when: 2309 * a. It is time to retransmit a resolution for resolver. 2310 * b. It is time to send reachability probes. 2311 */ 2312 void 2313 nce_timer(void *arg) 2314 { 2315 ncec_t *ncec = arg; 2316 ill_t *ill = ncec->ncec_ill, *src_ill; 2317 char addrbuf[INET6_ADDRSTRLEN]; 2318 boolean_t dropped = B_FALSE; 2319 ip_stack_t *ipst = ncec->ncec_ipst; 2320 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2321 in_addr_t sender4 = INADDR_ANY; 2322 in6_addr_t sender6 = ipv6_all_zeros; 2323 2324 /* 2325 * The timer has to be cancelled by ncec_delete before doing the final 2326 * refrele. So the NCE is guaranteed to exist when the timer runs 2327 * until it clears the timeout_id. Before clearing the timeout_id 2328 * bump up the refcnt so that we can continue to use the ncec 2329 */ 2330 ASSERT(ncec != NULL); 2331 mutex_enter(&ncec->ncec_lock); 2332 ncec_refhold_locked(ncec); 2333 ncec->ncec_timeout_id = 0; 2334 mutex_exit(&ncec->ncec_lock); 2335 2336 src_ill = nce_resolve_src(ncec, &sender6); 2337 /* if we could not find a sender address, return */ 2338 if (src_ill == NULL) { 2339 if (!isv6) { 2340 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); 2341 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, 2342 &sender4, addrbuf, sizeof (addrbuf)))); 2343 } else { 2344 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, 2345 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2346 } 2347 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2348 ncec_refrele(ncec); 2349 return; 2350 } 2351 if (!isv6) 2352 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 2353 2354 mutex_enter(&ncec->ncec_lock); 2355 /* 2356 * Check the reachability state. 
2357 */ 2358 switch (ncec->ncec_state) { 2359 case ND_DELAY: 2360 ASSERT(ncec->ncec_lladdr != NULL); 2361 ncec->ncec_state = ND_PROBE; 2362 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2363 if (isv6) { 2364 mutex_exit(&ncec->ncec_lock); 2365 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 2366 src_ill->ill_phys_addr, 2367 src_ill->ill_phys_addr_length, 2368 &sender6, &ncec->ncec_addr, 2369 NDP_UNICAST); 2370 } else { 2371 dropped = arp_request(ncec, sender4, src_ill); 2372 mutex_exit(&ncec->ncec_lock); 2373 } 2374 if (!dropped) { 2375 mutex_enter(&ncec->ncec_lock); 2376 ncec->ncec_pcnt--; 2377 mutex_exit(&ncec->ncec_lock); 2378 } 2379 if (ip_debug > 3) { 2380 /* ip2dbg */ 2381 pr_addr_dbg("nce_timer: state for %s changed " 2382 "to PROBE\n", AF_INET6, &ncec->ncec_addr); 2383 } 2384 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2385 break; 2386 case ND_PROBE: 2387 /* must be retransmit timer */ 2388 ASSERT(ncec->ncec_pcnt >= -1); 2389 if (ncec->ncec_pcnt > 0) { 2390 /* 2391 * As per RFC2461, the ncec gets deleted after 2392 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. 2393 * Note that the first unicast solicitation is sent 2394 * during the DELAY state. 2395 */ 2396 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2397 ncec->ncec_pcnt, 2398 inet_ntop((isv6? 
AF_INET6 : AF_INET), 2399 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2400 if (NCE_PUBLISH(ncec)) { 2401 mutex_exit(&ncec->ncec_lock); 2402 /* 2403 * send out a probe; note that src_ill 2404 * is ignored by nce_dad() for all 2405 * DAD message types other than IPv6 2406 * unicast probes 2407 */ 2408 nce_dad(ncec, src_ill, B_TRUE); 2409 } else { 2410 ASSERT(src_ill != NULL); 2411 if (isv6) { 2412 mutex_exit(&ncec->ncec_lock); 2413 dropped = ndp_xmit(src_ill, 2414 ND_NEIGHBOR_SOLICIT, 2415 src_ill->ill_phys_addr, 2416 src_ill->ill_phys_addr_length, 2417 &sender6, &ncec->ncec_addr, 2418 NDP_UNICAST); 2419 } else { 2420 /* 2421 * since the nce is REACHABLE, 2422 * the ARP request will be sent out 2423 * as a link-layer unicast. 2424 */ 2425 dropped = arp_request(ncec, sender4, 2426 src_ill); 2427 mutex_exit(&ncec->ncec_lock); 2428 } 2429 if (!dropped) { 2430 mutex_enter(&ncec->ncec_lock); 2431 ncec->ncec_pcnt--; 2432 mutex_exit(&ncec->ncec_lock); 2433 } 2434 nce_restart_timer(ncec, 2435 ill->ill_reachable_retrans_time); 2436 } 2437 } else if (ncec->ncec_pcnt < 0) { 2438 /* No hope, delete the ncec */ 2439 /* Tell datapath it went bad */ 2440 ncec->ncec_state = ND_UNREACHABLE; 2441 mutex_exit(&ncec->ncec_lock); 2442 if (ip_debug > 2) { 2443 /* ip1dbg */ 2444 pr_addr_dbg("nce_timer: Delete NCE for" 2445 " dst %s\n", (isv6? AF_INET6: AF_INET), 2446 &ncec->ncec_addr); 2447 } 2448 /* if static ARP can't delete. */ 2449 if ((ncec->ncec_flags & NCE_F_STATIC) == 0) 2450 ncec_delete(ncec); 2451 2452 } else if (!NCE_PUBLISH(ncec)) { 2453 /* 2454 * Probe count is 0 for a dynamic entry (one that we 2455 * ourselves are not publishing). We should never get 2456 * here if NONUD was requested, hence the ASSERT below. 
2457 */ 2458 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); 2459 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2460 ncec->ncec_pcnt, inet_ntop(AF_INET6, 2461 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2462 ncec->ncec_pcnt--; 2463 mutex_exit(&ncec->ncec_lock); 2464 /* Wait one interval before killing */ 2465 nce_restart_timer(ncec, 2466 ill->ill_reachable_retrans_time); 2467 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { 2468 ipif_t *ipif; 2469 ipaddr_t ncec_addr; 2470 2471 /* 2472 * We're done probing, and we can now declare this 2473 * address to be usable. Let IP know that it's ok to 2474 * use. 2475 */ 2476 ncec->ncec_state = ND_REACHABLE; 2477 ncec->ncec_flags &= ~NCE_F_UNVERIFIED; 2478 mutex_exit(&ncec->ncec_lock); 2479 if (isv6) { 2480 ipif = ipif_lookup_addr_exact_v6( 2481 &ncec->ncec_addr, ill, ipst); 2482 } else { 2483 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 2484 ncec_addr); 2485 ipif = ipif_lookup_addr_exact(ncec_addr, ill, 2486 ipst); 2487 } 2488 if (ipif != NULL) { 2489 if (ipif->ipif_was_dup) { 2490 char ibuf[LIFNAMSIZ]; 2491 char sbuf[INET6_ADDRSTRLEN]; 2492 2493 ipif->ipif_was_dup = B_FALSE; 2494 (void) inet_ntop(AF_INET6, 2495 &ipif->ipif_v6lcl_addr, 2496 sbuf, sizeof (sbuf)); 2497 ipif_get_name(ipif, ibuf, 2498 sizeof (ibuf)); 2499 cmn_err(CE_NOTE, "recovered address " 2500 "%s on %s", sbuf, ibuf); 2501 } 2502 if ((ipif->ipif_flags & IPIF_UP) && 2503 !ipif->ipif_addr_ready) 2504 ipif_up_notify(ipif); 2505 ipif->ipif_addr_ready = 1; 2506 ipif_refrele(ipif); 2507 } 2508 if (!isv6 && arp_no_defense) 2509 break; 2510 /* Begin defending our new address */ 2511 if (ncec->ncec_unsolicit_count > 0) { 2512 ncec->ncec_unsolicit_count--; 2513 if (isv6) { 2514 dropped = ndp_announce(ncec); 2515 } else { 2516 dropped = arp_announce(ncec); 2517 } 2518 2519 if (dropped) 2520 ncec->ncec_unsolicit_count++; 2521 else 2522 ncec->ncec_last_time_defended = 2523 ddi_get_lbolt(); 2524 } 2525 if (ncec->ncec_unsolicit_count > 0) { 2526 nce_restart_timer(ncec, 
2527 ANNOUNCE_INTERVAL(isv6)); 2528 } else if (DEFENSE_INTERVAL(isv6) != 0) { 2529 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2530 } 2531 } else { 2532 /* 2533 * This is an address we're probing to be our own, but 2534 * the ill is down. Wait until it comes back before 2535 * doing anything, but switch to reachable state so 2536 * that the restart will work. 2537 */ 2538 ncec->ncec_state = ND_REACHABLE; 2539 mutex_exit(&ncec->ncec_lock); 2540 } 2541 break; 2542 case ND_INCOMPLETE: { 2543 mblk_t *mp, *nextmp; 2544 mblk_t **prevmpp; 2545 2546 /* 2547 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp 2548 * for any IPMP probe packets, and toss them. IPMP probe 2549 * packets will always be at the head of ncec_qd_mp, so that 2550 * we can stop at the first queued ND packet that is 2551 * not a probe packet. 2552 */ 2553 prevmpp = &ncec->ncec_qd_mp; 2554 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { 2555 nextmp = mp->b_next; 2556 2557 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { 2558 inet_freemsg(mp); 2559 ncec->ncec_nprobes--; 2560 *prevmpp = nextmp; 2561 } else { 2562 prevmpp = &mp->b_next; 2563 } 2564 } 2565 2566 /* 2567 * Must be resolver's retransmit timer. 
2568 */ 2569 mutex_exit(&ncec->ncec_lock); 2570 ip_ndp_resolve(ncec); 2571 break; 2572 } 2573 case ND_REACHABLE: 2574 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && 2575 ncec->ncec_unsolicit_count != 0) || 2576 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { 2577 if (ncec->ncec_unsolicit_count > 0) { 2578 ncec->ncec_unsolicit_count--; 2579 mutex_exit(&ncec->ncec_lock); 2580 /* 2581 * When we get to zero announcements left, 2582 * switch to address defense 2583 */ 2584 } else { 2585 boolean_t rate_limit; 2586 2587 mutex_exit(&ncec->ncec_lock); 2588 rate_limit = ill_defend_rate_limit(ill, ncec); 2589 if (rate_limit) { 2590 nce_restart_timer(ncec, 2591 DEFENSE_INTERVAL(isv6)); 2592 break; 2593 } 2594 } 2595 if (isv6) { 2596 dropped = ndp_announce(ncec); 2597 } else { 2598 dropped = arp_announce(ncec); 2599 } 2600 mutex_enter(&ncec->ncec_lock); 2601 if (dropped) { 2602 ncec->ncec_unsolicit_count++; 2603 } else { 2604 ncec->ncec_last_time_defended = 2605 ddi_get_lbolt(); 2606 } 2607 mutex_exit(&ncec->ncec_lock); 2608 if (ncec->ncec_unsolicit_count != 0) { 2609 nce_restart_timer(ncec, 2610 ANNOUNCE_INTERVAL(isv6)); 2611 } else { 2612 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2613 } 2614 } else { 2615 mutex_exit(&ncec->ncec_lock); 2616 } 2617 break; 2618 default: 2619 mutex_exit(&ncec->ncec_lock); 2620 break; 2621 } 2622 done: 2623 ncec_refrele(ncec); 2624 ill_refrele(src_ill); 2625 } 2626 2627 /* 2628 * Set a link layer address from the ll_addr passed in. 2629 * Copy SAP from ill. 2630 */ 2631 static void 2632 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) 2633 { 2634 ill_t *ill = ncec->ncec_ill; 2635 2636 ASSERT(ll_addr != NULL); 2637 if (ill->ill_phys_addr_length > 0) { 2638 /* 2639 * The bcopy() below used to be called for the physical address 2640 * length rather than the link layer address length. For 2641 * ethernet and many other media, the phys_addr and lla are 2642 * identical. 
2643 * 2644 * The phys_addr and lla may not be the same for devices that 2645 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently 2646 * no known instances of these. 2647 * 2648 * For PPP or other interfaces with a zero length 2649 * physical address, don't do anything here. 2650 * The bcopy() with a zero phys_addr length was previously 2651 * a no-op for interfaces with a zero-length physical address. 2652 * Using the lla for them would change the way they operate. 2653 * Doing nothing in such cases preserves expected behavior. 2654 */ 2655 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); 2656 } 2657 } 2658 2659 boolean_t 2660 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, 2661 uint32_t ll_addr_len) 2662 { 2663 ASSERT(ncec->ncec_lladdr != NULL); 2664 if (ll_addr == NULL) 2665 return (B_FALSE); 2666 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) 2667 return (B_TRUE); 2668 return (B_FALSE); 2669 } 2670 2671 /* 2672 * Updates the link layer address or the reachability state of 2673 * a cache entry. Reset probe counter if needed. 2674 */ 2675 void 2676 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) 2677 { 2678 ill_t *ill = ncec->ncec_ill; 2679 boolean_t need_stop_timer = B_FALSE; 2680 boolean_t need_fastpath_update = B_FALSE; 2681 nce_t *nce = NULL; 2682 timeout_id_t tid; 2683 2684 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2685 /* 2686 * If this interface does not do NUD, there is no point 2687 * in allowing an update to the cache entry. Although 2688 * we will respond to NS. 2689 * The only time we accept an update for a resolver when 2690 * NUD is turned off is when it has just been created. 2691 * Non-Resolvers will always be created as REACHABLE. 
2692 */ 2693 if (new_state != ND_UNCHANGED) { 2694 if ((ncec->ncec_flags & NCE_F_NONUD) && 2695 (ncec->ncec_state != ND_INCOMPLETE)) 2696 return; 2697 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); 2698 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); 2699 need_stop_timer = B_TRUE; 2700 if (new_state == ND_REACHABLE) 2701 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64()); 2702 else { 2703 /* We force NUD in this case */ 2704 ncec->ncec_last = 0; 2705 } 2706 ncec->ncec_state = new_state; 2707 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2708 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || 2709 new_state == ND_INCOMPLETE); 2710 } 2711 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2712 tid = ncec->ncec_timeout_id; 2713 ncec->ncec_timeout_id = 0; 2714 } 2715 /* 2716 * Re-trigger fastpath probe and 2717 * overwrite the DL_UNITDATA_REQ data, noting we'll lose 2718 * whatever packets that happens to be transmitting at the time. 2719 */ 2720 if (new_ll_addr != NULL) { 2721 bcopy(new_ll_addr, ncec->ncec_lladdr, 2722 ill->ill_phys_addr_length); 2723 need_fastpath_update = B_TRUE; 2724 } 2725 mutex_exit(&ncec->ncec_lock); 2726 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2727 if (tid != 0) 2728 (void) untimeout(tid); 2729 } 2730 if (need_fastpath_update) { 2731 /* 2732 * Delete any existing existing dlur_mp and fp_mp information. 2733 * For IPMP interfaces, all underlying ill's must be checked 2734 * and purged. 
2735 */ 2736 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 2737 /* 2738 * add the new dlur_mp and fp_mp 2739 */ 2740 nce = nce_fastpath(ncec, B_TRUE, NULL); 2741 if (nce != NULL) 2742 nce_refrele(nce); 2743 } 2744 mutex_enter(&ncec->ncec_lock); 2745 } 2746 2747 static void 2748 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2749 { 2750 uint_t count = 0; 2751 mblk_t **mpp, *tmp; 2752 2753 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2754 2755 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { 2756 if (++count > ncec->ncec_ill->ill_max_buf) { 2757 tmp = ncec->ncec_qd_mp->b_next; 2758 ncec->ncec_qd_mp->b_next = NULL; 2759 /* 2760 * if we never create data addrs on the under_ill 2761 * does this matter? 2762 */ 2763 BUMP_MIB(ncec->ncec_ill->ill_ip_mib, 2764 ipIfStatsOutDiscards); 2765 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, 2766 ncec->ncec_ill); 2767 freemsg(ncec->ncec_qd_mp); 2768 ncec->ncec_qd_mp = tmp; 2769 } 2770 } 2771 2772 if (head_insert) { 2773 ncec->ncec_nprobes++; 2774 mp->b_next = ncec->ncec_qd_mp; 2775 ncec->ncec_qd_mp = mp; 2776 } else { 2777 *mpp = mp; 2778 } 2779 } 2780 2781 /* 2782 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be 2783 * queued at the head or tail of the queue based on the input argument 2784 * 'head_insert'. The caller should specify this argument as B_TRUE if this 2785 * packet is an IPMP probe packet, in which case the following happens: 2786 * 2787 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal 2788 * (non-ipmp_probe) load-speading case where the source address of the ND 2789 * packet is not tied to ncec_ill. If the ill bound to the source address 2790 * cannot receive, the response to the ND packet will not be received. 
2791 * However, if ND packets for ncec_ill's probes are queued behind that ND 2792 * packet, those probes will also fail to be sent, and thus in.mpathd will 2793 * erroneously conclude that ncec_ill has also failed. 2794 * 2795 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on 2796 * the first attempt. This ensures that ND problems do not manifest as 2797 * probe RTT spikes. 2798 * 2799 * We achieve this by inserting ipmp_probe() packets at the head of the 2800 * nce_queue. 2801 * 2802 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, 2803 * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 2804 */ 2805 void 2806 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2807 { 2808 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2809 nce_queue_mp_common(ncec, mp, head_insert); 2810 } 2811 2812 /* 2813 * Called when address resolution failed due to a timeout. 2814 * Send an ICMP unreachable in response to all queued packets. 2815 */ 2816 void 2817 ndp_resolv_failed(ncec_t *ncec) 2818 { 2819 mblk_t *mp, *nxt_mp; 2820 char buf[INET6_ADDRSTRLEN]; 2821 ill_t *ill = ncec->ncec_ill; 2822 ip_recv_attr_t iras; 2823 2824 bzero(&iras, sizeof (iras)); 2825 iras.ira_flags = 0; 2826 /* 2827 * we are setting the ira_rill to the ipmp_ill (instead of 2828 * the actual ill on which the packet was received), but this 2829 * is ok because we don't actually need the real ira_rill. 2830 * to send the icmp unreachable to the sender. 
2831 */ 2832 iras.ira_ill = iras.ira_rill = ill; 2833 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2834 iras.ira_rifindex = iras.ira_ruifindex; 2835 2836 ip1dbg(("ndp_resolv_failed: dst %s\n", 2837 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 2838 mutex_enter(&ncec->ncec_lock); 2839 mp = ncec->ncec_qd_mp; 2840 ncec->ncec_qd_mp = NULL; 2841 ncec->ncec_nprobes = 0; 2842 mutex_exit(&ncec->ncec_lock); 2843 while (mp != NULL) { 2844 nxt_mp = mp->b_next; 2845 mp->b_next = NULL; 2846 2847 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2848 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 2849 mp, ill); 2850 icmp_unreachable_v6(mp, 2851 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); 2852 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2853 mp = nxt_mp; 2854 } 2855 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 2856 } 2857 2858 /* 2859 * Handle the completion of NDP and ARP resolution. 2860 */ 2861 void 2862 nce_resolv_ok(ncec_t *ncec) 2863 { 2864 mblk_t *mp; 2865 uint_t pkt_len; 2866 iaflags_t ixaflags = IXAF_NO_TRACE; 2867 nce_t *nce; 2868 ill_t *ill = ncec->ncec_ill; 2869 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2870 ip_stack_t *ipst = ill->ill_ipst; 2871 2872 if (IS_IPMP(ncec->ncec_ill)) { 2873 nce_resolv_ipmp_ok(ncec); 2874 return; 2875 } 2876 /* non IPMP case */ 2877 2878 mutex_enter(&ncec->ncec_lock); 2879 ASSERT(ncec->ncec_nprobes == 0); 2880 mp = ncec->ncec_qd_mp; 2881 ncec->ncec_qd_mp = NULL; 2882 mutex_exit(&ncec->ncec_lock); 2883 2884 while (mp != NULL) { 2885 mblk_t *nxt_mp; 2886 2887 if (ill->ill_isv6) { 2888 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2889 2890 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 2891 } else { 2892 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2893 2894 ixaflags |= IXAF_IS_IPV4; 2895 pkt_len = ntohs(ipha->ipha_length); 2896 } 2897 nxt_mp = mp->b_next; 2898 mp->b_next = NULL; 2899 /* 2900 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no 2901 * longer available, but it's ok to 
drop this flag because TCP 2902 * has its own flow-control in effect, so TCP packets 2903 * are not likely to get here when flow-control is in effect. 2904 */ 2905 mutex_enter(&ill->ill_lock); 2906 nce = nce_lookup(ill, &ncec->ncec_addr); 2907 mutex_exit(&ill->ill_lock); 2908 2909 if (nce == NULL) { 2910 if (isv6) { 2911 BUMP_MIB(&ipst->ips_ip6_mib, 2912 ipIfStatsOutDiscards); 2913 } else { 2914 BUMP_MIB(&ipst->ips_ip_mib, 2915 ipIfStatsOutDiscards); 2916 } 2917 ip_drop_output("ipIfStatsOutDiscards - no nce", 2918 mp, NULL); 2919 freemsg(mp); 2920 } else { 2921 /* 2922 * We don't know the zoneid, but 2923 * ip_xmit does not care since IXAF_NO_TRACE 2924 * is set. (We traced the packet the first 2925 * time through ip_xmit.) 2926 */ 2927 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, 2928 ALL_ZONES, 0, NULL); 2929 nce_refrele(nce); 2930 } 2931 mp = nxt_mp; 2932 } 2933 2934 ncec_cb_dispatch(ncec); /* complete callbacks */ 2935 } 2936 2937 /* 2938 * Called by SIOCSNDP* ioctl to add/change an ncec entry 2939 * and the corresponding attributes. 2940 * Disallow states other than ND_REACHABLE or ND_STALE. 
2941 */ 2942 int 2943 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) 2944 { 2945 sin6_t *sin6; 2946 in6_addr_t *addr; 2947 ncec_t *ncec; 2948 nce_t *nce; 2949 int err = 0; 2950 uint16_t new_flags = 0; 2951 uint16_t old_flags = 0; 2952 int inflags = lnr->lnr_flags; 2953 ip_stack_t *ipst = ill->ill_ipst; 2954 boolean_t do_postprocess = B_FALSE; 2955 2956 ASSERT(ill->ill_isv6); 2957 if ((lnr->lnr_state_create != ND_REACHABLE) && 2958 (lnr->lnr_state_create != ND_STALE)) 2959 return (EINVAL); 2960 2961 sin6 = (sin6_t *)&lnr->lnr_addr; 2962 addr = &sin6->sin6_addr; 2963 2964 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 2965 ASSERT(!IS_UNDER_IPMP(ill)); 2966 nce = nce_lookup_addr(ill, addr); 2967 if (nce != NULL) 2968 new_flags = nce->nce_common->ncec_flags; 2969 2970 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { 2971 case NDF_ISROUTER_ON: 2972 new_flags |= NCE_F_ISROUTER; 2973 break; 2974 case NDF_ISROUTER_OFF: 2975 new_flags &= ~NCE_F_ISROUTER; 2976 break; 2977 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): 2978 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2979 if (nce != NULL) 2980 nce_refrele(nce); 2981 return (EINVAL); 2982 } 2983 if (inflags & NDF_STATIC) 2984 new_flags |= NCE_F_STATIC; 2985 2986 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { 2987 case NDF_ANYCAST_ON: 2988 new_flags |= NCE_F_ANYCAST; 2989 break; 2990 case NDF_ANYCAST_OFF: 2991 new_flags &= ~NCE_F_ANYCAST; 2992 break; 2993 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): 2994 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2995 if (nce != NULL) 2996 nce_refrele(nce); 2997 return (EINVAL); 2998 } 2999 3000 if (nce == NULL) { 3001 err = nce_add_v6(ill, 3002 (uchar_t *)lnr->lnr_hdw_addr, 3003 ill->ill_phys_addr_length, 3004 addr, 3005 new_flags, 3006 lnr->lnr_state_create, 3007 &nce); 3008 if (err != 0) { 3009 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3010 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); 3011 return (err); 3012 } else { 3013 do_postprocess = B_TRUE; 3014 } 3015 } 3016 ncec = nce->nce_common; 
3017 old_flags = ncec->ncec_flags; 3018 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { 3019 ncec_router_to_host(ncec); 3020 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3021 if (do_postprocess) 3022 err = nce_add_v6_postprocess(nce); 3023 nce_refrele(nce); 3024 return (0); 3025 } 3026 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3027 3028 if (do_postprocess) 3029 err = nce_add_v6_postprocess(nce); 3030 /* 3031 * err cannot be anything other than 0 because we don't support 3032 * proxy arp of static addresses. 3033 */ 3034 ASSERT(err == 0); 3035 3036 mutex_enter(&ncec->ncec_lock); 3037 ncec->ncec_flags = new_flags; 3038 mutex_exit(&ncec->ncec_lock); 3039 /* 3040 * Note that we ignore the state at this point, which 3041 * should be either STALE or REACHABLE. Instead we let 3042 * the link layer address passed in to determine the state 3043 * much like incoming packets. 3044 */ 3045 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); 3046 nce_refrele(nce); 3047 return (0); 3048 } 3049 3050 /* 3051 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up 3052 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must 3053 * be held to ensure that they are in the same group. 3054 */ 3055 static nce_t * 3056 nce_fastpath_create(ill_t *ill, ncec_t *ncec) 3057 { 3058 3059 nce_t *nce; 3060 3061 nce = nce_ill_lookup_then_add(ill, ncec); 3062 3063 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3064 return (nce); 3065 3066 /* 3067 * hold the ncec_lock to synchronize with nce_update() so that, 3068 * at the end of this function, the contents of nce_dlur_mp are 3069 * consistent with ncec->ncec_lladdr, even though some intermediate 3070 * packet may have been sent out with a mangled address, which would 3071 * only be a transient condition. 
 */
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_lladdr != NULL) {
		/* Resolved: copy the known lladdr into the dlur template. */
		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
	} else {
		/* Not yet resolved: generate a dlur with no lladdr. */
		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
		    ill->ill_sap_length);
	}
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}

/*
 * we make nce_fp_mp to have an M_DATA prepend.
 * The caller ensures there is hold on ncec for this function.
 * Note that since ill_fastpath_probe() copies the mblk there is
 * no need to hold the nce or ncec beyond this function.
 *
 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
 * and will be returned back by this function, so that no extra nce_refrele
 * is required for the caller. The calls from nce_add_common() use this
 * method. All other callers (that pass in NULL ncec_nce) will have to do a
 * nce_refrele of the returned nce (when it is non-null).
 */
nce_t *
nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
	nce_t *nce;
	ill_t *ill = ncec->ncec_ill;

	ASSERT(ill != NULL);

	if (IS_IPMP(ill) && trigger_fp_req) {
		/*
		 * IPMP: hand the fastpath trigger off to
		 * ipmp_ncec_fastpath() and do not trigger it on the
		 * ipmp_ill itself below.
		 */
		trigger_fp_req = B_FALSE;
		ipmp_ncec_fastpath(ncec, ill);

	}
	/*
	 * If the caller already has the nce corresponding to the ill, use
	 * that one. Otherwise we have to lookup/add the nce. Calls from
	 * nce_add_common() fall in the former category, and have just done
	 * the nce lookup/add that can be reused.
	 */
	if (ncec_nce == NULL)
		nce = nce_fastpath_create(ill, ncec);
	else
		nce = ncec_nce;

	/* Loopback and VNI interfaces never use dlpi fastpath. */
	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
		return (nce);

	if (trigger_fp_req)
		nce_fastpath_trigger(nce);
	return (nce);
}

/*
 * Trigger fastpath on nce. No locks may be held.
 */
static void
nce_fastpath_trigger(nce_t *nce)
{
	int res;
	ill_t *ill = nce->nce_ill;
	ncec_t *ncec = nce->nce_common;

	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
	/*
	 * EAGAIN is an indication of a transient error
	 * i.e. allocation failure etc. leave the ncec in the list it
	 * will be updated when another probe happens for another ire
	 * if not it will be taken out of the list when the ire is
	 * deleted.
	 */
	if (res != 0 && res != EAGAIN && res != ENOTSUP)
		nce_fastpath_list_delete(ill, ncec, NULL);
}

/*
 * Lookup (and, if absent, add) the nce for ncec on ill's ill_nce list.
 * Caller must hold ill_lock.  Returns NULL if the ill is condemned, the
 * ncec is condemned, or nce_add() fails.
 */
static nce_t *
nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
{
	nce_t *nce = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	/*
	 * Atomically ensure that the ill is not CONDEMNED and is not going
	 * down, before adding the NCE.
	 */
	if (ill->ill_state_flags & ILL_CONDEMNED)
		return (NULL);
	mutex_enter(&ncec->ncec_lock);
	/*
	 * if ncec has not been deleted and
	 * is not already in the list, add it.
	 */
	if (!NCE_ISCONDEMNED(ncec)) {
		nce = nce_lookup(ill, &ncec->ncec_addr);
		if (nce != NULL)
			goto done;
		nce = nce_add(ill, ncec);
	}
done:
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}

/* Lookup/add as above, acquiring and dropping ill_lock for the caller. */
nce_t *
nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
{
	nce_t *nce;

	mutex_enter(&ill->ill_lock);
	nce = nce_ill_lookup_then_add_locked(ill, ncec);
	mutex_exit(&ill->ill_lock);
	return (nce);
}


/*
 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
 * nce is added to the 'dead' list, and the caller must nce_refrele() the
 * entry after all locks have been dropped.
 */
void
nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
{
	nce_t *nce;

	ASSERT(ill != NULL);

	/* first clean out any nce pointers in the under_ills */
	if (IS_IPMP(ill))
		ipmp_ncec_flush_nce(ncec);

	/* now the ill itself */
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (nce->nce_common == ncec) {
			nce_refhold(nce);
			nce_delete(nce);
			break;
		}
	}
	mutex_exit(&ill->ill_lock);
	if (nce != NULL) {
		if (dead == NULL)
			nce_refrele(nce);
		else
			list_insert_tail(dead, nce);
	}
}

/*
 * when the fastpath response does not fit in the datab
 * associated with the existing nce_fp_mp, we delete and
 * add the nce to retrigger fastpath based on the information
 * in the ncec_t.
3235 */ 3236 static nce_t * 3237 nce_delete_then_add(nce_t *nce) 3238 { 3239 ill_t *ill = nce->nce_ill; 3240 nce_t *newnce = NULL; 3241 3242 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3243 (void *)nce, ill->ill_name)); 3244 mutex_enter(&ill->ill_lock); 3245 mutex_enter(&nce->nce_common->ncec_lock); 3246 nce_delete(nce); 3247 /* 3248 * Make sure that ncec is not condemned before adding. We hold the 3249 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3250 * ipmp_ncec_flush_nce() 3251 */ 3252 if (!NCE_ISCONDEMNED(nce->nce_common)) 3253 newnce = nce_add(ill, nce->nce_common); 3254 mutex_exit(&nce->nce_common->ncec_lock); 3255 mutex_exit(&ill->ill_lock); 3256 nce_refrele(nce); 3257 return (newnce); /* could be null if nomem */ 3258 } 3259 3260 typedef struct nce_fp_match_s { 3261 nce_t *nce_fp_match_res; 3262 mblk_t *nce_fp_match_ack_mp; 3263 } nce_fp_match_t; 3264 3265 /* ARGSUSED */ 3266 static int 3267 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3268 { 3269 nce_fp_match_t *nce_fp_marg = arg; 3270 ncec_t *ncec = nce->nce_common; 3271 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3272 uchar_t *mp_rptr, *ud_mp_rptr; 3273 mblk_t *ud_mp = nce->nce_dlur_mp; 3274 ptrdiff_t cmplen; 3275 3276 /* 3277 * mp is the mp associated with the fastpath ack. 3278 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3279 * under consideration. If the contents match, then the 3280 * fastpath ack is used to update the nce. 3281 */ 3282 if (ud_mp == NULL) 3283 return (0); 3284 mp_rptr = mp->b_rptr; 3285 cmplen = mp->b_wptr - mp_rptr; 3286 ASSERT(cmplen >= 0); 3287 3288 ud_mp_rptr = ud_mp->b_rptr; 3289 /* 3290 * The ncec is locked here to prevent any other threads from accessing 3291 * and changing nce_dlur_mp when the address becomes resolved to an 3292 * lla while we're in the middle of looking at and comparing the 3293 * hardware address (lla). 
It is also locked to prevent multiple 3294 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3295 * time. 3296 */ 3297 mutex_enter(&ncec->ncec_lock); 3298 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3299 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3300 nce_fp_marg->nce_fp_match_res = nce; 3301 mutex_exit(&ncec->ncec_lock); 3302 nce_refhold(nce); 3303 return (1); 3304 } 3305 mutex_exit(&ncec->ncec_lock); 3306 return (0); 3307 } 3308 3309 /* 3310 * Update all NCE's that are not in fastpath mode and 3311 * have an nce_fp_mp that matches mp. mp->b_cont contains 3312 * the fastpath header. 3313 * 3314 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3315 */ 3316 void 3317 nce_fastpath_update(ill_t *ill, mblk_t *mp) 3318 { 3319 nce_fp_match_t nce_fp_marg; 3320 nce_t *nce; 3321 mblk_t *nce_fp_mp, *fp_mp; 3322 3323 nce_fp_marg.nce_fp_match_res = NULL; 3324 nce_fp_marg.nce_fp_match_ack_mp = mp; 3325 3326 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); 3327 3328 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) 3329 return; 3330 3331 mutex_enter(&nce->nce_lock); 3332 nce_fp_mp = nce->nce_fp_mp; 3333 3334 if (nce_fp_mp != NULL) { 3335 fp_mp = mp->b_cont; 3336 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > 3337 nce_fp_mp->b_datap->db_lim) { 3338 mutex_exit(&nce->nce_lock); 3339 nce = nce_delete_then_add(nce); 3340 if (nce == NULL) { 3341 return; 3342 } 3343 mutex_enter(&nce->nce_lock); 3344 nce_fp_mp = nce->nce_fp_mp; 3345 } 3346 } 3347 3348 /* Matched - install mp as the fastpath mp */ 3349 if (nce_fp_mp == NULL) { 3350 fp_mp = dupb(mp->b_cont); 3351 nce->nce_fp_mp = fp_mp; 3352 } else { 3353 fp_mp = mp->b_cont; 3354 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); 3355 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr 3356 + MBLKL(fp_mp); 3357 } 3358 mutex_exit(&nce->nce_lock); 3359 nce_refrele(nce); 3360 } 3361 3362 /* 3363 * Return a pointer to a given option in the packet. 
3364 * Assumes that option part of the packet have already been validated. 3365 */ 3366 nd_opt_hdr_t * 3367 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) 3368 { 3369 while (optlen > 0) { 3370 if (opt->nd_opt_type == opt_type) 3371 return (opt); 3372 optlen -= 8 * opt->nd_opt_len; 3373 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3374 } 3375 return (NULL); 3376 } 3377 3378 /* 3379 * Verify all option lengths present are > 0, also check to see 3380 * if the option lengths and packet length are consistent. 3381 */ 3382 boolean_t 3383 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) 3384 { 3385 ASSERT(opt != NULL); 3386 while (optlen > 0) { 3387 if (opt->nd_opt_len == 0) 3388 return (B_FALSE); 3389 optlen -= 8 * opt->nd_opt_len; 3390 if (optlen < 0) 3391 return (B_FALSE); 3392 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3393 } 3394 return (B_TRUE); 3395 } 3396 3397 /* 3398 * ncec_walk function. 3399 * Free a fraction of the NCE cache entries. 3400 * 3401 * A possible optimization here would be to use ncec_last where possible, and 3402 * delete the least-frequently used entry, which would require more complex 3403 * computation as we walk through the ncec's (e.g., track ncec entries by 3404 * order of ncec_last and/or maintain state) 3405 */ 3406 static void 3407 ncec_cache_reclaim(ncec_t *ncec, char *arg) 3408 { 3409 ip_stack_t *ipst = ncec->ncec_ipst; 3410 uint_t fraction = *(uint_t *)arg; 3411 uint_t rand; 3412 3413 if ((ncec->ncec_flags & 3414 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { 3415 return; 3416 } 3417 3418 rand = (uint_t)ddi_get_lbolt() + 3419 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); 3420 if ((rand/fraction)*fraction == rand) { 3421 IP_STAT(ipst, ip_nce_reclaim_deleted); 3422 ncec_delete(ncec); 3423 } 3424 } 3425 3426 /* 3427 * kmem_cache callback to free up memory. 3428 * 3429 * For now we just delete a fixed fraction. 
 */
static void
ip_nce_reclaim_stack(ip_stack_t *ipst)
{
	uint_t fraction = ipst->ips_ip_nce_reclaim_fraction;

	IP_STAT(ipst, ip_nce_reclaim_calls);

	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);

	/*
	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_nce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;
	ip_stack_t *ipst;

	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		/*
		 * netstack_next() can return a netstack_t with a NULL
		 * netstack_ip at boot time.
		 */
		if ((ipst = ns->netstack_ip) == NULL) {
			netstack_rele(ns);
			continue;
		}
		ip_nce_reclaim_stack(ipst);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

#ifdef DEBUG
/* Record a refhold in the ncec's trace log (DEBUG kernels only). */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;

	if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
		/* Tracing could not be set up: disable it for this ncec. */
		ncec->ncec_trace_disable = B_TRUE;
		ncec_trace_cleanup(ncec);
	}
}

/* Record a refrele in the ncec's trace log (DEBUG kernels only). */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (!ncec->ncec_trace_disable)
		th_trace_unref(ncec);
}

/* Discard the accumulated trace records for ncec. */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif

/*
 * Called when address resolution fails due to a timeout.
 * Send an ICMP unreachable in response to all queued packets.
 */
void
arp_resolv_failed(ncec_t *ncec)
{
	mblk_t *mp, *nxt_mp;
	char buf[INET6_ADDRSTRLEN];
	struct in_addr ipv4addr;
	ill_t *ill = ncec->ncec_ill;
	ip_stack_t *ipst = ncec->ncec_ipst;
	ip_recv_attr_t iras;

	bzero(&iras, sizeof (iras));
	iras.ira_flags = IRAF_IS_IPV4;
	/*
	 * we are setting the ira_rill to the ipmp_ill (instead of
	 * the actual ill on which the packet was received), but this
	 * is ok because we don't actually need the real ira_rill
	 * to send the icmp unreachable to the sender.
	 */
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;

	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
	ip3dbg(("arp_resolv_failed: dst %s\n",
	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
	/* Detach the queued packets so the lock can be dropped for sending. */
	mutex_enter(&ncec->ncec_lock);
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);
	while (mp != NULL) {
		nxt_mp = mp->b_next;
		mp->b_next = NULL;

		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
		    mp, ill);
		if (ipst->ips_ip_arp_icmp_error) {
			ip3dbg(("arp_resolv_failed: "
			    "Calling icmp_unreachable\n"));
			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
		} else {
			freemsg(mp);
		}
		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec);	/* finish off waiting callbacks */
}

/*
 * if ill is an under_ill, translate it to the ipmp_ill and add the
 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
 * one on the underlying in_ill) will be created for the
 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
 */
int
nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int err;
	in6_addr_t addr6;
	ip_stack_t *ipst = ill->ill_ipst;
	nce_t *nce, *upper_nce = NULL;
	ill_t *in_ill = ill, *under = NULL;
	boolean_t need_ill_refrele = B_FALSE;

	if (flags & NCE_F_MCAST) {
		/*
		 * hw_addr will be figured out in nce_set_multicast_v4;
		 * caller needs to pass in the cast_ill for ipmp
		 */
		ASSERT(hw_addr == NULL);
		ASSERT(!IS_IPMP(ill));
		err = nce_set_multicast_v4(ill, addr, flags, newnce);
		return (err);
	}

	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
		/* Non-local address on an under-ill: move to the ipmp_ill. */
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill == NULL)
			return (ENXIO);
		need_ill_refrele = B_TRUE;
	}
	if ((flags & NCE_F_BCAST) != 0) {
		/*
		 * IPv4 broadcast ncec: compute the hwaddr.
		 */
		if (IS_IPMP(ill)) {
			under = ipmp_ill_get_xmit_ill(ill, B_FALSE);
			if (under == NULL) {
				if (need_ill_refrele)
					ill_refrele(ill);
				return (ENETDOWN);
			}
			hw_addr = under->ill_bcast_mp->b_rptr +
			    NCE_LL_ADDR_OFFSET(under);
			hw_addr_len = under->ill_phys_addr_length;
		} else {
			/* NOTE: comma operator below; acts as two stmts. */
			hw_addr = ill->ill_bcast_mp->b_rptr +
			    NCE_LL_ADDR_OFFSET(ill),
			    hw_addr_len = ill->ill_phys_addr_length;
		}
	}

	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	nce = nce_lookup_addr(ill, &addr6);
	if (nce == NULL) {
		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
		    state, &nce);
	} else {
		err = EEXIST;
	}
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	if (err == 0)
		err = nce_add_v4_postprocess(nce);

	if (in_ill != ill && nce != NULL) {
		nce_t *under_nce = NULL;

		/*
		 * in_ill was the under_ill. Try to create the under_nce.
		 * Hold the ill_g_lock to prevent changes to group membership
		 * until we are done.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
			    ill_t *, ill);
			rw_exit(&ipst->ips_ill_g_lock);
			err = ENXIO;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
		if (under_nce == NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			err = EINVAL;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		upper_nce = nce;
		nce = under_nce;	/* will be returned to caller */
		if (NCE_ISREACHABLE(nce->nce_common))
			nce_fastpath_trigger(under_nce);
	}
	if (nce != NULL) {
		if (newnce != NULL)
			*newnce = nce;
		else
			nce_refrele(nce);
	}
bail:
	if (under != NULL)
		ill_refrele(under);
	if (upper_nce != NULL)
		nce_refrele(upper_nce);
	if (need_ill_refrele)
		ill_refrele(ill);

	return (err);
}

/*
 * NDP Cache Entry creation routine for IPv4.
 * This routine must always be called with ndp4->ndp_g_lock held.
 * Prior to return, ncec_refcnt is incremented.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t. The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * Local addresses are always created on the ill passed to nce_add_v4.
3690 */ 3691 int 3692 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3693 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3694 { 3695 int err; 3696 boolean_t is_multicast = (flags & NCE_F_MCAST); 3697 struct in6_addr addr6; 3698 nce_t *nce; 3699 3700 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 3701 ASSERT(!ill->ill_isv6); 3702 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); 3703 3704 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3705 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, 3706 &nce); 3707 ASSERT(newnce != NULL); 3708 *newnce = nce; 3709 return (err); 3710 } 3711 3712 /* 3713 * Post-processing routine to be executed after nce_add_v4(). This function 3714 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 3715 * and must be called without any locks held. 3716 * 3717 * Always returns 0, but we return an int to keep this symmetric with the 3718 * IPv6 counter-part. 3719 */ 3720 int 3721 nce_add_v4_postprocess(nce_t *nce) 3722 { 3723 ncec_t *ncec = nce->nce_common; 3724 uint16_t flags = ncec->ncec_flags; 3725 boolean_t ndp_need_dad = B_FALSE; 3726 boolean_t dropped; 3727 clock_t delay; 3728 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; 3729 uchar_t *hw_addr = ncec->ncec_lladdr; 3730 boolean_t trigger_fastpath = B_TRUE; 3731 3732 /* 3733 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 3734 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
3735 * We call nce_fastpath from nce_update if the link layer address of 3736 * the peer changes from nce_update 3737 */ 3738 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && 3739 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) 3740 trigger_fastpath = B_FALSE; 3741 3742 if (trigger_fastpath) 3743 nce_fastpath_trigger(nce); 3744 3745 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 3746 /* 3747 * Either the caller (by passing in ND_PROBE) 3748 * or nce_add_common() (by the internally computed state 3749 * based on ncec_addr and ill_net_type) has determined 3750 * that this unicast entry needs DAD. Trigger DAD. 3751 */ 3752 ndp_need_dad = B_TRUE; 3753 } else if (flags & NCE_F_UNSOL_ADV) { 3754 /* 3755 * We account for the transmit below by assigning one 3756 * less than the ndd variable. Subsequent decrements 3757 * are done in nce_timer. 3758 */ 3759 mutex_enter(&ncec->ncec_lock); 3760 ncec->ncec_unsolicit_count = 3761 ipst->ips_ip_arp_publish_count - 1; 3762 mutex_exit(&ncec->ncec_lock); 3763 dropped = arp_announce(ncec); 3764 mutex_enter(&ncec->ncec_lock); 3765 if (dropped) 3766 ncec->ncec_unsolicit_count++; 3767 else 3768 ncec->ncec_last_time_defended = ddi_get_lbolt(); 3769 if (ncec->ncec_unsolicit_count != 0) { 3770 nce_start_timer(ncec, 3771 ipst->ips_ip_arp_publish_interval); 3772 } 3773 mutex_exit(&ncec->ncec_lock); 3774 } 3775 3776 /* 3777 * If ncec_xmit_interval is 0, user has configured us to send the first 3778 * probe right away. Do so, and set up for the subsequent probes. 3779 */ 3780 if (ndp_need_dad) { 3781 mutex_enter(&ncec->ncec_lock); 3782 if (ncec->ncec_pcnt == 0) { 3783 /* 3784 * DAD probes and announce can be 3785 * administratively disabled by setting the 3786 * probe_count to zero. Restart the timer in 3787 * this case to mark the ipif as ready. 
3788 */ 3789 ncec->ncec_unsolicit_count = 0; 3790 mutex_exit(&ncec->ncec_lock); 3791 nce_restart_timer(ncec, 0); 3792 } else { 3793 mutex_exit(&ncec->ncec_lock); 3794 delay = ((ncec->ncec_flags & NCE_F_FAST) ? 3795 ipst->ips_arp_probe_delay : 3796 ipst->ips_arp_fastprobe_delay); 3797 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); 3798 } 3799 } 3800 return (0); 3801 } 3802 3803 /* 3804 * ncec_walk routine to update all entries that have a given destination or 3805 * gateway address and cached link layer (MAC) address. This is used when ARP 3806 * informs us that a network-to-link-layer mapping may have changed. 3807 */ 3808 void 3809 nce_update_hw_changed(ncec_t *ncec, void *arg) 3810 { 3811 nce_hw_map_t *hwm = arg; 3812 ipaddr_t ncec_addr; 3813 3814 if (ncec->ncec_state != ND_REACHABLE) 3815 return; 3816 3817 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 3818 if (ncec_addr != hwm->hwm_addr) 3819 return; 3820 3821 mutex_enter(&ncec->ncec_lock); 3822 if (hwm->hwm_flags != 0) 3823 ncec->ncec_flags = hwm->hwm_flags; 3824 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); 3825 mutex_exit(&ncec->ncec_lock); 3826 } 3827 3828 void 3829 ncec_refhold(ncec_t *ncec) 3830 { 3831 mutex_enter(&(ncec)->ncec_lock); 3832 (ncec)->ncec_refcnt++; 3833 ASSERT((ncec)->ncec_refcnt != 0); 3834 #ifdef DEBUG 3835 ncec_trace_ref(ncec); 3836 #endif 3837 mutex_exit(&(ncec)->ncec_lock); 3838 } 3839 3840 void 3841 ncec_refhold_notr(ncec_t *ncec) 3842 { 3843 mutex_enter(&(ncec)->ncec_lock); 3844 (ncec)->ncec_refcnt++; 3845 ASSERT((ncec)->ncec_refcnt != 0); 3846 mutex_exit(&(ncec)->ncec_lock); 3847 } 3848 3849 static void 3850 ncec_refhold_locked(ncec_t *ncec) 3851 { 3852 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); 3853 (ncec)->ncec_refcnt++; 3854 #ifdef DEBUG 3855 ncec_trace_ref(ncec); 3856 #endif 3857 } 3858 3859 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */ 3860 void 3861 ncec_refrele(ncec_t *ncec) 3862 { 3863 mutex_enter(&(ncec)->ncec_lock); 3864 #ifdef DEBUG 3865 
	ncec_untrace_ref(ncec);
#endif
	ASSERT((ncec)->ncec_refcnt != 0);
	if (--(ncec)->ncec_refcnt == 0) {
		ncec_inactive(ncec);
	} else {
		mutex_exit(&(ncec)->ncec_lock);
	}
}

/* As ncec_refrele(), but without recording a trace entry. */
void
ncec_refrele_notr(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
	ASSERT((ncec)->ncec_refcnt != 0);
	if (--(ncec)->ncec_refcnt == 0) {
		ncec_inactive(ncec);
	} else {
		mutex_exit(&(ncec)->ncec_lock);
	}
}

/*
 * Common to IPv4 and IPv6.
 * Cancel any pending timer on ncec, then start a new one for ms
 * milliseconds.  Must be called without ncec_lock held.
 */
void
nce_restart_timer(ncec_t *ncec, uint_t ms)
{
	timeout_id_t tid;

	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));

	/* First cancel any running timer */
	mutex_enter(&ncec->ncec_lock);
	tid = ncec->ncec_timeout_id;
	ncec->ncec_timeout_id = 0;
	if (tid != 0) {
		/*
		 * Drop the lock across untimeout().  NOTE(review):
		 * presumably because untimeout() can block waiting for a
		 * running handler that may need ncec_lock -- confirm.
		 */
		mutex_exit(&ncec->ncec_lock);
		(void) untimeout(tid);
		mutex_enter(&ncec->ncec_lock);
	}

	/* Restart timer */
	nce_start_timer(ncec, ms);
	mutex_exit(&ncec->ncec_lock);
}

static void
nce_start_timer(ncec_t *ncec, uint_t ms)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	/*
	 * Don't start the timer if the ncec has been deleted, or if the timer
	 * is already running
	 */
	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
		/* A sub-tick interval is rounded up to one tick, never 0. */
		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
	}
}

/*
 * Lookup or create the ncec for an IPv4 multicast destination on ill.
 * For IRE_IF_RESOLVER ills the hardware mapping is computed here; for
 * IRE_IF_NORESOLVER ills the ill's resolution cookie is used instead.
 */
int
nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
	uchar_t *hw_addr;
	int err = 0;
	ip_stack_t *ipst = ill->ill_ipst;
	in6_addr_t dst6;
	nce_t *nce;

	ASSERT(!ill->ill_isv6);

	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
		goto done;
	}
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated, for IRE_IF_NORESOLVER, resolution cookie
		 * in the ill is copied in nce_add_v4().
		 */
		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
			return (ENOMEM);
		}
		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
	} else {
		/*
		 * IRE_IF_NORESOLVER type simply copies the resolution
		 * cookie passed in. So no hw_addr is needed.
		 */
		hw_addr = NULL;
	}
	ASSERT(flags & NCE_F_MCAST);
	ASSERT(flags & NCE_F_NONUD);
	/* nce_state will be computed by nce_add_common() */
	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
	    ND_UNCHANGED, &nce);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	if (err == 0)
		err = nce_add_v4_postprocess(nce);
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_phys_addr_length);
	if (err != 0) {
		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
		return (err);
	}
done:
	if (newnce != NULL)
		*newnce = nce;
	else
		nce_refrele(nce);
	return (0);
}

/*
 * This is used when scanning for "old" (least recently broadcast) NCEs. We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
#define	NCE_RESCHED_LIST_LEN	8

typedef struct {
	ill_t *ncert_ill;	/* ill whose entries are being collected */
	uint_t ncert_num;	/* number of entries gathered so far */
	ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;

/*
 * Pick the longest waiting NCEs for defense.
 */
/* ARGSUSED */
static int
ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
{
	nce_resched_t *ncert = arg;
	ncec_t **ncecs;
	ncec_t **ncec_max;
	ncec_t *ncec_temp;
	ncec_t *ncec = nce->nce_common;

	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
	/*
	 * Only reachable entries that are ready for announcement are eligible.
	 */
	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
		return (0);
	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
		/* Room left in the batch: hold the entry and stash it. */
		ncec_refhold(ncec);
		ncert->ncert_nces[ncert->ncert_num++] = ncec;
	} else {
		/*
		 * Batch is full: keep the entries with the oldest
		 * ncec_last_time_defended.  The swap loop below leaves the
		 * most recently defended candidate in `ncec', which is
		 * then released.
		 */
		ncecs = ncert->ncert_nces;
		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
		ncec_refhold(ncec);
		for (; ncecs < ncec_max; ncecs++) {
			ASSERT(ncec != NULL);
			if ((*ncecs)->ncec_last_time_defended >
			    ncec->ncec_last_time_defended) {
				ncec_temp = *ncecs;
				*ncecs = ncec;
				ncec = ncec_temp;
			}
		}
		ncec_refrele(ncec);
	}
	return (0);
}

/*
 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
 * doesn't happen very often (if at all), and thus it needn't be highly
 * optimized. (Note, though, that it's actually O(N) complexity, because the
 * outer loop is bounded by a constant rather than by the length of the list.)
 */
static void
nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
{
	ncec_t *ncec;
	ip_stack_t *ipst = ill->ill_ipst;
	uint_t i, defend_rate;

	i = ill->ill_defend_count;
	ill->ill_defend_count = 0;
	if (ill->ill_isv6)
		defend_rate = ipst->ips_ndp_defend_rate;
	else
		defend_rate = ipst->ips_arp_defend_rate;
	/* If none could be sitting around, then don't reschedule */
	if (i < defend_rate) {
		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
		return;
	}
	ncert->ncert_ill = ill;
	while (ill->ill_defend_count < defend_rate) {
		nce_walk_common(ill, ncec_reschedule, ncert);
		for (i = 0; i < ncert->ncert_num; i++) {

			ncec = ncert->ncert_nces[i];
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_flags |= NCE_F_DELAYED;
			mutex_exit(&ncec->ncec_lock);
			/*
			 * we plan to schedule this ncec, so incr the
			 * defend_count in anticipation.
			 */
			if (++ill->ill_defend_count >= defend_rate)
				break;
		}
		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
			break;
	}
}

/*
 * Check if the current rate-limiting parameters permit the sending
 * of another address defense announcement for both IPv4 and IPv6.
 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
 * permitted), and B_FALSE otherwise. The `defend_rate' parameter
 * determines how many address defense announcements are permitted
 * in any `defend_period' interval.
 */
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
	clock_t now = ddi_get_lbolt();
	ip_stack_t *ipst = ill->ill_ipst;
	clock_t start = ill->ill_defend_start;
	uint32_t elapsed, defend_period, defend_rate;
	nce_resched_t ncert;
	boolean_t ret;
	int i;

	if (ill->ill_isv6) {
		defend_period = ipst->ips_ndp_defend_period;
		defend_rate = ipst->ips_ndp_defend_rate;
	} else {
		defend_period = ipst->ips_arp_defend_period;
		defend_rate = ipst->ips_arp_defend_rate;
	}
	/* A defend_rate of zero disables defense announcements entirely. */
	if (defend_rate == 0)
		return (B_TRUE);
	bzero(&ncert, sizeof (ncert));
	mutex_enter(&ill->ill_lock);
	if (start > 0) {
		elapsed = now - start;
		if (elapsed > SEC_TO_TICK(defend_period)) {
			ill->ill_defend_start = now;
			/*
			 * nce_ill_reschedule will attempt to
			 * prevent starvation by rescheduling the
			 * oldest entries, which are marked with
			 * the NCE_F_DELAYED flag.
			 */
			nce_ill_reschedule(ill, &ncert);
		}
	} else {
		ill->ill_defend_start = now;
	}
	ASSERT(ill->ill_defend_count <= defend_rate);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_flags & NCE_F_DELAYED) {
		/*
		 * This ncec was rescheduled as one of the really old
		 * entries needing on-going defense. The
		 * ill_defend_count was already incremented in
		 * nce_ill_reschedule. Go ahead and send the announce.
		 */
		ncec->ncec_flags &= ~NCE_F_DELAYED;
		mutex_exit(&ncec->ncec_lock);
		ret = B_FALSE;
		goto done;
	}
	mutex_exit(&ncec->ncec_lock);
	if (ill->ill_defend_count < defend_rate)
		ill->ill_defend_count++;
	if (ill->ill_defend_count == defend_rate) {
		/*
		 * we are no longer allowed to send unbidden defense
		 * messages. Wait for rescheduling.
		 */
		ret = B_TRUE;
	} else {
		ret = B_FALSE;
	}
done:
	mutex_exit(&ill->ill_lock);
	/*
	 * After all the locks have been dropped we can restart nce timer,
	 * and refrele the delayed ncecs
	 */
	for (i = 0; i < ncert.ncert_num; i++) {
		clock_t xmit_interval;
		ncec_t *tmp;

		tmp = ncert.ncert_nces[i];
		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
		    B_FALSE);
		nce_restart_timer(tmp, xmit_interval);
		ncec_refrele(tmp);
	}
	return (ret);
}

/*
 * Send a Neighbor Advertisement for ncec's address to the all-hosts
 * multicast group.
 */
boolean_t
ndp_announce(ncec_t *ncec)
{
	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
	    nce_advert_flags(ncec)));
}

/*
 * Select a source ill and address for the resolution request (ARP or ND)
 * to be sent for ncec.  The chosen address is stored through *src (which
 * must be passed in unspecified) and the chosen ill is returned held;
 * NULL is returned if no usable source exists yet.
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t *mp;
	in6_addr_t src6;
	ipaddr_t src4;
	ill_t *ill = ncec->ncec_ill;
	ill_t *src_ill = NULL;
	ipif_t *ipif = NULL;
	boolean_t is_myaddr = NCE_MYADDR(ncec);
	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* DAD for a local address: probe using the address itself. */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t *ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses. Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6] below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface. We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		if (ipif == NULL && IS_IPMP(ill)) {
			/* Retry source selection on a group xmit ill. */
			ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}

/*
 * Apply an externally supplied hardware-address update for *addr.
 * If ipif is non-NULL, only the ncec on that ipif's ill is updated;
 * otherwise every ncec matching *addr is updated via
 * nce_update_hw_changed().
 */
void
ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
    uchar_t *hwaddr, int hwaddr_len, int flags)
{
	ill_t *ill;
	ncec_t *ncec;
	nce_t *nce;
	uint16_t new_state;

	ill = (ipif ? ipif->ipif_ill : NULL);
	if (ill != NULL) {
		/*
		 * only one ncec is possible
		 */
		nce = nce_lookup_v4(ill, addr);
		if (nce != NULL) {
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			if (NCE_ISREACHABLE(ncec))
				new_state = ND_UNCHANGED;
			else
				new_state = ND_STALE;
			/* NOTE: flags are replaced wholesale, not OR-ed in. */
			ncec->ncec_flags = flags;
			nce_update(ncec, new_state, hwaddr);
			mutex_exit(&ncec->ncec_lock);
			nce_refrele(nce);
			return;
		}
	} else {
		/*
		 * ill is wildcard; clean up all ncec's and ire's
		 * that match on addr.
		 */
		nce_hw_map_t hwm;

		hwm.hwm_addr = *addr;
		hwm.hwm_hwlen = hwaddr_len;
		hwm.hwm_hwaddr = hwaddr;
		hwm.hwm_flags = flags;

		ncec_walk_common(ipst->ips_ndp4, NULL,
		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
	}
}

/*
 * Common function to add ncec entries.
 * we always add the ncec with ncec_ill == ill, and always create
 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
 * ncec is !reachable.
 *
 * When the caller passes in an nce_state of ND_UNCHANGED,
 * nce_add_common() will determine the state of the created nce based
 * on the ill_net_type and nce_flags used. Otherwise, the nce will
 * be created with state set to the passed in nce_state.
4379 */ 4380 static int 4381 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 4382 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) 4383 { 4384 static ncec_t nce_nil; 4385 uchar_t *template = NULL; 4386 int err; 4387 ncec_t *ncec; 4388 ncec_t **ncep; 4389 ip_stack_t *ipst = ill->ill_ipst; 4390 uint16_t state; 4391 boolean_t fastprobe = B_FALSE; 4392 struct ndp_g_s *ndp; 4393 nce_t *nce = NULL; 4394 mblk_t *dlur_mp = NULL; 4395 4396 if (ill->ill_isv6) 4397 ndp = ill->ill_ipst->ips_ndp6; 4398 else 4399 ndp = ill->ill_ipst->ips_ndp4; 4400 4401 *retnce = NULL; 4402 4403 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 4404 4405 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 4406 ip0dbg(("nce_add_common: no addr\n")); 4407 return (EINVAL); 4408 } 4409 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 4410 ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); 4411 return (EINVAL); 4412 } 4413 4414 if (ill->ill_isv6) { 4415 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 4416 } else { 4417 ipaddr_t v4addr; 4418 4419 IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 4420 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); 4421 } 4422 4423 /* 4424 * The caller has ensured that there is no nce on ill, but there could 4425 * still be an nce_common_t for the address, so that we find exisiting 4426 * ncec_t strucutures first, and atomically add a new nce_t if 4427 * one is found. The ndp_g_lock ensures that we don't cross threads 4428 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not 4429 * compare for matches across the illgrp because this function is 4430 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, 4431 * with the nce_lookup_then_add_v* passing in the ipmp_ill where 4432 * appropriate. 
4433 */ 4434 ncec = *ncep; 4435 for (; ncec != NULL; ncec = ncec->ncec_next) { 4436 if (ncec->ncec_ill == ill) { 4437 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 4438 /* 4439 * We should never find *retnce to be 4440 * MYADDR, since the caller may then 4441 * incorrectly restart a DAD timer that's 4442 * already running. However, if we are in 4443 * forwarding mode, and the interface is 4444 * moving in/out of groups, the data 4445 * path ire lookup (e.g., ire_revalidate_nce) 4446 * may have determined that some destination 4447 * is offlink while the control path is adding 4448 * that address as a local address. 4449 * Recover from this case by failing the 4450 * lookup 4451 */ 4452 if (NCE_MYADDR(ncec)) 4453 return (ENXIO); 4454 *retnce = nce_ill_lookup_then_add(ill, ncec); 4455 if (*retnce != NULL) 4456 break; 4457 } 4458 } 4459 } 4460 if (*retnce != NULL) /* caller must trigger fastpath on nce */ 4461 return (0); 4462 4463 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); 4464 if (ncec == NULL) 4465 return (ENOMEM); 4466 *ncec = nce_nil; 4467 ncec->ncec_ill = ill; 4468 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 4469 ncec->ncec_flags = flags; 4470 ncec->ncec_ipst = ipst; /* No netstack_hold */ 4471 4472 if (!ill->ill_isv6) { 4473 ipaddr_t addr4; 4474 4475 /* 4476 * DAD probe interval and probe count are set based on 4477 * fast/slow probe settings. If the underlying link doesn't 4478 * have reliably up/down notifications or if we're working 4479 * with IPv4 169.254.0.0/16 Link Local Address space, then 4480 * don't use the fast timers. Otherwise, use them. 
4481 */ 4482 ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 4483 IN6_V4MAPPED_TO_IPADDR(addr, addr4); 4484 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) { 4485 fastprobe = B_TRUE; 4486 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) && 4487 !IS_IPV4_LL_SPACE(&addr4)) { 4488 ill_t *hwaddr_ill; 4489 4490 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr, 4491 hw_addr_len); 4492 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link) 4493 fastprobe = B_TRUE; 4494 } 4495 if (fastprobe) { 4496 ncec->ncec_xmit_interval = 4497 ipst->ips_arp_fastprobe_interval; 4498 ncec->ncec_pcnt = 4499 ipst->ips_arp_fastprobe_count; 4500 ncec->ncec_flags |= NCE_F_FAST; 4501 } else { 4502 ncec->ncec_xmit_interval = 4503 ipst->ips_arp_probe_interval; 4504 ncec->ncec_pcnt = 4505 ipst->ips_arp_probe_count; 4506 } 4507 if (NCE_PUBLISH(ncec)) { 4508 ncec->ncec_unsolicit_count = 4509 ipst->ips_ip_arp_publish_count; 4510 } 4511 } else { 4512 /* 4513 * probe interval is constant: ILL_PROBE_INTERVAL 4514 * probe count is constant: ND_MAX_UNICAST_SOLICIT 4515 */ 4516 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 4517 if (NCE_PUBLISH(ncec)) { 4518 ncec->ncec_unsolicit_count = 4519 ipst->ips_ip_ndp_unsolicit_count; 4520 } 4521 } 4522 ncec->ncec_rcnt = ill->ill_xmit_count; 4523 ncec->ncec_addr = *addr; 4524 ncec->ncec_qd_mp = NULL; 4525 ncec->ncec_refcnt = 1; /* for ncec getting created */ 4526 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); 4527 ncec->ncec_trace_disable = B_FALSE; 4528 4529 /* 4530 * ncec_lladdr holds link layer address 4531 */ 4532 if (hw_addr_len > 0) { 4533 template = kmem_alloc(hw_addr_len, KM_NOSLEEP); 4534 if (template == NULL) { 4535 err = ENOMEM; 4536 goto err_ret; 4537 } 4538 ncec->ncec_lladdr = template; 4539 ncec->ncec_lladdr_length = hw_addr_len; 4540 bzero(ncec->ncec_lladdr, hw_addr_len); 4541 } 4542 if ((flags & NCE_F_BCAST) != 0) { 4543 state = ND_REACHABLE; 4544 ASSERT(hw_addr_len > 0); 4545 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 4546 state = ND_INITIAL; 
4547 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 4548 /* 4549 * NORESOLVER entries are always created in the REACHABLE 4550 * state. 4551 */ 4552 state = ND_REACHABLE; 4553 if (ill->ill_phys_addr_length == IP_ADDR_LEN && 4554 ill->ill_mactype != DL_IPV4 && 4555 ill->ill_mactype != DL_6TO4) { 4556 /* 4557 * We create a nce_res_mp with the IP nexthop address 4558 * as the destination address if the physical length 4559 * is exactly 4 bytes for point-to-multipoint links 4560 * that do their own resolution from IP to link-layer 4561 * address (e.g. IP over X.25). 4562 */ 4563 bcopy((uchar_t *)addr, 4564 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4565 } 4566 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && 4567 ill->ill_mactype != DL_IPV6) { 4568 /* 4569 * We create a nce_res_mp with the IP nexthop address 4570 * as the destination address if the physical legnth 4571 * is exactly 16 bytes for point-to-multipoint links 4572 * that do their own resolution from IP to link-layer 4573 * address. 4574 */ 4575 bcopy((uchar_t *)addr, 4576 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4577 } 4578 /* 4579 * Since NUD is not part of the base IPv4 protocol definition, 4580 * IPv4 neighbor entries on NORESOLVER interfaces will never 4581 * age, and are marked NCE_F_NONUD. 4582 */ 4583 if (!ill->ill_isv6) 4584 ncec->ncec_flags |= NCE_F_NONUD; 4585 } else if (ill->ill_net_type == IRE_LOOPBACK) { 4586 state = ND_REACHABLE; 4587 } 4588 4589 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { 4590 /* 4591 * We are adding an ncec with a deterministic hw_addr, 4592 * so the state can only be one of {REACHABLE, STALE, PROBE}. 4593 * 4594 * if we are adding a unicast ncec for the local address 4595 * it would be REACHABLE; we would be adding a ND_STALE entry 4596 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own 4597 * addresses are added in PROBE to trigger DAD. 
4598 */ 4599 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || 4600 ill->ill_net_type == IRE_IF_NORESOLVER) 4601 state = ND_REACHABLE; 4602 else if (!NCE_PUBLISH(ncec)) 4603 state = ND_STALE; 4604 else 4605 state = ND_PROBE; 4606 if (hw_addr != NULL) 4607 nce_set_ll(ncec, hw_addr); 4608 } 4609 /* caller overrides internally computed state */ 4610 if (nce_state != ND_UNCHANGED) 4611 state = nce_state; 4612 4613 if (state == ND_PROBE) 4614 ncec->ncec_flags |= NCE_F_UNVERIFIED; 4615 4616 ncec->ncec_state = state; 4617 4618 if (state == ND_REACHABLE) { 4619 ncec->ncec_last = ncec->ncec_init_time = 4620 TICK_TO_MSEC(ddi_get_lbolt64()); 4621 } else { 4622 ncec->ncec_last = 0; 4623 if (state == ND_INITIAL) 4624 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64()); 4625 } 4626 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), 4627 offsetof(ncec_cb_t, ncec_cb_node)); 4628 /* 4629 * have all the memory allocations out of the way before taking locks 4630 * and adding the nce. 4631 */ 4632 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4633 if (nce == NULL) { 4634 err = ENOMEM; 4635 goto err_ret; 4636 } 4637 if (ncec->ncec_lladdr != NULL || 4638 ill->ill_net_type == IRE_IF_NORESOLVER) { 4639 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4640 ill->ill_phys_addr_length, ill->ill_sap, 4641 ill->ill_sap_length); 4642 if (dlur_mp == NULL) { 4643 err = ENOMEM; 4644 goto err_ret; 4645 } 4646 } 4647 4648 /* 4649 * Atomically ensure that the ill is not CONDEMNED, before 4650 * adding the NCE. 
4651 */ 4652 mutex_enter(&ill->ill_lock); 4653 if (ill->ill_state_flags & ILL_CONDEMNED) { 4654 mutex_exit(&ill->ill_lock); 4655 err = EINVAL; 4656 goto err_ret; 4657 } 4658 if (!NCE_MYADDR(ncec) && 4659 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { 4660 mutex_exit(&ill->ill_lock); 4661 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); 4662 err = EINVAL; 4663 goto err_ret; 4664 } 4665 /* 4666 * Acquire the ncec_lock even before adding the ncec to the list 4667 * so that it cannot get deleted after the ncec is added, but 4668 * before we add the nce. 4669 */ 4670 mutex_enter(&ncec->ncec_lock); 4671 if ((ncec->ncec_next = *ncep) != NULL) 4672 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; 4673 *ncep = ncec; 4674 ncec->ncec_ptpn = ncep; 4675 4676 /* Bump up the number of ncec's referencing this ill */ 4677 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4678 (char *), "ncec", (void *), ncec); 4679 ill->ill_ncec_cnt++; 4680 /* 4681 * Since we hold the ncec_lock at this time, the ncec cannot be 4682 * condemned, and we can safely add the nce. 4683 */ 4684 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); 4685 mutex_exit(&ncec->ncec_lock); 4686 mutex_exit(&ill->ill_lock); 4687 4688 /* caller must trigger fastpath on *retnce */ 4689 return (0); 4690 4691 err_ret: 4692 if (ncec != NULL) 4693 kmem_cache_free(ncec_cache, ncec); 4694 if (nce != NULL) 4695 kmem_cache_free(nce_cache, nce); 4696 freemsg(dlur_mp); 4697 if (template != NULL) 4698 kmem_free(template, ill->ill_phys_addr_length); 4699 return (err); 4700 } 4701 4702 /* 4703 * take a ref on the nce 4704 */ 4705 void 4706 nce_refhold(nce_t *nce) 4707 { 4708 mutex_enter(&nce->nce_lock); 4709 nce->nce_refcnt++; 4710 ASSERT((nce)->nce_refcnt != 0); 4711 mutex_exit(&nce->nce_lock); 4712 } 4713 4714 /* 4715 * release a ref on the nce; In general, this 4716 * cannot be called with locks held because nce_inactive 4717 * may result in nce_inactive which will take the ill_lock, 4718 * do ipif_ill_refrele_tail etc. 
Thus the one exception
 * where this can be called with locks held is when the caller
 * is certain that the nce_refcnt is sufficient to prevent
 * the invocation of nce_inactive.
 */
void
nce_refrele(nce_t *nce)
{
	ASSERT((nce)->nce_refcnt != 0);
	mutex_enter(&nce->nce_lock);
	if (--nce->nce_refcnt == 0)
		nce_inactive(nce);	/* destroys the mutex */
	else
		mutex_exit(&nce->nce_lock);
}

/*
 * free the nce after all refs have gone away.
 * Called with the nce_lock held (nce_refrele dropped the last ref);
 * the lock is destroyed here along with the nce itself.
 */
static void
nce_inactive(nce_t *nce)
{
	ill_t *ill = nce->nce_ill;

	ASSERT(nce->nce_refcnt == 0);

	/* drop the nce's hold on its ncec before tearing the nce down */
	ncec_refrele_notr(nce->nce_common);
	nce->nce_common = NULL;
	freemsg(nce->nce_fp_mp);
	freemsg(nce->nce_dlur_mp);

	mutex_enter(&ill->ill_lock);
	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt--;
	nce->nce_ill = NULL;
	/*
	 * If the number of ncec's associated with this ill have dropped
	 * to zero, check whether we need to restart any operation that
	 * is waiting for this to happen.
	 */
	if (ILL_DOWN_OK(ill)) {
		/* ipif_ill_refrele_tail drops the ill_lock */
		ipif_ill_refrele_tail(ill);
	} else {
		mutex_exit(&ill->ill_lock);
	}

	mutex_destroy(&nce->nce_lock);
	kmem_cache_free(nce_cache, nce);
}

/*
 * Add an nce to the ill_nce list.
 */
static nce_t *
nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
{
	bzero(nce, sizeof (*nce));
	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
	nce->nce_common = ncec;
	nce->nce_addr = ncec->ncec_addr;
	nce->nce_ill = ill;
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt++;

	nce->nce_refcnt = 1;	/* for the thread */
	ncec->ncec_refcnt++;	/* want ncec_refhold_locked_notr(ncec) */
	nce->nce_dlur_mp = dlur_mp;

	/* add nce to the ill's fastpath list. */
	nce->nce_refcnt++;	/* for the list */
	list_insert_head(&ill->ill_nce, nce);
	return (nce);
}

/*
 * Allocate an nce (and its dlur mblk, when needed) for ncec on ill and
 * link it via nce_add_impl(). Both the ill_lock and the ncec_lock must
 * be held. Returns NULL on allocation failure.
 */
static nce_t *
nce_add(ill_t *ill, ncec_t *ncec)
{
	nce_t	*nce;
	mblk_t	*dlur_mp = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL)
		return (NULL);
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
	}
	return (nce_add_impl(ill, ncec, nce, dlur_mp));
}

/*
 * remove the nce from the ill_fastpath list
 */
void
nce_delete(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	mutex_enter(&nce->nce_lock);
	if (nce->nce_is_condemned) {
		/*
		 * some other thread has removed this nce from the ill_nce list
		 */
		mutex_exit(&nce->nce_lock);
		return;
	}
	nce->nce_is_condemned = B_TRUE;
	mutex_exit(&nce->nce_lock);

	list_remove(&ill->ill_nce, nce);
	/*
	 * even though we are holding the ill_lock, it is ok to
	 * call nce_refrele here because we know that we should have
	 * at least 2 refs on the nce: one for the thread, and one
	 * for the list. The refrele below will release the one for
	 * the list.
	 */
	nce_refrele(nce);
}

/*
 * Find the nce for addr on ill's ill_nce list. The returned nce (if any)
 * is refheld; because the ill_lock is held across the lookup, a found nce
 * cannot yet be condemned. Caller must hold the ill_lock.
 */
nce_t *
nce_lookup(ill_t *ill, const in6_addr_t *addr)
{
	nce_t *nce = NULL;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
			break;
	}

	/*
	 * if we found the nce on the ill_nce list while holding
	 * the ill_lock, then it cannot be condemned yet.
	 */
	if (nce != NULL) {
		ASSERT(!nce->nce_is_condemned);
		nce_refhold(nce);
	}
	return (nce);
}

/*
 * Walk the ill_nce list on ill. The callback function func() cannot perform
 * any destructive actions. A non-zero return from func() terminates the walk
 * early.
 */
static void
nce_walk_common(ill_t *ill, pfi_t func, void *arg)
{
	nce_t	*nce = NULL, *nce_next;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		nce_next = list_next(&ill->ill_nce, nce);
		if (func(ill, nce, arg) != 0)
			break;
		nce = nce_next;
	}
}

/* nce_walk_common() with the ill_lock acquired and released here. */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}

/*
 * Delete nce entries on ill. When flushall is B_FALSE, entries that
 * publish a local address (NCE_PUBLISH) are retained.
 */
void
nce_flush(ill_t *ill, boolean_t flushall)
{
	nce_t	*nce, *nce_next;
	list_t	dead;

	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		nce_next = list_next(&ill->ill_nce, nce);
		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
			nce = nce_next;
			continue;
		}
		/*
		 * nce_delete requires that the caller should either not
		 * be holding locks, or should hold a ref to ensure that
		 * we won't hit ncec_inactive. So take a ref and clean up
		 * after the list is flushed.
		 */
		nce_refhold(nce);
		nce_delete(nce);
		list_insert_tail(&dead, nce);
		nce = nce_next;
	}
	mutex_exit(&ill->ill_lock);
	/* drop the holds taken above, now that no locks are held */
	while ((nce = list_head(&dead)) != NULL) {
		list_remove(&dead, nce);
		nce_refrele(nce);
	}
	ASSERT(list_is_empty(&dead));
	list_destroy(&dead);
}

/*
 * Return an interval that is anywhere in the [1 .. intv] range when
 * initial_time is set; otherwise fuzz intv by +/- 20%.
 */
static clock_t
nce_fuzz_interval(clock_t intv, boolean_t initial_time)
{
	clock_t rnd, frac;

	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
	/* Note that clock_t is signed; must chop off bits */
	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
	if (initial_time) {
		if (intv <= 0)
			intv = 1;
		else
			intv = (rnd % intv) + 1;
	} else {
		/* Compute 'frac' as 20% of the configured interval */
		if ((frac = intv / 5) <= 1)
			frac = 2;
		/* Set intv randomly in the range [intv-frac .. intv+frac] */
		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
			intv = 1;
	}
	return (intv);
}

/*
 * An ncec on an IPMP meta-interface has been resolved: drain the packets
 * that were queued on it (ncec_qd_mp), transmitting each one over a
 * suitable under-ill of the group (the ill owning the packet's source
 * address when one exists, else the group xmit rotor), and then run any
 * pending resolution callbacks. Packets with no usable under-ill are
 * dropped and counted against ipIfStatsOutDiscards.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t		*mp;
	uint_t		pkt_len;
	iaflags_t	ixaflags = IXAF_NO_TRACE;
	nce_t		*under_nce;
	ill_t		*ill = ncec->ncec_ill;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t		*src_ipif = NULL;
	ip_stack_t	*ipst = ill->ill_ipst;
	ill_t		*send_ill;
	uint_t		nprobes;

	ASSERT(IS_IPMP(ill));

	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We identify the former case by looking for the
			 * ncec_nprobes: the first ncec_nprobes packets are
			 * probes;
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec); /* complete callbacks */
}