1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/types.h> 26 #include <sys/stream.h> 27 #include <sys/stropts.h> 28 #include <sys/strsun.h> 29 #include <sys/sysmacros.h> 30 #include <sys/errno.h> 31 #include <sys/dlpi.h> 32 #include <sys/socket.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/cmn_err.h> 36 #include <sys/debug.h> 37 #include <sys/vtrace.h> 38 #include <sys/kmem.h> 39 #include <sys/zone.h> 40 #include <sys/ethernet.h> 41 #include <sys/sdt.h> 42 #include <sys/mac.h> 43 44 #include <net/if.h> 45 #include <net/if_types.h> 46 #include <net/if_dl.h> 47 #include <net/route.h> 48 #include <netinet/in.h> 49 #include <netinet/ip6.h> 50 #include <netinet/icmp6.h> 51 52 #include <inet/common.h> 53 #include <inet/mi.h> 54 #include <inet/mib2.h> 55 #include <inet/nd.h> 56 #include <inet/ip.h> 57 #include <inet/ip_impl.h> 58 #include <inet/ipclassifier.h> 59 #include <inet/ip_if.h> 60 #include <inet/ip_ire.h> 61 #include <inet/ip_rts.h> 62 #include <inet/ip6.h> 63 #include <inet/ip_ndp.h> 64 #include <inet/sctp_ip.h> 65 #include <inet/ip_arp.h> 66 #include <inet/ip2mac_impl.h> 67 68 #define ANNOUNCE_INTERVAL(isv6) \ 69 (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \ 70 ipst->ips_ip_arp_publish_interval) 71 72 #define DEFENSE_INTERVAL(isv6) \ 73 (isv6 ? ipst->ips_ndp_defend_interval : \ 74 ipst->ips_arp_defend_interval) 75 76 /* Non-tunable probe interval, based on link capabilities */ 77 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) 78 79 /* 80 * The IPv4 Link Local address space is special; we do extra duplicate checking 81 * there, as the entire assignment mechanism rests on random numbers. 82 */ 83 #define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ 84 ((uchar_t *)ptr)[1] == 254) 85 86 /* 87 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed 88 * in to the ncec*add* functions. 89 * 90 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that 91 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means 92 * that we will respond to requests for the protocol address. 93 */ 94 #define NCE_EXTERNAL_FLAGS_MASK \ 95 (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \ 96 NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \ 97 NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC) 98 99 /* 100 * Lock ordering: 101 * 102 * ndp_g_lock -> ill_lock -> ncec_lock 103 * 104 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and 105 * ncec_next. ncec_lock protects the contents of the NCE (particularly 106 * ncec_refcnt). 107 */ 108 109 static void nce_cleanup_list(ncec_t *ncec); 110 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr); 111 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *, 112 ncec_t *); 113 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *); 114 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr, 115 uint16_t ncec_flags, nce_t **newnce); 116 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 117 uint16_t ncec_flags, nce_t **newnce); 118 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation, 119 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender, 120 const in6_addr_t *target, int flag); 121 static void ncec_refhold_locked(ncec_t *); 122 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); 123 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); 124 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 125 uint16_t, uint16_t, nce_t **); 126 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *); 127 static nce_t *nce_add(ill_t *, ncec_t *); 128 static void nce_inactive(nce_t *); 129 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); 130 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); 131 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 132 uint16_t, uint16_t, nce_t **); 133 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *, 134 uint16_t, uint16_t, nce_t **); 135 static int nce_add_v6_postprocess(nce_t *); 136 static int nce_add_v4_postprocess(nce_t *); 137 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *); 138 static clock_t nce_fuzz_interval(clock_t, boolean_t); 139 static void nce_resolv_ipmp_ok(ncec_t *); 140 static void nce_walk_common(ill_t *, pfi_t, void *); 141 static void nce_start_timer(ncec_t *, uint_t); 142 static nce_t *nce_fastpath_create(ill_t *, ncec_t *); 143 static void nce_fastpath_trigger(nce_t *); 144 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *); 145 146 #ifdef DEBUG 147 static void ncec_trace_cleanup(const ncec_t *); 148 #endif 149 150 #define NCE_HASH_PTR_V4(ipst, addr) \ 151 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])) 152 153 #define NCE_HASH_PTR_V6(ipst, addr) \ 154 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ 155 NCE_TABLE_SIZE)])) 156 157 extern kmem_cache_t *ncec_cache; 158 extern kmem_cache_t *nce_cache; 159 160 /* 161 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe 162 * If src_ill is not null, the ncec_addr is bound to src_ill. The 163 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where 164 * the probe is sent on the ncec_ill (in the non-IPMP case) or the 165 * IPMP cast_ill (in the IPMP case). 166 * 167 * Note that the probe interval is based on the src_ill for IPv6, and 168 * the ncec_xmit_interval for IPv4. 169 */ 170 static void 171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) 172 { 173 boolean_t dropped; 174 uint32_t probe_interval; 175 176 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST)); 177 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST)); 178 if (ncec->ncec_ipversion == IPV6_VERSION) { 179 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 180 ncec->ncec_lladdr, ncec->ncec_lladdr_length, 181 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); 182 probe_interval = ILL_PROBE_INTERVAL(src_ill); 183 } else { 184 /* IPv4 DAD delay the initial probe. */ 185 if (send_probe) 186 dropped = arp_probe(ncec); 187 else 188 dropped = B_TRUE; 189 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval, 190 !send_probe); 191 } 192 if (!dropped) { 193 mutex_enter(&ncec->ncec_lock); 194 ncec->ncec_pcnt--; 195 mutex_exit(&ncec->ncec_lock); 196 } 197 nce_restart_timer(ncec, probe_interval); 198 } 199 200 /* 201 * Compute default flags to use for an advertisement of this ncec's address. 202 */ 203 static int 204 nce_advert_flags(const ncec_t *ncec) 205 { 206 int flag = 0; 207 208 if (ncec->ncec_flags & NCE_F_ISROUTER) 209 flag |= NDP_ISROUTER; 210 if (!(ncec->ncec_flags & NCE_F_ANYCAST)) 211 flag |= NDP_ORIDE; 212 213 return (flag); 214 } 215 216 /* 217 * NDP Cache Entry creation routine. 218 * This routine must always be called with ndp6->ndp_g_lock held. 219 */ 220 int 221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 222 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 223 { 224 int err; 225 nce_t *nce; 226 227 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 228 ASSERT(ill != NULL && ill->ill_isv6); 229 230 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state, 231 &nce); 232 if (err != 0) 233 return (err); 234 ASSERT(newnce != NULL); 235 *newnce = nce; 236 return (err); 237 } 238 239 /* 240 * Post-processing routine to be executed after nce_add_v6(). This function 241 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 242 * and must be called without any locks held. 243 */ 244 int 245 nce_add_v6_postprocess(nce_t *nce) 246 { 247 ncec_t *ncec = nce->nce_common; 248 boolean_t dropped = B_FALSE; 249 uchar_t *hw_addr = ncec->ncec_lladdr; 250 uint_t hw_addr_len = ncec->ncec_lladdr_length; 251 ill_t *ill = ncec->ncec_ill; 252 int err = 0; 253 uint16_t flags = ncec->ncec_flags; 254 ip_stack_t *ipst = ill->ill_ipst; 255 boolean_t trigger_fastpath = B_TRUE; 256 257 /* 258 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 259 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 260 * We call nce_fastpath from nce_update if the link layer address of 261 * the peer changes from nce_update 262 */ 263 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || 264 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER)) 265 trigger_fastpath = B_FALSE; 266 267 if (trigger_fastpath) 268 nce_fastpath_trigger(nce); 269 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 270 ill_t *hwaddr_ill; 271 /* 272 * Unicast entry that needs DAD. 273 */ 274 if (IS_IPMP(ill)) { 275 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 276 hw_addr, hw_addr_len); 277 } else { 278 hwaddr_ill = ill; 279 } 280 nce_dad(ncec, hwaddr_ill, B_TRUE); 281 err = EINPROGRESS; 282 } else if (flags & NCE_F_UNSOL_ADV) { 283 /* 284 * We account for the transmit below by assigning one 285 * less than the ndd variable. Subsequent decrements 286 * are done in nce_timer. 287 */ 288 mutex_enter(&ncec->ncec_lock); 289 ncec->ncec_unsolicit_count = 290 ipst->ips_ip_ndp_unsolicit_count - 1; 291 mutex_exit(&ncec->ncec_lock); 292 dropped = ndp_xmit(ill, 293 ND_NEIGHBOR_ADVERT, 294 hw_addr, 295 hw_addr_len, 296 &ncec->ncec_addr, /* Source and target of the adv */ 297 &ipv6_all_hosts_mcast, /* Destination of the packet */ 298 nce_advert_flags(ncec)); 299 mutex_enter(&ncec->ncec_lock); 300 if (dropped) 301 ncec->ncec_unsolicit_count++; 302 else 303 ncec->ncec_last_time_defended = ddi_get_lbolt(); 304 if (ncec->ncec_unsolicit_count != 0) { 305 nce_start_timer(ncec, 306 ipst->ips_ip_ndp_unsolicit_interval); 307 } 308 mutex_exit(&ncec->ncec_lock); 309 } 310 return (err); 311 } 312 313 /* 314 * Atomically lookup and add (if needed) Neighbor Cache information for 315 * an address. 316 * 317 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 318 * are always added pointing at the ipmp_ill. Thus, when the ill passed 319 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 320 * entries will be created, both pointing at the same ncec_t. The nce_t 321 * entries will have their nce_ill set to the ipmp_ill and the under_ill 322 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 323 * Local addresses are always created on the ill passed to nce_add_v6. 324 */ 325 int 326 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 327 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 328 { 329 int err = 0; 330 ip_stack_t *ipst = ill->ill_ipst; 331 nce_t *nce, *upper_nce = NULL; 332 ill_t *in_ill = ill; 333 boolean_t need_ill_refrele = B_FALSE; 334 335 if (flags & NCE_F_MCAST) { 336 /* 337 * hw_addr will be figured out in nce_set_multicast_v6; 338 * caller has to select the cast_ill 339 */ 340 ASSERT(hw_addr == NULL); 341 ASSERT(!IS_IPMP(ill)); 342 err = nce_set_multicast_v6(ill, addr, flags, newnce); 343 return (err); 344 } 345 ASSERT(ill->ill_isv6); 346 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 347 ill = ipmp_ill_hold_ipmp_ill(ill); 348 if (ill == NULL) 349 return (ENXIO); 350 need_ill_refrele = B_TRUE; 351 } 352 353 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 354 nce = nce_lookup_addr(ill, addr); 355 if (nce == NULL) { 356 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, 357 &nce); 358 } else { 359 err = EEXIST; 360 } 361 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 362 if (err == 0) 363 err = nce_add_v6_postprocess(nce); 364 if (in_ill != ill && nce != NULL) { 365 nce_t *under_nce = NULL; 366 367 /* 368 * in_ill was the under_ill. Try to create the under_nce. 369 * Hold the ill_g_lock to prevent changes to group membership 370 * until we are done. 371 */ 372 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 373 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 374 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 375 ill_t *, ill); 376 rw_exit(&ipst->ips_ill_g_lock); 377 err = ENXIO; 378 nce_refrele(nce); 379 nce = NULL; 380 goto bail; 381 } 382 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 383 if (under_nce == NULL) { 384 rw_exit(&ipst->ips_ill_g_lock); 385 err = EINVAL; 386 nce_refrele(nce); 387 nce = NULL; 388 goto bail; 389 } 390 rw_exit(&ipst->ips_ill_g_lock); 391 upper_nce = nce; 392 nce = under_nce; /* will be returned to caller */ 393 if (NCE_ISREACHABLE(nce->nce_common)) 394 nce_fastpath_trigger(under_nce); 395 } 396 /* nce_refrele is deferred until the lock is dropped */ 397 if (nce != NULL) { 398 if (newnce != NULL) 399 *newnce = nce; 400 else 401 nce_refrele(nce); 402 } 403 bail: 404 if (upper_nce != NULL) 405 nce_refrele(upper_nce); 406 if (need_ill_refrele) 407 ill_refrele(ill); 408 return (err); 409 } 410 411 /* 412 * Remove all the CONDEMNED nces from the appropriate hash table. 413 * We create a private list of NCEs, these may have ires pointing 414 * to them, so the list will be passed through to clean up dependent 415 * ires and only then we can do ncec_refrele() which can make NCE inactive. 416 */ 417 static void 418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) 419 { 420 ncec_t *ncec1; 421 ncec_t **ptpn; 422 423 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 424 ASSERT(ndp->ndp_g_walker == 0); 425 for (; ncec; ncec = ncec1) { 426 ncec1 = ncec->ncec_next; 427 mutex_enter(&ncec->ncec_lock); 428 if (NCE_ISCONDEMNED(ncec)) { 429 ptpn = ncec->ncec_ptpn; 430 ncec1 = ncec->ncec_next; 431 if (ncec1 != NULL) 432 ncec1->ncec_ptpn = ptpn; 433 *ptpn = ncec1; 434 ncec->ncec_ptpn = NULL; 435 ncec->ncec_next = NULL; 436 ncec->ncec_next = *free_nce_list; 437 *free_nce_list = ncec; 438 } 439 mutex_exit(&ncec->ncec_lock); 440 } 441 } 442 443 /* 444 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup() 445 * will return this NCE. Also no new timeouts will 446 * be started (See nce_restart_timer). 447 * 2. Cancel any currently running timeouts. 448 * 3. If there is an ndp walker, return. The walker will do the cleanup. 449 * This ensures that walkers see a consistent list of NCEs while walking. 450 * 4. Otherwise remove the NCE from the list of NCEs 451 */ 452 void 453 ncec_delete(ncec_t *ncec) 454 { 455 ncec_t **ptpn; 456 ncec_t *ncec1; 457 int ipversion = ncec->ncec_ipversion; 458 ndp_g_t *ndp; 459 ip_stack_t *ipst = ncec->ncec_ipst; 460 461 if (ipversion == IPV4_VERSION) 462 ndp = ipst->ips_ndp4; 463 else 464 ndp = ipst->ips_ndp6; 465 466 /* Serialize deletes */ 467 mutex_enter(&ncec->ncec_lock); 468 if (NCE_ISCONDEMNED(ncec)) { 469 /* Some other thread is doing the delete */ 470 mutex_exit(&ncec->ncec_lock); 471 return; 472 } 473 /* 474 * Caller has a refhold. Also 1 ref for being in the list. Thus 475 * refcnt has to be >= 2 476 */ 477 ASSERT(ncec->ncec_refcnt >= 2); 478 ncec->ncec_flags |= NCE_F_CONDEMNED; 479 mutex_exit(&ncec->ncec_lock); 480 481 /* Count how many condemned ires for kmem_cache callback */ 482 atomic_inc_32(&ipst->ips_num_nce_condemned); 483 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 484 485 /* Complete any waiting callbacks */ 486 ncec_cb_dispatch(ncec); 487 488 /* 489 * Cancel any running timer. Timeout can't be restarted 490 * since CONDEMNED is set. Can't hold ncec_lock across untimeout. 491 * Passing invalid timeout id is fine. 492 */ 493 if (ncec->ncec_timeout_id != 0) { 494 (void) untimeout(ncec->ncec_timeout_id); 495 ncec->ncec_timeout_id = 0; 496 } 497 498 mutex_enter(&ndp->ndp_g_lock); 499 if (ncec->ncec_ptpn == NULL) { 500 /* 501 * The last ndp walker has already removed this ncec from 502 * the list after we marked the ncec CONDEMNED and before 503 * we grabbed the global lock. 504 */ 505 mutex_exit(&ndp->ndp_g_lock); 506 return; 507 } 508 if (ndp->ndp_g_walker > 0) { 509 /* 510 * Can't unlink. The walker will clean up 511 */ 512 ndp->ndp_g_walker_cleanup = B_TRUE; 513 mutex_exit(&ndp->ndp_g_lock); 514 return; 515 } 516 517 /* 518 * Now remove the ncec from the list. nce_restart_timer won't restart 519 * the timer since it is marked CONDEMNED. 520 */ 521 ptpn = ncec->ncec_ptpn; 522 ncec1 = ncec->ncec_next; 523 if (ncec1 != NULL) 524 ncec1->ncec_ptpn = ptpn; 525 *ptpn = ncec1; 526 ncec->ncec_ptpn = NULL; 527 ncec->ncec_next = NULL; 528 mutex_exit(&ndp->ndp_g_lock); 529 530 /* Removed from ncec_ptpn/ncec_next list */ 531 ncec_refrele_notr(ncec); 532 } 533 534 void 535 ncec_inactive(ncec_t *ncec) 536 { 537 mblk_t **mpp; 538 ill_t *ill = ncec->ncec_ill; 539 ip_stack_t *ipst = ncec->ncec_ipst; 540 541 ASSERT(ncec->ncec_refcnt == 0); 542 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 543 544 /* Count how many condemned nces for kmem_cache callback */ 545 if (NCE_ISCONDEMNED(ncec)) 546 atomic_add_32(&ipst->ips_num_nce_condemned, -1); 547 548 /* Free all allocated messages */ 549 mpp = &ncec->ncec_qd_mp; 550 while (*mpp != NULL) { 551 mblk_t *mp; 552 553 mp = *mpp; 554 *mpp = mp->b_next; 555 556 inet_freemsg(mp); 557 } 558 /* 559 * must have been cleaned up in ncec_delete 560 */ 561 ASSERT(list_is_empty(&ncec->ncec_cb)); 562 list_destroy(&ncec->ncec_cb); 563 /* 564 * free the ncec_lladdr if one was allocated in nce_add_common() 565 */ 566 if (ncec->ncec_lladdr_length > 0) 567 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length); 568 569 #ifdef DEBUG 570 ncec_trace_cleanup(ncec); 571 #endif 572 573 mutex_enter(&ill->ill_lock); 574 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 575 (char *), "ncec", (void *), ncec); 576 ill->ill_ncec_cnt--; 577 ncec->ncec_ill = NULL; 578 /* 579 * If the number of ncec's associated with this ill have dropped 580 * to zero, check whether we need to restart any operation that 581 * is waiting for this to happen. 582 */ 583 if (ILL_DOWN_OK(ill)) { 584 /* ipif_ill_refrele_tail drops the ill_lock */ 585 ipif_ill_refrele_tail(ill); 586 } else { 587 mutex_exit(&ill->ill_lock); 588 } 589 590 mutex_destroy(&ncec->ncec_lock); 591 kmem_cache_free(ncec_cache, ncec); 592 } 593 594 /* 595 * ncec_walk routine. Delete the ncec if it is associated with the ill 596 * that is going away. Always called as a writer. 597 */ 598 void 599 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg) 600 { 601 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) { 602 ncec_delete(ncec); 603 } 604 } 605 606 /* 607 * Neighbor Cache cleanup logic for a list of ncec_t entries. 608 */ 609 static void 610 nce_cleanup_list(ncec_t *ncec) 611 { 612 ncec_t *ncec_next; 613 614 ASSERT(ncec != NULL); 615 while (ncec != NULL) { 616 ncec_next = ncec->ncec_next; 617 ncec->ncec_next = NULL; 618 619 /* 620 * It is possible for the last ndp walker (this thread) 621 * to come here after ncec_delete has marked the ncec CONDEMNED 622 * and before it has removed the ncec from the fastpath list 623 * or called untimeout. So we need to do it here. It is safe 624 * for both ncec_delete and this thread to do it twice or 625 * even simultaneously since each of the threads has a 626 * reference on the ncec. 627 */ 628 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 629 /* 630 * Cancel any running timer. Timeout can't be restarted 631 * since CONDEMNED is set. The ncec_lock can't be 632 * held across untimeout though passing invalid timeout 633 * id is fine. 634 */ 635 if (ncec->ncec_timeout_id != 0) { 636 (void) untimeout(ncec->ncec_timeout_id); 637 ncec->ncec_timeout_id = 0; 638 } 639 /* Removed from ncec_ptpn/ncec_next list */ 640 ncec_refrele_notr(ncec); 641 ncec = ncec_next; 642 } 643 } 644 645 /* 646 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 647 */ 648 boolean_t 649 nce_restart_dad(ncec_t *ncec) 650 { 651 boolean_t started; 652 ill_t *ill, *hwaddr_ill; 653 654 if (ncec == NULL) 655 return (B_FALSE); 656 ill = ncec->ncec_ill; 657 mutex_enter(&ncec->ncec_lock); 658 if (ncec->ncec_state == ND_PROBE) { 659 mutex_exit(&ncec->ncec_lock); 660 started = B_TRUE; 661 } else if (ncec->ncec_state == ND_REACHABLE) { 662 ASSERT(ncec->ncec_lladdr != NULL); 663 ncec->ncec_state = ND_PROBE; 664 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 665 /* 666 * Slight cheat here: we don't use the initial probe delay 667 * for IPv4 in this obscure case. 668 */ 669 mutex_exit(&ncec->ncec_lock); 670 if (IS_IPMP(ill)) { 671 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 672 ncec->ncec_lladdr, ncec->ncec_lladdr_length); 673 } else { 674 hwaddr_ill = ill; 675 } 676 nce_dad(ncec, hwaddr_ill, B_TRUE); 677 started = B_TRUE; 678 } else { 679 mutex_exit(&ncec->ncec_lock); 680 started = B_FALSE; 681 } 682 return (started); 683 } 684 685 /* 686 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. 687 * If one is found, the refcnt on the ncec will be incremented. 688 */ 689 ncec_t * 690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) 691 { 692 ncec_t *ncec; 693 ip_stack_t *ipst = ill->ill_ipst; 694 695 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 696 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 697 698 /* Get head of v6 hash table */ 699 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 700 ncec = ncec_lookup_illgrp(ill, addr, ncec); 701 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 702 rw_exit(&ipst->ips_ill_g_lock); 703 return (ncec); 704 } 705 /* 706 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. 707 * If one is found, the refcnt on the ncec will be incremented. 708 */ 709 ncec_t * 710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) 711 { 712 ncec_t *ncec = NULL; 713 in6_addr_t addr6; 714 ip_stack_t *ipst = ill->ill_ipst; 715 716 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 717 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 718 719 /* Get head of v4 hash table */ 720 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); 721 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 722 ncec = ncec_lookup_illgrp(ill, &addr6, ncec); 723 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 724 rw_exit(&ipst->ips_ill_g_lock); 725 return (ncec); 726 } 727 728 /* 729 * Cache entry lookup. Try to find an ncec matching the parameters passed. 730 * If an ncec is found, increment the hold count on that ncec. 731 * The caller passes in the start of the appropriate hash table, and must 732 * be holding the appropriate global lock (ndp_g_lock). In addition, since 733 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock 734 * must be held as reader. 735 * 736 * This function always matches across the ipmp group. 737 */ 738 ncec_t * 739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) 740 { 741 ndp_g_t *ndp; 742 ip_stack_t *ipst = ill->ill_ipst; 743 744 if (ill->ill_isv6) 745 ndp = ipst->ips_ndp6; 746 else 747 ndp = ipst->ips_ndp4; 748 749 ASSERT(ill != NULL); 750 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 751 if (IN6_IS_ADDR_UNSPECIFIED(addr)) 752 return (NULL); 753 for (; ncec != NULL; ncec = ncec->ncec_next) { 754 if (ncec->ncec_ill == ill || 755 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { 756 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 757 mutex_enter(&ncec->ncec_lock); 758 if (!NCE_ISCONDEMNED(ncec)) { 759 ncec_refhold_locked(ncec); 760 mutex_exit(&ncec->ncec_lock); 761 break; 762 } 763 mutex_exit(&ncec->ncec_lock); 764 } 765 } 766 } 767 return (ncec); 768 } 769 770 /* 771 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t 772 * entries for ill only, i.e., when ill is part of an ipmp group, 773 * nce_lookup_v4 will never try to match across the group. 774 */ 775 nce_t * 776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr) 777 { 778 nce_t *nce; 779 in6_addr_t addr6; 780 ip_stack_t *ipst = ill->ill_ipst; 781 782 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 783 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 784 nce = nce_lookup_addr(ill, &addr6); 785 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 786 return (nce); 787 } 788 789 /* 790 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t 791 * entries for ill only, i.e., when ill is part of an ipmp group, 792 * nce_lookup_v6 will never try to match across the group. 793 */ 794 nce_t * 795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6) 796 { 797 nce_t *nce; 798 ip_stack_t *ipst = ill->ill_ipst; 799 800 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 801 nce = nce_lookup_addr(ill, addr6); 802 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 803 return (nce); 804 } 805 806 static nce_t * 807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr) 808 { 809 nce_t *nce; 810 811 ASSERT(ill != NULL); 812 #ifdef DEBUG 813 if (ill->ill_isv6) 814 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 815 else 816 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 817 #endif 818 mutex_enter(&ill->ill_lock); 819 nce = nce_lookup(ill, addr); 820 mutex_exit(&ill->ill_lock); 821 return (nce); 822 } 823 824 825 /* 826 * Router turned to host. We need to make sure that cached copies of the ncec 827 * are not used for forwarding packets if they were derived from the default 828 * route, and that the default route itself is removed, as required by 829 * section 7.2.5 of RFC 2461. 830 * 831 * Note that the ncec itself probably has valid link-layer information for the 832 * nexthop, so that there is no reason to delete the ncec, as long as the 833 * ISROUTER flag is turned off. 834 */ 835 static void 836 ncec_router_to_host(ncec_t *ncec) 837 { 838 ire_t *ire; 839 ip_stack_t *ipst = ncec->ncec_ipst; 840 841 mutex_enter(&ncec->ncec_lock); 842 ncec->ncec_flags &= ~NCE_F_ISROUTER; 843 mutex_exit(&ncec->ncec_lock); 844 845 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, 846 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL, 847 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL); 848 if (ire != NULL) { 849 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 850 ire_delete(ire); 851 ire_refrele(ire); 852 } 853 } 854 855 /* 856 * Process passed in parameters either from an incoming packet or via 857 * user ioctl. 858 */ 859 void 860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) 861 { 862 ill_t *ill = ncec->ncec_ill; 863 uint32_t hw_addr_len = ill->ill_phys_addr_length; 864 boolean_t ll_updated = B_FALSE; 865 boolean_t ll_changed; 866 nce_t *nce; 867 868 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 869 /* 870 * No updates of link layer address or the neighbor state is 871 * allowed, when the cache is in NONUD state. This still 872 * allows for responding to reachability solicitation. 873 */ 874 mutex_enter(&ncec->ncec_lock); 875 if (ncec->ncec_state == ND_INCOMPLETE) { 876 if (hw_addr == NULL) { 877 mutex_exit(&ncec->ncec_lock); 878 return; 879 } 880 nce_set_ll(ncec, hw_addr); 881 /* 882 * Update ncec state and send the queued packets 883 * back to ip this time ire will be added. 884 */ 885 if (flag & ND_NA_FLAG_SOLICITED) { 886 nce_update(ncec, ND_REACHABLE, NULL); 887 } else { 888 nce_update(ncec, ND_STALE, NULL); 889 } 890 mutex_exit(&ncec->ncec_lock); 891 nce = nce_fastpath(ncec, B_TRUE, NULL); 892 nce_resolv_ok(ncec); 893 if (nce != NULL) 894 nce_refrele(nce); 895 return; 896 } 897 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len); 898 if (!is_adv) { 899 /* If this is a SOLICITATION request only */ 900 if (ll_changed) 901 nce_update(ncec, ND_STALE, hw_addr); 902 mutex_exit(&ncec->ncec_lock); 903 ncec_cb_dispatch(ncec); 904 return; 905 } 906 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { 907 /* If in any other state than REACHABLE, ignore */ 908 if (ncec->ncec_state == ND_REACHABLE) { 909 nce_update(ncec, ND_STALE, NULL); 910 } 911 mutex_exit(&ncec->ncec_lock); 912 ncec_cb_dispatch(ncec); 913 return; 914 } else { 915 if (ll_changed) { 916 nce_update(ncec, ND_UNCHANGED, hw_addr); 917 ll_updated = B_TRUE; 918 } 919 if (flag & ND_NA_FLAG_SOLICITED) { 920 nce_update(ncec, ND_REACHABLE, NULL); 921 } else { 922 if (ll_updated) { 923 nce_update(ncec, ND_STALE, NULL); 924 } 925 } 926 mutex_exit(&ncec->ncec_lock); 927 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags & 928 NCE_F_ISROUTER)) { 929 ncec_router_to_host(ncec); 930 } else { 931 ncec_cb_dispatch(ncec); 932 } 933 } 934 } 935 936 /* 937 * Pass arg1 to the pfi supplied, along with each ncec in existence. 938 * ncec_walk() places a REFHOLD on the ncec and drops the lock when 939 * walking the hash list. 940 */ 941 void 942 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, 943 boolean_t trace) 944 { 945 ncec_t *ncec; 946 ncec_t *ncec1; 947 ncec_t **ncep; 948 ncec_t *free_nce_list = NULL; 949 950 mutex_enter(&ndp->ndp_g_lock); 951 /* Prevent ncec_delete from unlink and free of NCE */ 952 ndp->ndp_g_walker++; 953 mutex_exit(&ndp->ndp_g_lock); 954 for (ncep = ndp->nce_hash_tbl; 955 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 956 for (ncec = *ncep; ncec != NULL; ncec = ncec1) { 957 ncec1 = ncec->ncec_next; 958 if (ill == NULL || ncec->ncec_ill == ill) { 959 if (trace) { 960 ncec_refhold(ncec); 961 (*pfi)(ncec, arg1); 962 ncec_refrele(ncec); 963 } else { 964 ncec_refhold_notr(ncec); 965 (*pfi)(ncec, arg1); 966 ncec_refrele_notr(ncec); 967 } 968 } 969 } 970 } 971 mutex_enter(&ndp->ndp_g_lock); 972 ndp->ndp_g_walker--; 973 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { 974 /* Time to delete condemned entries */ 975 for (ncep = ndp->nce_hash_tbl; 976 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 977 ncec = *ncep; 978 if (ncec != NULL) { 979 nce_remove(ndp, ncec, &free_nce_list); 980 } 981 } 982 ndp->ndp_g_walker_cleanup = B_FALSE; 983 } 984 985 mutex_exit(&ndp->ndp_g_lock); 986 987 if (free_nce_list != NULL) { 988 nce_cleanup_list(free_nce_list); 989 } 990 } 991 992 /* 993 * Walk everything. 994 * Note that ill can be NULL hence can't derive the ipst from it. 995 */ 996 void 997 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) 998 { 999 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); 1000 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); 1001 } 1002 1003 /* 1004 * For each interface an entry is added for the unspecified multicast group. 1005 * Here that mapping is used to form the multicast cache entry for a particular 1006 * multicast destination. 1007 */ 1008 static int 1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, 1010 uint16_t flags, nce_t **newnce) 1011 { 1012 uchar_t *hw_addr; 1013 int err = 0; 1014 ip_stack_t *ipst = ill->ill_ipst; 1015 nce_t *nce; 1016 1017 ASSERT(ill != NULL); 1018 ASSERT(ill->ill_isv6); 1019 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); 1020 1021 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1022 nce = nce_lookup_addr(ill, dst); 1023 if (nce != NULL) { 1024 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1025 goto done; 1026 } 1027 if (ill->ill_net_type == IRE_IF_RESOLVER) { 1028 /* 1029 * For IRE_IF_RESOLVER a hardware mapping can be 1030 * generated. 1031 */ 1032 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); 1033 if (hw_addr == NULL) { 1034 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1035 return (ENOMEM); 1036 } 1037 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 1038 } else { 1039 /* No hw_addr is needed for IRE_IF_NORESOLVER. */ 1040 hw_addr = NULL; 1041 } 1042 ASSERT((flags & NCE_F_MCAST) != 0); 1043 ASSERT((flags & NCE_F_NONUD) != 0); 1044 /* nce_state will be computed by nce_add_common() */ 1045 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 1046 ND_UNCHANGED, &nce); 1047 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1048 if (err == 0) 1049 err = nce_add_v6_postprocess(nce); 1050 if (hw_addr != NULL) 1051 kmem_free(hw_addr, ill->ill_nd_lla_len); 1052 if (err != 0) { 1053 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); 1054 return (err); 1055 } 1056 done: 1057 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); 1058 if (newnce != NULL) 1059 *newnce = nce; 1060 else 1061 nce_refrele(nce); 1062 return (0); 1063 } 1064 1065 /* 1066 * Return the link layer address, and any flags of a ncec. 1067 */ 1068 int 1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1070 { 1071 ncec_t *ncec; 1072 in6_addr_t *addr; 1073 sin6_t *sin6; 1074 1075 ASSERT(ill != NULL && ill->ill_isv6); 1076 sin6 = (sin6_t *)&lnr->lnr_addr; 1077 addr = &sin6->sin6_addr; 1078 1079 /* 1080 * NOTE: if the ill is an IPMP interface, then match against the whole 1081 * illgrp. This e.g. allows in.ndpd to retrieve the link layer 1082 * addresses for the data addresses on an IPMP interface even though 1083 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. 1084 */ 1085 ncec = ncec_lookup_illgrp_v6(ill, addr); 1086 if (ncec == NULL) 1087 return (ESRCH); 1088 /* If no link layer address is available yet, return ESRCH */ 1089 if (!NCE_ISREACHABLE(ncec)) { 1090 ncec_refrele(ncec); 1091 return (ESRCH); 1092 } 1093 lnr->lnr_hdw_len = ill->ill_phys_addr_length; 1094 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, 1095 lnr->lnr_hdw_len); 1096 if (ncec->ncec_flags & NCE_F_ISROUTER) 1097 lnr->lnr_flags = NDF_ISROUTER_ON; 1098 if (ncec->ncec_flags & NCE_F_ANYCAST) 1099 lnr->lnr_flags |= NDF_ANYCAST_ON; 1100 if (ncec->ncec_flags & NCE_F_STATIC) 1101 lnr->lnr_flags |= NDF_STATIC; 1102 ncec_refrele(ncec); 1103 return (0); 1104 } 1105 1106 /* 1107 * Finish setting up the Enable/Disable multicast for the driver. 1108 */ 1109 mblk_t * 1110 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1111 uint32_t hw_addr_offset, mblk_t *mp) 1112 { 1113 uchar_t *hw_addr; 1114 ipaddr_t v4group; 1115 uchar_t *addr; 1116 1117 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1118 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1119 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1120 1121 ASSERT(CLASSD(v4group)); 1122 ASSERT(!(ill->ill_isv6)); 1123 1124 addr = (uchar_t *)&v4group; 1125 } else { 1126 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1127 ASSERT(ill->ill_isv6); 1128 1129 addr = (uchar_t *)v6group; 1130 } 1131 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1132 if (hw_addr == NULL) { 1133 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1134 freemsg(mp); 1135 return (NULL); 1136 } 1137 1138 ip_mcast_mapping(ill, addr, hw_addr); 1139 return (mp); 1140 } 1141 1142 void 1143 ip_ndp_resolve(ncec_t *ncec) 1144 { 1145 in_addr_t sender4 = INADDR_ANY; 1146 in6_addr_t sender6 = ipv6_all_zeros; 1147 ill_t *src_ill; 1148 uint32_t ms; 1149 1150 src_ill = nce_resolve_src(ncec, &sender6); 1151 if (src_ill == NULL) { 1152 /* Make sure we try again later */ 1153 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1154 nce_restart_timer(ncec, (clock_t)ms); 1155 return; 1156 } 1157 if (ncec->ncec_ipversion == IPV4_VERSION) 1158 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1159 mutex_enter(&ncec->ncec_lock); 1160 if (ncec->ncec_ipversion == IPV6_VERSION) 1161 ms = ndp_solicit(ncec, sender6, src_ill); 1162 else 1163 ms = arp_request(ncec, sender4, src_ill); 1164 mutex_exit(&ncec->ncec_lock); 1165 if (ms == 0) { 1166 if (ncec->ncec_state != ND_REACHABLE) { 1167 if (ncec->ncec_ipversion == IPV6_VERSION) 1168 ndp_resolv_failed(ncec); 1169 else 1170 arp_resolv_failed(ncec); 1171 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1172 nce_make_unreachable(ncec); 1173 ncec_delete(ncec); 1174 } 1175 } else { 1176 nce_restart_timer(ncec, (clock_t)ms); 1177 } 1178 done: 1179 ill_refrele(src_ill); 1180 } 1181 1182 /* 1183 * Send an IPv6 neighbor solicitation. 1184 * Returns number of milliseconds after which we should either rexmit or abort. 1185 * Return of zero means we should abort. 1186 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt. 1187 * The optional source address is used as a hint to ndp_solicit for 1188 * which source to use in the packet. 1189 * 1190 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending 1191 * the packet. 1192 */ 1193 uint32_t 1194 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill) 1195 { 1196 in6_addr_t dst; 1197 boolean_t dropped = B_FALSE; 1198 1199 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 1200 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 1201 1202 if (ncec->ncec_rcnt == 0) 1203 return (0); 1204 1205 dst = ncec->ncec_addr; 1206 ncec->ncec_rcnt--; 1207 mutex_exit(&ncec->ncec_lock); 1208 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr, 1209 ill->ill_phys_addr_length, &src, &dst, 0); 1210 mutex_enter(&ncec->ncec_lock); 1211 if (dropped) 1212 ncec->ncec_rcnt++; 1213 return (ncec->ncec_ill->ill_reachable_retrans_time); 1214 } 1215 1216 /* 1217 * Attempt to recover an address on an interface that's been marked as a 1218 * duplicate. Because NCEs are destroyed when the interface goes down, there's 1219 * no easy way to just probe the address and have the right thing happen if 1220 * it's no longer in use. Instead, we just bring it up normally and allow the 1221 * regular interface start-up logic to probe for a remaining duplicate and take 1222 * us back down if necessary. 1223 * Neither DHCP nor temporary addresses arrive here; they're excluded by 1224 * ip_ndp_excl. 1225 */ 1226 /* ARGSUSED */ 1227 void 1228 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1229 { 1230 ill_t *ill = rq->q_ptr; 1231 ipif_t *ipif; 1232 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr; 1233 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr; 1234 boolean_t addr_equal; 1235 1236 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1237 /* 1238 * We do not support recovery of proxy ARP'd interfaces, 1239 * because the system lacks a complete proxy ARP mechanism. 1240 */ 1241 if (ill->ill_isv6) { 1242 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 1243 addr6); 1244 } else { 1245 addr_equal = (ipif->ipif_lcl_addr == *addr4); 1246 } 1247 1248 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal) 1249 continue; 1250 1251 /* 1252 * If we have already recovered or if the interface is going 1253 * away, then ignore. 1254 */ 1255 mutex_enter(&ill->ill_lock); 1256 if (!(ipif->ipif_flags & IPIF_DUPLICATE) || 1257 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1258 mutex_exit(&ill->ill_lock); 1259 continue; 1260 } 1261 1262 ipif->ipif_flags &= ~IPIF_DUPLICATE; 1263 ill->ill_ipif_dup_count--; 1264 mutex_exit(&ill->ill_lock); 1265 ipif->ipif_was_dup = B_TRUE; 1266 1267 if (ill->ill_isv6) { 1268 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); 1269 (void) ipif_up_done_v6(ipif); 1270 } else { 1271 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) != 1272 EINPROGRESS); 1273 (void) ipif_up_done(ipif); 1274 } 1275 } 1276 freeb(mp); 1277 } 1278 1279 /* 1280 * Attempt to recover an IPv6 interface that's been shut down as a duplicate. 1281 * As long as someone else holds the address, the interface will stay down. 1282 * When that conflict goes away, the interface is brought back up. This is 1283 * done so that accidental shutdowns of addresses aren't made permanent. Your 1284 * server will recover from a failure. 1285 * 1286 * For DHCP and temporary addresses, recovery is not done in the kernel. 1287 * Instead, it's handled by user space processes (dhcpagent and in.ndpd). 1288 * 1289 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 1290 */ 1291 void 1292 ipif_dup_recovery(void *arg) 1293 { 1294 ipif_t *ipif = arg; 1295 1296 ipif->ipif_recovery_id = 0; 1297 if (!(ipif->ipif_flags & IPIF_DUPLICATE)) 1298 return; 1299 1300 /* 1301 * No lock, because this is just an optimization. 1302 */ 1303 if (ipif->ipif_state_flags & IPIF_CONDEMNED) 1304 return; 1305 1306 /* If the link is down, we'll retry this later */ 1307 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 1308 return; 1309 1310 ipif_do_recovery(ipif); 1311 } 1312 1313 /* 1314 * Perform interface recovery by forcing the duplicate interfaces up and 1315 * allowing the system to determine which ones should stay up. 1316 * 1317 * Called both by recovery timer expiry and link-up notification. 1318 */ 1319 void 1320 ipif_do_recovery(ipif_t *ipif) 1321 { 1322 ill_t *ill = ipif->ipif_ill; 1323 mblk_t *mp; 1324 ip_stack_t *ipst = ill->ill_ipst; 1325 size_t mp_size; 1326 1327 if (ipif->ipif_isv6) 1328 mp_size = sizeof (ipif->ipif_v6lcl_addr); 1329 else 1330 mp_size = sizeof (ipif->ipif_lcl_addr); 1331 mp = allocb(mp_size, BPRI_MED); 1332 if (mp == NULL) { 1333 mutex_enter(&ill->ill_lock); 1334 if (ipst->ips_ip_dup_recovery > 0 && 1335 ipif->ipif_recovery_id == 0 && 1336 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1337 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1338 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1339 } 1340 mutex_exit(&ill->ill_lock); 1341 } else { 1342 /* 1343 * A recovery timer may still be running if we got here from 1344 * ill_restart_dad(); cancel that timer. 1345 */ 1346 if (ipif->ipif_recovery_id != 0) 1347 (void) untimeout(ipif->ipif_recovery_id); 1348 ipif->ipif_recovery_id = 0; 1349 1350 if (ipif->ipif_isv6) { 1351 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, 1352 sizeof (ipif->ipif_v6lcl_addr)); 1353 } else { 1354 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr, 1355 sizeof (ipif->ipif_lcl_addr)); 1356 } 1357 ill_refhold(ill); 1358 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP, 1359 B_FALSE); 1360 } 1361 } 1362 1363 /* 1364 * Find the MAC and IP addresses in an NA/NS message. 1365 */ 1366 static void 1367 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill, 1368 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp) 1369 { 1370 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1371 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 1372 uchar_t *addr; 1373 int alen; 1374 1375 /* icmp_inbound_v6 ensures this */ 1376 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1377 1378 addr = ira->ira_l2src; 1379 alen = ill->ill_phys_addr_length; 1380 if (alen > 0) { 1381 *haddr = addr; 1382 *haddrlenp = alen; 1383 } else { 1384 *haddr = NULL; 1385 *haddrlenp = 0; 1386 } 1387 1388 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ 1389 *targp = ns->nd_ns_target; 1390 } 1391 1392 /* 1393 * This is for exclusive changes due to NDP duplicate address detection 1394 * failure. 1395 */ 1396 /* ARGSUSED */ 1397 static void 1398 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1399 { 1400 ill_t *ill = rq->q_ptr; 1401 ipif_t *ipif; 1402 uchar_t *haddr; 1403 uint_t haddrlen; 1404 ip_stack_t *ipst = ill->ill_ipst; 1405 in6_addr_t targ; 1406 ip_recv_attr_t iras; 1407 mblk_t *attrmp; 1408 1409 attrmp = mp; 1410 mp = mp->b_cont; 1411 attrmp->b_cont = NULL; 1412 if (!ip_recv_attr_from_mblk(attrmp, &iras)) { 1413 /* The ill or ip_stack_t disappeared on us */ 1414 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1415 ip_drop_input("ip_recv_attr_from_mblk", mp, ill); 1416 freemsg(mp); 1417 ira_cleanup(&iras, B_TRUE); 1418 return; 1419 } 1420 1421 ASSERT(ill == iras.ira_rill); 1422 1423 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); 1424 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { 1425 /* 1426 * Ignore conflicts generated by misbehaving switches that 1427 * just reflect our own messages back to us. For IPMP, we may 1428 * see reflections across any ill in the illgrp. 1429 * 1430 * RFC2462 and revisions tried to detect both the case 1431 * when a statically configured IPv6 address is a duplicate, 1432 * and the case when the L2 address itself is a duplicate. The 1433 * later is important because, with stateles address autoconf, 1434 * if the L2 address is a duplicate, the resulting IPv6 1435 * address(es) would also be duplicates. We rely on DAD of the 1436 * IPv6 address itself to detect the latter case. 1437 */ 1438 /* For an under ill_grp can change under lock */ 1439 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1440 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || 1441 IS_UNDER_IPMP(ill) && 1442 ipmp_illgrp_find_ill(ill->ill_grp, haddr, 1443 haddrlen) != NULL) { 1444 rw_exit(&ipst->ips_ill_g_lock); 1445 goto ignore_conflict; 1446 } 1447 rw_exit(&ipst->ips_ill_g_lock); 1448 } 1449 1450 /* 1451 * Look up the appropriate ipif. 1452 */ 1453 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); 1454 if (ipif == NULL) 1455 goto ignore_conflict; 1456 1457 /* Reload the ill to match the ipif */ 1458 ill = ipif->ipif_ill; 1459 1460 /* If it's already duplicate or ineligible, then don't do anything. */ 1461 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { 1462 ipif_refrele(ipif); 1463 goto ignore_conflict; 1464 } 1465 1466 /* 1467 * If this is a failure during duplicate recovery, then don't 1468 * complain. It may take a long time to recover. 1469 */ 1470 if (!ipif->ipif_was_dup) { 1471 char ibuf[LIFNAMSIZ]; 1472 char hbuf[MAC_STR_LEN]; 1473 char sbuf[INET6_ADDRSTRLEN]; 1474 1475 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 1476 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" 1477 " disabled", ibuf, 1478 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), 1479 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); 1480 } 1481 mutex_enter(&ill->ill_lock); 1482 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1483 ipif->ipif_flags |= IPIF_DUPLICATE; 1484 ill->ill_ipif_dup_count++; 1485 mutex_exit(&ill->ill_lock); 1486 (void) ipif_down(ipif, NULL, NULL); 1487 (void) ipif_down_tail(ipif); 1488 mutex_enter(&ill->ill_lock); 1489 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1490 ill->ill_net_type == IRE_IF_RESOLVER && 1491 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 1492 ipst->ips_ip_dup_recovery > 0) { 1493 ASSERT(ipif->ipif_recovery_id == 0); 1494 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1495 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1496 } 1497 mutex_exit(&ill->ill_lock); 1498 ipif_refrele(ipif); 1499 1500 ignore_conflict: 1501 freemsg(mp); 1502 ira_cleanup(&iras, B_TRUE); 1503 } 1504 1505 /* 1506 * Handle failure by tearing down the ipifs with the specified address. Note 1507 * that tearing down the ipif also means deleting the ncec through ipif_down, so 1508 * it's not possible to do recovery by just restarting the ncec timer. Instead, 1509 * we start a timer on the ipif. 1510 * Caller has to free mp; 1511 */ 1512 static void 1513 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) 1514 { 1515 const uchar_t *haddr; 1516 ill_t *ill = ira->ira_rill; 1517 1518 /* 1519 * Ignore conflicts generated by misbehaving switches that just 1520 * reflect our own messages back to us. 1521 */ 1522 1523 /* icmp_inbound_v6 ensures this */ 1524 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1525 haddr = ira->ira_l2src; 1526 if (haddr != NULL && 1527 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { 1528 return; 1529 } 1530 1531 if ((mp = copymsg(mp)) != NULL) { 1532 mblk_t *attrmp; 1533 1534 attrmp = ip_recv_attr_to_mblk(ira); 1535 if (attrmp == NULL) { 1536 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1537 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1538 freemsg(mp); 1539 } else { 1540 ASSERT(attrmp->b_cont == NULL); 1541 attrmp->b_cont = mp; 1542 mp = attrmp; 1543 ill_refhold(ill); 1544 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, 1545 B_FALSE); 1546 } 1547 } 1548 } 1549 1550 /* 1551 * Handle a discovered conflict: some other system is advertising that it owns 1552 * one of our IP addresses. We need to defend ourselves, or just shut down the 1553 * interface. 1554 * 1555 * Handles both IPv4 and IPv6 1556 */ 1557 boolean_t 1558 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) 1559 { 1560 ipif_t *ipif; 1561 clock_t now; 1562 uint_t maxdefense; 1563 uint_t defs; 1564 ill_t *ill = ira->ira_ill; 1565 ip_stack_t *ipst = ill->ill_ipst; 1566 uint32_t elapsed; 1567 boolean_t isv6 = ill->ill_isv6; 1568 ipaddr_t ncec_addr; 1569 1570 if (isv6) { 1571 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, 1572 ipst); 1573 } else { 1574 if (arp_no_defense) { 1575 /* 1576 * Yes, there is a conflict, but no, we do not 1577 * defend ourself. 1578 */ 1579 return (B_TRUE); 1580 } 1581 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 1582 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, 1583 ipst); 1584 } 1585 if (ipif == NULL) 1586 return (B_FALSE); 1587 1588 /* 1589 * First, figure out if this address is disposable. 1590 */ 1591 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) 1592 maxdefense = ipst->ips_ip_max_temp_defend; 1593 else 1594 maxdefense = ipst->ips_ip_max_defend; 1595 1596 /* 1597 * Now figure out how many times we've defended ourselves. Ignore 1598 * defenses that happened long in the past. 1599 */ 1600 now = ddi_get_lbolt(); 1601 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; 1602 mutex_enter(&ncec->ncec_lock); 1603 if ((defs = ncec->ncec_defense_count) > 0 && 1604 elapsed > ipst->ips_ip_defend_interval) { 1605 /* 1606 * ip_defend_interval has elapsed. 1607 * reset the defense count. 1608 */ 1609 ncec->ncec_defense_count = defs = 0; 1610 } 1611 ncec->ncec_defense_count++; 1612 ncec->ncec_last_time_defended = now; 1613 mutex_exit(&ncec->ncec_lock); 1614 ipif_refrele(ipif); 1615 1616 /* 1617 * If we've defended ourselves too many times already, then give up and 1618 * tear down the interface(s) using this address. 1619 * Otherwise, caller has to defend by sending out an announce. 1620 */ 1621 if (defs >= maxdefense) { 1622 if (isv6) 1623 ndp_failure(mp, ira); 1624 else 1625 arp_failure(mp, ira); 1626 } else { 1627 return (B_TRUE); /* caller must defend this address */ 1628 } 1629 return (B_FALSE); 1630 } 1631 1632 /* 1633 * Handle reception of Neighbor Solicitation messages. 1634 */ 1635 static void 1636 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) 1637 { 1638 ill_t *ill = ira->ira_ill, *under_ill; 1639 nd_neighbor_solicit_t *ns; 1640 uint32_t hlen = ill->ill_phys_addr_length; 1641 uchar_t *haddr = NULL; 1642 icmp6_t *icmp_nd; 1643 ip6_t *ip6h; 1644 ncec_t *our_ncec = NULL; 1645 in6_addr_t target; 1646 in6_addr_t src; 1647 int len; 1648 int flag = 0; 1649 nd_opt_hdr_t *opt = NULL; 1650 boolean_t bad_solicit = B_FALSE; 1651 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1652 boolean_t need_ill_refrele = B_FALSE; 1653 1654 ip6h = (ip6_t *)mp->b_rptr; 1655 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1656 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1657 src = ip6h->ip6_src; 1658 ns = (nd_neighbor_solicit_t *)icmp_nd; 1659 target = ns->nd_ns_target; 1660 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 1661 IN6_IS_ADDR_LOOPBACK(&target)) { 1662 if (ip_debug > 2) { 1663 /* ip1dbg */ 1664 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 1665 AF_INET6, &target); 1666 } 1667 bad_solicit = B_TRUE; 1668 goto done; 1669 } 1670 if (len > sizeof (nd_neighbor_solicit_t)) { 1671 /* Options present */ 1672 opt = (nd_opt_hdr_t *)&ns[1]; 1673 len -= sizeof (nd_neighbor_solicit_t); 1674 if (!ndp_verify_optlen(opt, len)) { 1675 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1676 bad_solicit = B_TRUE; 1677 goto done; 1678 } 1679 } 1680 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1681 /* Check to see if this is a valid DAD solicitation */ 1682 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1683 if (ip_debug > 2) { 1684 /* ip1dbg */ 1685 pr_addr_dbg("ndp_input_solicit: IPv6 " 1686 "Destination is not solicited node " 1687 "multicast %s\n", AF_INET6, 1688 &ip6h->ip6_dst); 1689 } 1690 bad_solicit = B_TRUE; 1691 goto done; 1692 } 1693 } 1694 1695 /* 1696 * NOTE: with IPMP, it's possible the nominated multicast ill (which 1697 * received this packet if it's multicast) is not the ill tied to 1698 * e.g. the IPMP ill's data link-local. So we match across the illgrp 1699 * to ensure we find the associated NCE. 1700 */ 1701 our_ncec = ncec_lookup_illgrp_v6(ill, &target); 1702 /* 1703 * If this is a valid Solicitation for an address we are publishing, 1704 * then a PUBLISH entry should exist in the cache 1705 */ 1706 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { 1707 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1708 "ifname=%s ", ill->ill_name)); 1709 if (ip_debug > 2) { 1710 /* ip1dbg */ 1711 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1712 } 1713 if (our_ncec == NULL) 1714 bad_solicit = B_TRUE; 1715 goto done; 1716 } 1717 1718 /* At this point we should have a verified NS per spec */ 1719 if (opt != NULL) { 1720 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1721 if (opt != NULL) { 1722 haddr = (uchar_t *)&opt[1]; 1723 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1724 hlen == 0) { 1725 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1726 bad_solicit = B_TRUE; 1727 goto done; 1728 } 1729 } 1730 } 1731 1732 /* If sending directly to peer, set the unicast flag */ 1733 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1734 flag |= NDP_UNICAST; 1735 1736 /* 1737 * Create/update the entry for the soliciting node on the ipmp_ill. 1738 * or respond to outstanding queries, don't if 1739 * the source is unspecified address. 1740 */ 1741 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1742 int err; 1743 nce_t *nnce; 1744 1745 ASSERT(ill->ill_isv6); 1746 /* 1747 * Regular solicitations *must* include the Source Link-Layer 1748 * Address option. Ignore messages that do not. 1749 */ 1750 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1751 ip1dbg(("ndp_input_solicit: source link-layer address " 1752 "option missing with a specified source.\n")); 1753 bad_solicit = B_TRUE; 1754 goto done; 1755 } 1756 1757 /* 1758 * This is a regular solicitation. If we're still in the 1759 * process of verifying the address, then don't respond at all 1760 * and don't keep track of the sender. 1761 */ 1762 if (our_ncec->ncec_state == ND_PROBE) 1763 goto done; 1764 1765 /* 1766 * If the solicitation doesn't have sender hardware address 1767 * (legal for unicast solicitation), then process without 1768 * installing the return NCE. Either we already know it, or 1769 * we'll be forced to look it up when (and if) we reply to the 1770 * packet. 1771 */ 1772 if (haddr == NULL) 1773 goto no_source; 1774 1775 under_ill = ill; 1776 if (IS_UNDER_IPMP(under_ill)) { 1777 ill = ipmp_ill_hold_ipmp_ill(under_ill); 1778 if (ill == NULL) 1779 ill = under_ill; 1780 else 1781 need_ill_refrele = B_TRUE; 1782 } 1783 err = nce_lookup_then_add_v6(ill, 1784 haddr, hlen, 1785 &src, /* Soliciting nodes address */ 1786 0, 1787 ND_STALE, 1788 &nnce); 1789 1790 if (need_ill_refrele) { 1791 ill_refrele(ill); 1792 ill = under_ill; 1793 need_ill_refrele = B_FALSE; 1794 } 1795 switch (err) { 1796 case 0: 1797 /* done with this entry */ 1798 nce_refrele(nnce); 1799 break; 1800 case EEXIST: 1801 /* 1802 * B_FALSE indicates this is not an an advertisement. 1803 */ 1804 nce_process(nnce->nce_common, haddr, 0, B_FALSE); 1805 nce_refrele(nnce); 1806 break; 1807 default: 1808 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 1809 err)); 1810 goto done; 1811 } 1812 no_source: 1813 flag |= NDP_SOLICITED; 1814 } else { 1815 /* 1816 * No source link layer address option should be present in a 1817 * valid DAD request. 1818 */ 1819 if (haddr != NULL) { 1820 ip1dbg(("ndp_input_solicit: source link-layer address " 1821 "option present with an unspecified source.\n")); 1822 bad_solicit = B_TRUE; 1823 goto done; 1824 } 1825 if (our_ncec->ncec_state == ND_PROBE) { 1826 /* 1827 * Internally looped-back probes will have 1828 * IRAF_L2SRC_LOOPBACK set so we can ignore our own 1829 * transmissions. 1830 */ 1831 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { 1832 /* 1833 * If someone else is probing our address, then 1834 * we've crossed wires. Declare failure. 1835 */ 1836 ndp_failure(mp, ira); 1837 } 1838 goto done; 1839 } 1840 /* 1841 * This is a DAD probe. Multicast the advertisement to the 1842 * all-nodes address. 1843 */ 1844 src = ipv6_all_hosts_mcast; 1845 } 1846 flag |= nce_advert_flags(our_ncec); 1847 (void) ndp_xmit(ill, 1848 ND_NEIGHBOR_ADVERT, 1849 our_ncec->ncec_lladdr, 1850 our_ncec->ncec_lladdr_length, 1851 &target, /* Source and target of the advertisement pkt */ 1852 &src, /* IP Destination (source of original pkt) */ 1853 flag); 1854 done: 1855 if (bad_solicit) 1856 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); 1857 if (our_ncec != NULL) 1858 ncec_refrele(our_ncec); 1859 } 1860 1861 /* 1862 * Handle reception of Neighbor Solicitation messages 1863 */ 1864 void 1865 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) 1866 { 1867 ill_t *ill = ira->ira_ill; 1868 nd_neighbor_advert_t *na; 1869 uint32_t hlen = ill->ill_phys_addr_length; 1870 uchar_t *haddr = NULL; 1871 icmp6_t *icmp_nd; 1872 ip6_t *ip6h; 1873 ncec_t *dst_ncec = NULL; 1874 in6_addr_t target; 1875 nd_opt_hdr_t *opt = NULL; 1876 int len; 1877 ip_stack_t *ipst = ill->ill_ipst; 1878 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1879 1880 ip6h = (ip6_t *)mp->b_rptr; 1881 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1882 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1883 na = (nd_neighbor_advert_t *)icmp_nd; 1884 1885 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && 1886 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { 1887 ip1dbg(("ndp_input_advert: Target is multicast but the " 1888 "solicited flag is not zero\n")); 1889 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1890 return; 1891 } 1892 target = na->nd_na_target; 1893 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 1894 IN6_IS_ADDR_LOOPBACK(&target)) { 1895 if (ip_debug > 2) { 1896 /* ip1dbg */ 1897 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 1898 AF_INET6, &target); 1899 } 1900 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1901 return; 1902 } 1903 if (len > sizeof (nd_neighbor_advert_t)) { 1904 opt = (nd_opt_hdr_t *)&na[1]; 1905 if (!ndp_verify_optlen(opt, 1906 len - sizeof (nd_neighbor_advert_t))) { 1907 ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); 1908 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1909 return; 1910 } 1911 /* At this point we have a verified NA per spec */ 1912 len -= sizeof (nd_neighbor_advert_t); 1913 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); 1914 if (opt != NULL) { 1915 haddr = (uchar_t *)&opt[1]; 1916 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1917 hlen == 0) { 1918 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1919 BUMP_MIB(mib, 1920 ipv6IfIcmpInBadNeighborAdvertisements); 1921 return; 1922 } 1923 } 1924 } 1925 1926 /* 1927 * NOTE: we match across the illgrp since we need to do DAD for all of 1928 * our local addresses, and those are spread across all the active 1929 * ills in the group. 1930 */ 1931 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) 1932 return; 1933 1934 if (NCE_PUBLISH(dst_ncec)) { 1935 /* 1936 * Someone just advertised an addresses that we publish. First, 1937 * check it it was us -- if so, we can safely ignore it. 1938 * We don't get the haddr from the ira_l2src because, in the 1939 * case that the packet originated from us, on an IPMP group, 1940 * the ira_l2src may would be the link-layer address of the 1941 * cast_ill used to send the packet, which may not be the same 1942 * as the dst_ncec->ncec_lladdr of the address. 1943 */ 1944 if (haddr != NULL) { 1945 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) 1946 goto out; 1947 1948 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) 1949 goto out; /* from us -- no conflict */ 1950 1951 /* 1952 * If we're in an IPMP group, check if this is an echo 1953 * from another ill in the group. Use the double- 1954 * checked locking pattern to avoid grabbing 1955 * ill_g_lock in the non-IPMP case. 1956 */ 1957 if (IS_UNDER_IPMP(ill)) { 1958 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1959 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( 1960 ill->ill_grp, haddr, hlen) != NULL) { 1961 rw_exit(&ipst->ips_ill_g_lock); 1962 goto out; 1963 } 1964 rw_exit(&ipst->ips_ill_g_lock); 1965 } 1966 } 1967 1968 /* 1969 * This appears to be a real conflict. If we're trying to 1970 * configure this NCE (ND_PROBE), then shut it down. 1971 * Otherwise, handle the discovered conflict. 1972 */ 1973 if (dst_ncec->ncec_state == ND_PROBE) { 1974 ndp_failure(mp, ira); 1975 } else { 1976 if (ip_nce_conflict(mp, ira, dst_ncec)) { 1977 char hbuf[MAC_STR_LEN]; 1978 char sbuf[INET6_ADDRSTRLEN]; 1979 1980 cmn_err(CE_WARN, 1981 "node '%s' is using %s on %s", 1982 inet_ntop(AF_INET6, &target, sbuf, 1983 sizeof (sbuf)), 1984 haddr == NULL ? "<none>" : 1985 mac_colon_addr(haddr, hlen, hbuf, 1986 sizeof (hbuf)), ill->ill_name); 1987 /* 1988 * RFC 4862, Section 5.4.4 does not mandate 1989 * any specific behavior when an NA matches 1990 * a non-tentative address assigned to the 1991 * receiver. We make the choice of defending 1992 * our address, based on the assumption that 1993 * the sender has not detected the Duplicate. 1994 * 1995 * ncec_last_time_defended has been adjusted 1996 * in ip_nce_conflict() 1997 */ 1998 (void) ndp_announce(dst_ncec); 1999 } 2000 } 2001 } else { 2002 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) 2003 dst_ncec->ncec_flags |= NCE_F_ISROUTER; 2004 2005 /* B_TRUE indicates this an advertisement */ 2006 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); 2007 } 2008 out: 2009 ncec_refrele(dst_ncec); 2010 } 2011 2012 /* 2013 * Process NDP neighbor solicitation/advertisement messages. 2014 * The checksum has already checked o.k before reaching here. 2015 * Information about the datalink header is contained in ira_l2src, but 2016 * that should be ignored for loopback packets. 2017 */ 2018 void 2019 ndp_input(mblk_t *mp, ip_recv_attr_t *ira) 2020 { 2021 ill_t *ill = ira->ira_rill; 2022 icmp6_t *icmp_nd; 2023 ip6_t *ip6h; 2024 int len; 2025 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2026 ill_t *orig_ill = NULL; 2027 2028 /* 2029 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill 2030 * and make it be the IPMP upper so avoid being confused by a packet 2031 * addressed to a unicast address on a different ill. 2032 */ 2033 if (IS_UNDER_IPMP(ill)) { 2034 orig_ill = ill; 2035 ill = ipmp_ill_hold_ipmp_ill(orig_ill); 2036 if (ill == NULL) { 2037 ill = orig_ill; 2038 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2039 ip_drop_input("ipIfStatsInDiscards - IPMP ill", 2040 mp, ill); 2041 freemsg(mp); 2042 return; 2043 } 2044 ASSERT(ill != orig_ill); 2045 orig_ill = ira->ira_ill; 2046 ira->ira_ill = ill; 2047 mib = ill->ill_icmp6_mib; 2048 } 2049 if (!pullupmsg(mp, -1)) { 2050 ip1dbg(("ndp_input: pullupmsg failed\n")); 2051 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2052 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill); 2053 goto done; 2054 } 2055 ip6h = (ip6_t *)mp->b_rptr; 2056 if (ip6h->ip6_hops != IPV6_MAX_HOPS) { 2057 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); 2058 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill); 2059 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); 2060 goto done; 2061 } 2062 /* 2063 * NDP does not accept any extension headers between the 2064 * IP header and the ICMP header since e.g. a routing 2065 * header could be dangerous. 2066 * This assumes that any AH or ESP headers are removed 2067 * by ip prior to passing the packet to ndp_input. 2068 */ 2069 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { 2070 ip1dbg(("ndp_input: Wrong next header 0x%x\n", 2071 ip6h->ip6_nxt)); 2072 ip_drop_input("Wrong next header", mp, ill); 2073 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2074 goto done; 2075 } 2076 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2077 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT || 2078 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); 2079 if (icmp_nd->icmp6_code != 0) { 2080 ip1dbg(("ndp_input: icmp6 code != 0 \n")); 2081 ip_drop_input("code non-zero", mp, ill); 2082 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2083 goto done; 2084 } 2085 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2086 /* 2087 * Make sure packet length is large enough for either 2088 * a NS or a NA icmp packet. 2089 */ 2090 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { 2091 ip1dbg(("ndp_input: packet too short\n")); 2092 ip_drop_input("packet too short", mp, ill); 2093 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2094 goto done; 2095 } 2096 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { 2097 ndp_input_solicit(mp, ira); 2098 } else { 2099 ndp_input_advert(mp, ira); 2100 } 2101 done: 2102 freemsg(mp); 2103 if (orig_ill != NULL) { 2104 ill_refrele(ill); 2105 ira->ira_ill = orig_ill; 2106 } 2107 } 2108 2109 /* 2110 * ndp_xmit is called to form and transmit a ND solicitation or 2111 * advertisement ICMP packet. 2112 * 2113 * If the source address is unspecified and this isn't a probe (used for 2114 * duplicate address detection), an appropriate source address and link layer 2115 * address will be chosen here. The link layer address option is included if 2116 * the source is specified (i.e., all non-probe packets), and omitted (per the 2117 * specification) otherwise. 2118 * 2119 * It returns B_FALSE only if it does a successful put() to the 2120 * corresponding ill's ill_wq otherwise returns B_TRUE. 2121 */ 2122 static boolean_t 2123 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, 2124 const in6_addr_t *sender, const in6_addr_t *target, int flag) 2125 { 2126 uint32_t len; 2127 icmp6_t *icmp6; 2128 mblk_t *mp; 2129 ip6_t *ip6h; 2130 nd_opt_hdr_t *opt; 2131 uint_t plen; 2132 zoneid_t zoneid = GLOBAL_ZONEID; 2133 ill_t *hwaddr_ill = ill; 2134 ip_xmit_attr_t ixas; 2135 ip_stack_t *ipst = ill->ill_ipst; 2136 boolean_t need_refrele = B_FALSE; 2137 boolean_t probe = B_FALSE; 2138 2139 if (IS_UNDER_IPMP(ill)) { 2140 probe = ipif_lookup_testaddr_v6(ill, sender, NULL); 2141 /* 2142 * We send non-probe packets on the upper IPMP interface. 2143 * ip_output_simple() will use cast_ill for sending any 2144 * multicast packets. Note that we can't follow the same 2145 * logic for probe packets because all interfaces in the ipmp 2146 * group may have failed, so that we really want to only try 2147 * to send the ND packet on the ill corresponding to the src 2148 * address. 2149 */ 2150 if (!probe) { 2151 ill = ipmp_ill_hold_ipmp_ill(ill); 2152 if (ill != NULL) 2153 need_refrele = B_TRUE; 2154 else 2155 ill = hwaddr_ill; 2156 } 2157 } 2158 2159 /* 2160 * If we have a unspecified source(sender) address, select a 2161 * proper source address for the solicitation here itself so 2162 * that we can initialize the h/w address correctly. 2163 * 2164 * If the sender is specified then we use this address in order 2165 * to lookup the zoneid before calling ip_output_v6(). This is to 2166 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly 2167 * by IP (we cannot guarantee that the global zone has an interface 2168 * route to the destination). 2169 * 2170 * Note that the NA never comes here with the unspecified source 2171 * address. 2172 */ 2173 2174 /* 2175 * Probes will have unspec src at this point. 2176 */ 2177 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { 2178 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); 2179 /* 2180 * It's possible for ipif_lookup_addr_zoneid_v6() to return 2181 * ALL_ZONES if it cannot find a matching ipif for the address 2182 * we are trying to use. In this case we err on the side of 2183 * trying to send the packet by defaulting to the GLOBAL_ZONEID. 2184 */ 2185 if (zoneid == ALL_ZONES) 2186 zoneid = GLOBAL_ZONEID; 2187 } 2188 2189 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; 2190 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; 2191 mp = allocb(len, BPRI_LO); 2192 if (mp == NULL) { 2193 if (need_refrele) 2194 ill_refrele(ill); 2195 return (B_TRUE); 2196 } 2197 2198 bzero((char *)mp->b_rptr, len); 2199 mp->b_wptr = mp->b_rptr + len; 2200 2201 bzero(&ixas, sizeof (ixas)); 2202 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM; 2203 2204 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 2205 ixas.ixa_ipst = ipst; 2206 ixas.ixa_cred = kcred; 2207 ixas.ixa_cpid = NOPID; 2208 ixas.ixa_tsl = NULL; 2209 ixas.ixa_zoneid = zoneid; 2210 2211 ip6h = (ip6_t *)mp->b_rptr; 2212 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2213 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2214 ip6h->ip6_nxt = IPPROTO_ICMPV6; 2215 ip6h->ip6_hops = IPV6_MAX_HOPS; 2216 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 2217 ip6h->ip6_dst = *target; 2218 icmp6 = (icmp6_t *)&ip6h[1]; 2219 2220 if (hw_addr_len != 0) { 2221 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + 2222 sizeof (nd_neighbor_advert_t)); 2223 } else { 2224 opt = NULL; 2225 } 2226 if (operation == ND_NEIGHBOR_SOLICIT) { 2227 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 2228 2229 if (opt != NULL && !(flag & NDP_PROBE)) { 2230 /* 2231 * Note that we don't send out SLLA for ND probes 2232 * per RFC 4862, even though we do send out the src 2233 * haddr for IPv4 DAD probes, even though both IPv4 2234 * and IPv6 go out with the unspecified/INADDR_ANY 2235 * src IP addr. 2236 */ 2237 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; 2238 } 2239 ip6h->ip6_src = *sender; 2240 ns->nd_ns_target = *target; 2241 if (!(flag & NDP_UNICAST)) { 2242 /* Form multicast address of the target */ 2243 ip6h->ip6_dst = ipv6_solicited_node_mcast; 2244 ip6h->ip6_dst.s6_addr32[3] |= 2245 ns->nd_ns_target.s6_addr32[3]; 2246 } 2247 } else { 2248 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; 2249 2250 ASSERT(!(flag & NDP_PROBE)); 2251 if (opt != NULL) 2252 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; 2253 ip6h->ip6_src = *sender; 2254 na->nd_na_target = *sender; 2255 if (flag & NDP_ISROUTER) 2256 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; 2257 if (flag & NDP_SOLICITED) 2258 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; 2259 if (flag & NDP_ORIDE) 2260 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; 2261 } 2262 2263 if (!(flag & NDP_PROBE)) { 2264 if (hw_addr != NULL && opt != NULL) { 2265 /* Fill in link layer address and option len */ 2266 opt->nd_opt_len = (uint8_t)plen; 2267 bcopy(hw_addr, &opt[1], hw_addr_len); 2268 } 2269 } 2270 if (opt != NULL && opt->nd_opt_type == 0) { 2271 /* If there's no link layer address option, then strip it. */ 2272 len -= plen * 8; 2273 mp->b_wptr = mp->b_rptr + len; 2274 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2275 } 2276 2277 icmp6->icmp6_type = (uint8_t)operation; 2278 icmp6->icmp6_code = 0; 2279 /* 2280 * Prepare for checksum by putting icmp length in the icmp 2281 * checksum field. The checksum is calculated in ip_output.c. 2282 */ 2283 icmp6->icmp6_cksum = ip6h->ip6_plen; 2284 2285 (void) ip_output_simple(mp, &ixas); 2286 ixa_cleanup(&ixas); 2287 if (need_refrele) 2288 ill_refrele(ill); 2289 return (B_FALSE); 2290 } 2291 2292 /* 2293 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 2294 * The datapath uses this as an indication that there 2295 * is a problem (as opposed to a NCE that was just 2296 * reclaimed due to lack of memory. 2297 * Note that static ARP entries never become unreachable. 2298 */ 2299 void 2300 nce_make_unreachable(ncec_t *ncec) 2301 { 2302 mutex_enter(&ncec->ncec_lock); 2303 ncec->ncec_state = ND_UNREACHABLE; 2304 mutex_exit(&ncec->ncec_lock); 2305 } 2306 2307 /* 2308 * NCE retransmit timer. Common to IPv4 and IPv6. 2309 * This timer goes off when: 2310 * a. It is time to retransmit a resolution for resolver. 2311 * b. It is time to send reachability probes. 2312 */ 2313 void 2314 nce_timer(void *arg) 2315 { 2316 ncec_t *ncec = arg; 2317 ill_t *ill = ncec->ncec_ill, *src_ill; 2318 char addrbuf[INET6_ADDRSTRLEN]; 2319 boolean_t dropped = B_FALSE; 2320 ip_stack_t *ipst = ncec->ncec_ipst; 2321 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2322 in_addr_t sender4 = INADDR_ANY; 2323 in6_addr_t sender6 = ipv6_all_zeros; 2324 2325 /* 2326 * The timer has to be cancelled by ncec_delete before doing the final 2327 * refrele. So the NCE is guaranteed to exist when the timer runs 2328 * until it clears the timeout_id. Before clearing the timeout_id 2329 * bump up the refcnt so that we can continue to use the ncec 2330 */ 2331 ASSERT(ncec != NULL); 2332 mutex_enter(&ncec->ncec_lock); 2333 ncec_refhold_locked(ncec); 2334 ncec->ncec_timeout_id = 0; 2335 mutex_exit(&ncec->ncec_lock); 2336 2337 src_ill = nce_resolve_src(ncec, &sender6); 2338 /* if we could not find a sender address, return */ 2339 if (src_ill == NULL) { 2340 if (!isv6) { 2341 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); 2342 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, 2343 &sender4, addrbuf, sizeof (addrbuf)))); 2344 } else { 2345 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, 2346 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2347 } 2348 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2349 ncec_refrele(ncec); 2350 return; 2351 } 2352 if (!isv6) 2353 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 2354 2355 mutex_enter(&ncec->ncec_lock); 2356 /* 2357 * Check the reachability state. 2358 */ 2359 switch (ncec->ncec_state) { 2360 case ND_DELAY: 2361 ASSERT(ncec->ncec_lladdr != NULL); 2362 ncec->ncec_state = ND_PROBE; 2363 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2364 if (isv6) { 2365 mutex_exit(&ncec->ncec_lock); 2366 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 2367 src_ill->ill_phys_addr, 2368 src_ill->ill_phys_addr_length, 2369 &sender6, &ncec->ncec_addr, 2370 NDP_UNICAST); 2371 } else { 2372 dropped = (arp_request(ncec, sender4, src_ill) == 0); 2373 mutex_exit(&ncec->ncec_lock); 2374 } 2375 if (!dropped) { 2376 mutex_enter(&ncec->ncec_lock); 2377 ncec->ncec_pcnt--; 2378 mutex_exit(&ncec->ncec_lock); 2379 } 2380 if (ip_debug > 3) { 2381 /* ip2dbg */ 2382 pr_addr_dbg("nce_timer: state for %s changed " 2383 "to PROBE\n", AF_INET6, &ncec->ncec_addr); 2384 } 2385 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2386 break; 2387 case ND_PROBE: 2388 /* must be retransmit timer */ 2389 ASSERT(ncec->ncec_pcnt >= -1); 2390 if (ncec->ncec_pcnt > 0) { 2391 /* 2392 * As per RFC2461, the ncec gets deleted after 2393 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. 2394 * Note that the first unicast solicitation is sent 2395 * during the DELAY state. 2396 */ 2397 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2398 ncec->ncec_pcnt, 2399 inet_ntop((isv6? AF_INET6 : AF_INET), 2400 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2401 if (NCE_PUBLISH(ncec)) { 2402 mutex_exit(&ncec->ncec_lock); 2403 /* 2404 * send out a probe; note that src_ill 2405 * is ignored by nce_dad() for all 2406 * DAD message types other than IPv6 2407 * unicast probes 2408 */ 2409 nce_dad(ncec, src_ill, B_TRUE); 2410 } else { 2411 ASSERT(src_ill != NULL); 2412 if (isv6) { 2413 mutex_exit(&ncec->ncec_lock); 2414 dropped = ndp_xmit(src_ill, 2415 ND_NEIGHBOR_SOLICIT, 2416 src_ill->ill_phys_addr, 2417 src_ill->ill_phys_addr_length, 2418 &sender6, &ncec->ncec_addr, 2419 NDP_UNICAST); 2420 } else { 2421 /* 2422 * since the nce is REACHABLE, 2423 * the ARP request will be sent out 2424 * as a link-layer unicast. 2425 */ 2426 dropped = (arp_request(ncec, sender4, 2427 src_ill) == 0); 2428 mutex_exit(&ncec->ncec_lock); 2429 } 2430 if (!dropped) { 2431 mutex_enter(&ncec->ncec_lock); 2432 ncec->ncec_pcnt--; 2433 mutex_exit(&ncec->ncec_lock); 2434 } 2435 nce_restart_timer(ncec, 2436 ill->ill_reachable_retrans_time); 2437 } 2438 } else if (ncec->ncec_pcnt < 0) { 2439 /* No hope, delete the ncec */ 2440 /* Tell datapath it went bad */ 2441 ncec->ncec_state = ND_UNREACHABLE; 2442 mutex_exit(&ncec->ncec_lock); 2443 if (ip_debug > 2) { 2444 /* ip1dbg */ 2445 pr_addr_dbg("nce_timer: Delete NCE for" 2446 " dst %s\n", (isv6? AF_INET6: AF_INET), 2447 &ncec->ncec_addr); 2448 } 2449 /* if static ARP can't delete. */ 2450 if ((ncec->ncec_flags & NCE_F_STATIC) == 0) 2451 ncec_delete(ncec); 2452 2453 } else if (!NCE_PUBLISH(ncec)) { 2454 /* 2455 * Probe count is 0 for a dynamic entry (one that we 2456 * ourselves are not publishing). We should never get 2457 * here if NONUD was requested, hence the ASSERT below. 2458 */ 2459 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); 2460 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2461 ncec->ncec_pcnt, inet_ntop(AF_INET6, 2462 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2463 ncec->ncec_pcnt--; 2464 mutex_exit(&ncec->ncec_lock); 2465 /* Wait one interval before killing */ 2466 nce_restart_timer(ncec, 2467 ill->ill_reachable_retrans_time); 2468 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { 2469 ipif_t *ipif; 2470 ipaddr_t ncec_addr; 2471 2472 /* 2473 * We're done probing, and we can now declare this 2474 * address to be usable. Let IP know that it's ok to 2475 * use. 2476 */ 2477 ncec->ncec_state = ND_REACHABLE; 2478 ncec->ncec_flags &= ~NCE_F_UNVERIFIED; 2479 mutex_exit(&ncec->ncec_lock); 2480 if (isv6) { 2481 ipif = ipif_lookup_addr_exact_v6( 2482 &ncec->ncec_addr, ill, ipst); 2483 } else { 2484 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 2485 ncec_addr); 2486 ipif = ipif_lookup_addr_exact(ncec_addr, ill, 2487 ipst); 2488 } 2489 if (ipif != NULL) { 2490 if (ipif->ipif_was_dup) { 2491 char ibuf[LIFNAMSIZ]; 2492 char sbuf[INET6_ADDRSTRLEN]; 2493 2494 ipif->ipif_was_dup = B_FALSE; 2495 (void) inet_ntop(AF_INET6, 2496 &ipif->ipif_v6lcl_addr, 2497 sbuf, sizeof (sbuf)); 2498 ipif_get_name(ipif, ibuf, 2499 sizeof (ibuf)); 2500 cmn_err(CE_NOTE, "recovered address " 2501 "%s on %s", sbuf, ibuf); 2502 } 2503 if ((ipif->ipif_flags & IPIF_UP) && 2504 !ipif->ipif_addr_ready) 2505 ipif_up_notify(ipif); 2506 ipif->ipif_addr_ready = 1; 2507 ipif_refrele(ipif); 2508 } 2509 if (!isv6 && arp_no_defense) 2510 break; 2511 /* Begin defending our new address */ 2512 if (ncec->ncec_unsolicit_count > 0) { 2513 ncec->ncec_unsolicit_count--; 2514 if (isv6) { 2515 dropped = ndp_announce(ncec); 2516 } else { 2517 dropped = arp_announce(ncec); 2518 } 2519 2520 if (dropped) 2521 ncec->ncec_unsolicit_count++; 2522 else 2523 ncec->ncec_last_time_defended = 2524 ddi_get_lbolt(); 2525 } 2526 if (ncec->ncec_unsolicit_count > 0) { 2527 nce_restart_timer(ncec, 2528 ANNOUNCE_INTERVAL(isv6)); 2529 } else if (DEFENSE_INTERVAL(isv6) != 0) { 2530 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2531 } 2532 } else { 2533 /* 2534 * This is an address we're probing to be our own, but 2535 * the ill is down. Wait until it comes back before 2536 * doing anything, but switch to reachable state so 2537 * that the restart will work. 2538 */ 2539 ncec->ncec_state = ND_REACHABLE; 2540 mutex_exit(&ncec->ncec_lock); 2541 } 2542 break; 2543 case ND_INCOMPLETE: { 2544 mblk_t *mp, *nextmp; 2545 mblk_t **prevmpp; 2546 2547 /* 2548 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp 2549 * for any IPMP probe packets, and toss them. IPMP probe 2550 * packets will always be at the head of ncec_qd_mp, so that 2551 * we can stop at the first queued ND packet that is 2552 * not a probe packet. 2553 */ 2554 prevmpp = &ncec->ncec_qd_mp; 2555 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { 2556 nextmp = mp->b_next; 2557 2558 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { 2559 inet_freemsg(mp); 2560 ncec->ncec_nprobes--; 2561 *prevmpp = nextmp; 2562 } else { 2563 prevmpp = &mp->b_next; 2564 } 2565 } 2566 2567 /* 2568 * Must be resolver's retransmit timer. 2569 */ 2570 mutex_exit(&ncec->ncec_lock); 2571 ip_ndp_resolve(ncec); 2572 break; 2573 } 2574 case ND_REACHABLE: 2575 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && 2576 ncec->ncec_unsolicit_count != 0) || 2577 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { 2578 if (ncec->ncec_unsolicit_count > 0) { 2579 ncec->ncec_unsolicit_count--; 2580 mutex_exit(&ncec->ncec_lock); 2581 /* 2582 * When we get to zero announcements left, 2583 * switch to address defense 2584 */ 2585 } else { 2586 boolean_t rate_limit; 2587 2588 mutex_exit(&ncec->ncec_lock); 2589 rate_limit = ill_defend_rate_limit(ill, ncec); 2590 if (rate_limit) { 2591 nce_restart_timer(ncec, 2592 DEFENSE_INTERVAL(isv6)); 2593 break; 2594 } 2595 } 2596 if (isv6) { 2597 dropped = ndp_announce(ncec); 2598 } else { 2599 dropped = arp_announce(ncec); 2600 } 2601 mutex_enter(&ncec->ncec_lock); 2602 if (dropped) { 2603 ncec->ncec_unsolicit_count++; 2604 } else { 2605 ncec->ncec_last_time_defended = 2606 ddi_get_lbolt(); 2607 } 2608 mutex_exit(&ncec->ncec_lock); 2609 if (ncec->ncec_unsolicit_count != 0) { 2610 nce_restart_timer(ncec, 2611 ANNOUNCE_INTERVAL(isv6)); 2612 } else { 2613 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2614 } 2615 } else { 2616 mutex_exit(&ncec->ncec_lock); 2617 } 2618 break; 2619 default: 2620 mutex_exit(&ncec->ncec_lock); 2621 break; 2622 } 2623 done: 2624 ncec_refrele(ncec); 2625 ill_refrele(src_ill); 2626 } 2627 2628 /* 2629 * Set a link layer address from the ll_addr passed in. 2630 * Copy SAP from ill. 2631 */ 2632 static void 2633 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) 2634 { 2635 ill_t *ill = ncec->ncec_ill; 2636 2637 ASSERT(ll_addr != NULL); 2638 if (ill->ill_phys_addr_length > 0) { 2639 /* 2640 * The bcopy() below used to be called for the physical address 2641 * length rather than the link layer address length. For 2642 * ethernet and many other media, the phys_addr and lla are 2643 * identical. 2644 * 2645 * The phys_addr and lla may not be the same for devices that 2646 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently 2647 * no known instances of these. 2648 * 2649 * For PPP or other interfaces with a zero length 2650 * physical address, don't do anything here. 2651 * The bcopy() with a zero phys_addr length was previously 2652 * a no-op for interfaces with a zero-length physical address. 2653 * Using the lla for them would change the way they operate. 2654 * Doing nothing in such cases preserves expected behavior. 2655 */ 2656 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); 2657 } 2658 } 2659 2660 boolean_t 2661 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, 2662 uint32_t ll_addr_len) 2663 { 2664 ASSERT(ncec->ncec_lladdr != NULL); 2665 if (ll_addr == NULL) 2666 return (B_FALSE); 2667 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) 2668 return (B_TRUE); 2669 return (B_FALSE); 2670 } 2671 2672 /* 2673 * Updates the link layer address or the reachability state of 2674 * a cache entry. Reset probe counter if needed. 2675 */ 2676 void 2677 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) 2678 { 2679 ill_t *ill = ncec->ncec_ill; 2680 boolean_t need_stop_timer = B_FALSE; 2681 boolean_t need_fastpath_update = B_FALSE; 2682 nce_t *nce = NULL; 2683 timeout_id_t tid; 2684 2685 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2686 /* 2687 * If this interface does not do NUD, there is no point 2688 * in allowing an update to the cache entry. Although 2689 * we will respond to NS. 2690 * The only time we accept an update for a resolver when 2691 * NUD is turned off is when it has just been created. 2692 * Non-Resolvers will always be created as REACHABLE. 2693 */ 2694 if (new_state != ND_UNCHANGED) { 2695 if ((ncec->ncec_flags & NCE_F_NONUD) && 2696 (ncec->ncec_state != ND_INCOMPLETE)) 2697 return; 2698 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); 2699 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); 2700 need_stop_timer = B_TRUE; 2701 if (new_state == ND_REACHABLE) 2702 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64()); 2703 else { 2704 /* We force NUD in this case */ 2705 ncec->ncec_last = 0; 2706 } 2707 ncec->ncec_state = new_state; 2708 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2709 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || 2710 new_state == ND_INCOMPLETE); 2711 } 2712 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2713 tid = ncec->ncec_timeout_id; 2714 ncec->ncec_timeout_id = 0; 2715 } 2716 /* 2717 * Re-trigger fastpath probe and 2718 * overwrite the DL_UNITDATA_REQ data, noting we'll lose 2719 * whatever packets that happens to be transmitting at the time. 2720 */ 2721 if (new_ll_addr != NULL) { 2722 bcopy(new_ll_addr, ncec->ncec_lladdr, 2723 ill->ill_phys_addr_length); 2724 need_fastpath_update = B_TRUE; 2725 } 2726 mutex_exit(&ncec->ncec_lock); 2727 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2728 if (tid != 0) 2729 (void) untimeout(tid); 2730 } 2731 if (need_fastpath_update) { 2732 /* 2733 * Delete any existing existing dlur_mp and fp_mp information. 2734 * For IPMP interfaces, all underlying ill's must be checked 2735 * and purged. 2736 */ 2737 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 2738 /* 2739 * add the new dlur_mp and fp_mp 2740 */ 2741 nce = nce_fastpath(ncec, B_TRUE, NULL); 2742 if (nce != NULL) 2743 nce_refrele(nce); 2744 } 2745 mutex_enter(&ncec->ncec_lock); 2746 } 2747 2748 static void 2749 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2750 { 2751 uint_t count = 0; 2752 mblk_t **mpp, *tmp; 2753 2754 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2755 2756 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { 2757 if (++count > ncec->ncec_ill->ill_max_buf) { 2758 tmp = ncec->ncec_qd_mp->b_next; 2759 ncec->ncec_qd_mp->b_next = NULL; 2760 /* 2761 * if we never create data addrs on the under_ill 2762 * does this matter? 2763 */ 2764 BUMP_MIB(ncec->ncec_ill->ill_ip_mib, 2765 ipIfStatsOutDiscards); 2766 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, 2767 ncec->ncec_ill); 2768 freemsg(ncec->ncec_qd_mp); 2769 ncec->ncec_qd_mp = tmp; 2770 } 2771 } 2772 2773 if (head_insert) { 2774 ncec->ncec_nprobes++; 2775 mp->b_next = ncec->ncec_qd_mp; 2776 ncec->ncec_qd_mp = mp; 2777 } else { 2778 *mpp = mp; 2779 } 2780 } 2781 2782 /* 2783 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be 2784 * queued at the head or tail of the queue based on the input argument 2785 * 'head_insert'. The caller should specify this argument as B_TRUE if this 2786 * packet is an IPMP probe packet, in which case the following happens: 2787 * 2788 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal 2789 * (non-ipmp_probe) load-speading case where the source address of the ND 2790 * packet is not tied to ncec_ill. If the ill bound to the source address 2791 * cannot receive, the response to the ND packet will not be received. 2792 * However, if ND packets for ncec_ill's probes are queued behind that ND 2793 * packet, those probes will also fail to be sent, and thus in.mpathd will 2794 * erroneously conclude that ncec_ill has also failed. 2795 * 2796 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on 2797 * the first attempt. This ensures that ND problems do not manifest as 2798 * probe RTT spikes. 2799 * 2800 * We achieve this by inserting ipmp_probe() packets at the head of the 2801 * nce_queue. 2802 * 2803 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, 2804 * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 2805 */ 2806 void 2807 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2808 { 2809 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2810 nce_queue_mp_common(ncec, mp, head_insert); 2811 } 2812 2813 /* 2814 * Called when address resolution failed due to a timeout. 2815 * Send an ICMP unreachable in response to all queued packets. 2816 */ 2817 void 2818 ndp_resolv_failed(ncec_t *ncec) 2819 { 2820 mblk_t *mp, *nxt_mp; 2821 char buf[INET6_ADDRSTRLEN]; 2822 ill_t *ill = ncec->ncec_ill; 2823 ip_recv_attr_t iras; 2824 2825 bzero(&iras, sizeof (iras)); 2826 iras.ira_flags = 0; 2827 /* 2828 * we are setting the ira_rill to the ipmp_ill (instead of 2829 * the actual ill on which the packet was received), but this 2830 * is ok because we don't actually need the real ira_rill. 2831 * to send the icmp unreachable to the sender. 2832 */ 2833 iras.ira_ill = iras.ira_rill = ill; 2834 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2835 iras.ira_rifindex = iras.ira_ruifindex; 2836 2837 ip1dbg(("ndp_resolv_failed: dst %s\n", 2838 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 2839 mutex_enter(&ncec->ncec_lock); 2840 mp = ncec->ncec_qd_mp; 2841 ncec->ncec_qd_mp = NULL; 2842 ncec->ncec_nprobes = 0; 2843 mutex_exit(&ncec->ncec_lock); 2844 while (mp != NULL) { 2845 nxt_mp = mp->b_next; 2846 mp->b_next = NULL; 2847 2848 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2849 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 2850 mp, ill); 2851 icmp_unreachable_v6(mp, 2852 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); 2853 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2854 mp = nxt_mp; 2855 } 2856 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 2857 } 2858 2859 /* 2860 * Handle the completion of NDP and ARP resolution. 2861 */ 2862 void 2863 nce_resolv_ok(ncec_t *ncec) 2864 { 2865 mblk_t *mp; 2866 uint_t pkt_len; 2867 iaflags_t ixaflags = IXAF_NO_TRACE; 2868 nce_t *nce; 2869 ill_t *ill = ncec->ncec_ill; 2870 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2871 ip_stack_t *ipst = ill->ill_ipst; 2872 2873 if (IS_IPMP(ncec->ncec_ill)) { 2874 nce_resolv_ipmp_ok(ncec); 2875 return; 2876 } 2877 /* non IPMP case */ 2878 2879 mutex_enter(&ncec->ncec_lock); 2880 ASSERT(ncec->ncec_nprobes == 0); 2881 mp = ncec->ncec_qd_mp; 2882 ncec->ncec_qd_mp = NULL; 2883 mutex_exit(&ncec->ncec_lock); 2884 2885 while (mp != NULL) { 2886 mblk_t *nxt_mp; 2887 2888 if (ill->ill_isv6) { 2889 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2890 2891 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 2892 } else { 2893 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2894 2895 ixaflags |= IXAF_IS_IPV4; 2896 pkt_len = ntohs(ipha->ipha_length); 2897 } 2898 nxt_mp = mp->b_next; 2899 mp->b_next = NULL; 2900 /* 2901 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no 2902 * longer available, but it's ok to drop this flag because TCP 2903 * has its own flow-control in effect, so TCP packets 2904 * are not likely to get here when flow-control is in effect. 2905 */ 2906 mutex_enter(&ill->ill_lock); 2907 nce = nce_lookup(ill, &ncec->ncec_addr); 2908 mutex_exit(&ill->ill_lock); 2909 2910 if (nce == NULL) { 2911 if (isv6) { 2912 BUMP_MIB(&ipst->ips_ip6_mib, 2913 ipIfStatsOutDiscards); 2914 } else { 2915 BUMP_MIB(&ipst->ips_ip_mib, 2916 ipIfStatsOutDiscards); 2917 } 2918 ip_drop_output("ipIfStatsOutDiscards - no nce", 2919 mp, NULL); 2920 freemsg(mp); 2921 } else { 2922 /* 2923 * We don't know the zoneid, but 2924 * ip_xmit does not care since IXAF_NO_TRACE 2925 * is set. (We traced the packet the first 2926 * time through ip_xmit.) 2927 */ 2928 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, 2929 ALL_ZONES, 0, NULL); 2930 nce_refrele(nce); 2931 } 2932 mp = nxt_mp; 2933 } 2934 2935 ncec_cb_dispatch(ncec); /* complete callbacks */ 2936 } 2937 2938 /* 2939 * Called by SIOCSNDP* ioctl to add/change an ncec entry 2940 * and the corresponding attributes. 2941 * Disallow states other than ND_REACHABLE or ND_STALE. 2942 */ 2943 int 2944 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) 2945 { 2946 sin6_t *sin6; 2947 in6_addr_t *addr; 2948 ncec_t *ncec; 2949 nce_t *nce; 2950 int err = 0; 2951 uint16_t new_flags = 0; 2952 uint16_t old_flags = 0; 2953 int inflags = lnr->lnr_flags; 2954 ip_stack_t *ipst = ill->ill_ipst; 2955 boolean_t do_postprocess = B_FALSE; 2956 2957 ASSERT(ill->ill_isv6); 2958 if ((lnr->lnr_state_create != ND_REACHABLE) && 2959 (lnr->lnr_state_create != ND_STALE)) 2960 return (EINVAL); 2961 2962 sin6 = (sin6_t *)&lnr->lnr_addr; 2963 addr = &sin6->sin6_addr; 2964 2965 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 2966 ASSERT(!IS_UNDER_IPMP(ill)); 2967 nce = nce_lookup_addr(ill, addr); 2968 if (nce != NULL) 2969 new_flags = nce->nce_common->ncec_flags; 2970 2971 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { 2972 case NDF_ISROUTER_ON: 2973 new_flags |= NCE_F_ISROUTER; 2974 break; 2975 case NDF_ISROUTER_OFF: 2976 new_flags &= ~NCE_F_ISROUTER; 2977 break; 2978 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): 2979 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2980 if (nce != NULL) 2981 nce_refrele(nce); 2982 return (EINVAL); 2983 } 2984 if (inflags & NDF_STATIC) 2985 new_flags |= NCE_F_STATIC; 2986 2987 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { 2988 case NDF_ANYCAST_ON: 2989 new_flags |= NCE_F_ANYCAST; 2990 break; 2991 case NDF_ANYCAST_OFF: 2992 new_flags &= ~NCE_F_ANYCAST; 2993 break; 2994 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): 2995 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2996 if (nce != NULL) 2997 nce_refrele(nce); 2998 return (EINVAL); 2999 } 3000 3001 if (nce == NULL) { 3002 err = nce_add_v6(ill, 3003 (uchar_t *)lnr->lnr_hdw_addr, 3004 ill->ill_phys_addr_length, 3005 addr, 3006 new_flags, 3007 lnr->lnr_state_create, 3008 &nce); 3009 if (err != 0) { 3010 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3011 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); 3012 return (err); 3013 } else { 3014 do_postprocess = B_TRUE; 3015 } 3016 } 3017 ncec = nce->nce_common; 3018 old_flags = ncec->ncec_flags; 3019 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { 3020 ncec_router_to_host(ncec); 3021 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3022 if (do_postprocess) 3023 err = nce_add_v6_postprocess(nce); 3024 nce_refrele(nce); 3025 return (0); 3026 } 3027 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3028 3029 if (do_postprocess) 3030 err = nce_add_v6_postprocess(nce); 3031 /* 3032 * err cannot be anything other than 0 because we don't support 3033 * proxy arp of static addresses. 3034 */ 3035 ASSERT(err == 0); 3036 3037 mutex_enter(&ncec->ncec_lock); 3038 ncec->ncec_flags = new_flags; 3039 mutex_exit(&ncec->ncec_lock); 3040 /* 3041 * Note that we ignore the state at this point, which 3042 * should be either STALE or REACHABLE. Instead we let 3043 * the link layer address passed in to determine the state 3044 * much like incoming packets. 3045 */ 3046 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); 3047 nce_refrele(nce); 3048 return (0); 3049 } 3050 3051 /* 3052 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up 3053 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must 3054 * be held to ensure that they are in the same group. 3055 */ 3056 static nce_t * 3057 nce_fastpath_create(ill_t *ill, ncec_t *ncec) 3058 { 3059 3060 nce_t *nce; 3061 3062 nce = nce_ill_lookup_then_add(ill, ncec); 3063 3064 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3065 return (nce); 3066 3067 /* 3068 * hold the ncec_lock to synchronize with nce_update() so that, 3069 * at the end of this function, the contents of nce_dlur_mp are 3070 * consistent with ncec->ncec_lladdr, even though some intermediate 3071 * packet may have been sent out with a mangled address, which would 3072 * only be a transient condition. 3073 */ 3074 mutex_enter(&ncec->ncec_lock); 3075 if (ncec->ncec_lladdr != NULL) { 3076 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr + 3077 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length); 3078 } else { 3079 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap, 3080 ill->ill_sap_length); 3081 } 3082 mutex_exit(&ncec->ncec_lock); 3083 return (nce); 3084 } 3085 3086 /* 3087 * we make nce_fp_mp to have an M_DATA prepend. 3088 * The caller ensures there is hold on ncec for this function. 3089 * Note that since ill_fastpath_probe() copies the mblk there is 3090 * no need to hold the nce or ncec beyond this function. 3091 * 3092 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that 3093 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill 3094 * and will be returned back by this function, so that no extra nce_refrele 3095 * is required for the caller. The calls from nce_add_common() use this 3096 * method. All other callers (that pass in NULL ncec_nce) will have to do a 3097 * nce_refrele of the returned nce (when it is non-null). 3098 */ 3099 nce_t * 3100 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) 3101 { 3102 nce_t *nce; 3103 ill_t *ill = ncec->ncec_ill; 3104 3105 ASSERT(ill != NULL); 3106 3107 if (IS_IPMP(ill) && trigger_fp_req) { 3108 trigger_fp_req = B_FALSE; 3109 ipmp_ncec_refresh_nce(ncec); 3110 } 3111 3112 /* 3113 * If the caller already has the nce corresponding to the ill, use 3114 * that one. Otherwise we have to lookup/add the nce. Calls from 3115 * nce_add_common() fall in the former category, and have just done 3116 * the nce lookup/add that can be reused. 3117 */ 3118 if (ncec_nce == NULL) 3119 nce = nce_fastpath_create(ill, ncec); 3120 else 3121 nce = ncec_nce; 3122 3123 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3124 return (nce); 3125 3126 if (trigger_fp_req) 3127 nce_fastpath_trigger(nce); 3128 return (nce); 3129 } 3130 3131 /* 3132 * Trigger fastpath on nce. No locks may be held. 3133 */ 3134 static void 3135 nce_fastpath_trigger(nce_t *nce) 3136 { 3137 int res; 3138 ill_t *ill = nce->nce_ill; 3139 ncec_t *ncec = nce->nce_common; 3140 3141 res = ill_fastpath_probe(ill, nce->nce_dlur_mp); 3142 /* 3143 * EAGAIN is an indication of a transient error 3144 * i.e. allocation failure etc. leave the ncec in the list it 3145 * will be updated when another probe happens for another ire 3146 * if not it will be taken out of the list when the ire is 3147 * deleted. 3148 */ 3149 if (res != 0 && res != EAGAIN && res != ENOTSUP) 3150 nce_fastpath_list_delete(ill, ncec, NULL); 3151 } 3152 3153 /* 3154 * Add ncec to the nce fastpath list on ill. 3155 */ 3156 static nce_t * 3157 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec) 3158 { 3159 nce_t *nce = NULL; 3160 3161 ASSERT(MUTEX_HELD(&ill->ill_lock)); 3162 /* 3163 * Atomically ensure that the ill is not CONDEMNED and is not going 3164 * down, before adding the NCE. 3165 */ 3166 if (ill->ill_state_flags & ILL_CONDEMNED) 3167 return (NULL); 3168 mutex_enter(&ncec->ncec_lock); 3169 /* 3170 * if ncec has not been deleted and 3171 * is not already in the list add it. 3172 */ 3173 if (!NCE_ISCONDEMNED(ncec)) { 3174 nce = nce_lookup(ill, &ncec->ncec_addr); 3175 if (nce != NULL) 3176 goto done; 3177 nce = nce_add(ill, ncec); 3178 } 3179 done: 3180 mutex_exit(&ncec->ncec_lock); 3181 return (nce); 3182 } 3183 3184 nce_t * 3185 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) 3186 { 3187 nce_t *nce; 3188 3189 mutex_enter(&ill->ill_lock); 3190 nce = nce_ill_lookup_then_add_locked(ill, ncec); 3191 mutex_exit(&ill->ill_lock); 3192 return (nce); 3193 } 3194 3195 3196 /* 3197 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted 3198 * nce is added to the 'dead' list, and the caller must nce_refrele() the 3199 * entry after all locks have been dropped. 3200 */ 3201 void 3202 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead) 3203 { 3204 nce_t *nce; 3205 3206 ASSERT(ill != NULL); 3207 3208 /* delete any nces referencing the ncec from underlying ills */ 3209 if (IS_IPMP(ill)) 3210 ipmp_ncec_delete_nce(ncec); 3211 3212 /* now the ill itself */ 3213 mutex_enter(&ill->ill_lock); 3214 for (nce = list_head(&ill->ill_nce); nce != NULL; 3215 nce = list_next(&ill->ill_nce, nce)) { 3216 if (nce->nce_common == ncec) { 3217 nce_refhold(nce); 3218 nce_delete(nce); 3219 break; 3220 } 3221 } 3222 mutex_exit(&ill->ill_lock); 3223 if (nce != NULL) { 3224 if (dead == NULL) 3225 nce_refrele(nce); 3226 else 3227 list_insert_tail(dead, nce); 3228 } 3229 } 3230 3231 /* 3232 * when the fastpath response does not fit in the datab 3233 * associated with the existing nce_fp_mp, we delete and 3234 * add the nce to retrigger fastpath based on the information 3235 * in the ncec_t. 3236 */ 3237 static nce_t * 3238 nce_delete_then_add(nce_t *nce) 3239 { 3240 ill_t *ill = nce->nce_ill; 3241 nce_t *newnce = NULL; 3242 3243 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3244 (void *)nce, ill->ill_name)); 3245 mutex_enter(&ill->ill_lock); 3246 mutex_enter(&nce->nce_common->ncec_lock); 3247 nce_delete(nce); 3248 /* 3249 * Make sure that ncec is not condemned before adding. We hold the 3250 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3251 * ipmp_ncec_delete_nce() 3252 */ 3253 if (!NCE_ISCONDEMNED(nce->nce_common)) 3254 newnce = nce_add(ill, nce->nce_common); 3255 mutex_exit(&nce->nce_common->ncec_lock); 3256 mutex_exit(&ill->ill_lock); 3257 nce_refrele(nce); 3258 return (newnce); /* could be null if nomem */ 3259 } 3260 3261 typedef struct nce_fp_match_s { 3262 nce_t *nce_fp_match_res; 3263 mblk_t *nce_fp_match_ack_mp; 3264 } nce_fp_match_t; 3265 3266 /* ARGSUSED */ 3267 static int 3268 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3269 { 3270 nce_fp_match_t *nce_fp_marg = arg; 3271 ncec_t *ncec = nce->nce_common; 3272 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3273 uchar_t *mp_rptr, *ud_mp_rptr; 3274 mblk_t *ud_mp = nce->nce_dlur_mp; 3275 ptrdiff_t cmplen; 3276 3277 /* 3278 * mp is the mp associated with the fastpath ack. 3279 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3280 * under consideration. If the contents match, then the 3281 * fastpath ack is used to update the nce. 3282 */ 3283 if (ud_mp == NULL) 3284 return (0); 3285 mp_rptr = mp->b_rptr; 3286 cmplen = mp->b_wptr - mp_rptr; 3287 ASSERT(cmplen >= 0); 3288 3289 ud_mp_rptr = ud_mp->b_rptr; 3290 /* 3291 * The ncec is locked here to prevent any other threads from accessing 3292 * and changing nce_dlur_mp when the address becomes resolved to an 3293 * lla while we're in the middle of looking at and comparing the 3294 * hardware address (lla). It is also locked to prevent multiple 3295 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3296 * time. 3297 */ 3298 mutex_enter(&ncec->ncec_lock); 3299 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3300 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3301 nce_fp_marg->nce_fp_match_res = nce; 3302 mutex_exit(&ncec->ncec_lock); 3303 nce_refhold(nce); 3304 return (1); 3305 } 3306 mutex_exit(&ncec->ncec_lock); 3307 return (0); 3308 } 3309 3310 /* 3311 * Update all NCE's that are not in fastpath mode and 3312 * have an nce_fp_mp that matches mp. mp->b_cont contains 3313 * the fastpath header. 3314 * 3315 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3316 */ 3317 void 3318 nce_fastpath_update(ill_t *ill, mblk_t *mp) 3319 { 3320 nce_fp_match_t nce_fp_marg; 3321 nce_t *nce; 3322 mblk_t *nce_fp_mp, *fp_mp; 3323 3324 nce_fp_marg.nce_fp_match_res = NULL; 3325 nce_fp_marg.nce_fp_match_ack_mp = mp; 3326 3327 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); 3328 3329 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) 3330 return; 3331 3332 mutex_enter(&nce->nce_lock); 3333 nce_fp_mp = nce->nce_fp_mp; 3334 3335 if (nce_fp_mp != NULL) { 3336 fp_mp = mp->b_cont; 3337 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > 3338 nce_fp_mp->b_datap->db_lim) { 3339 mutex_exit(&nce->nce_lock); 3340 nce = nce_delete_then_add(nce); 3341 if (nce == NULL) { 3342 return; 3343 } 3344 mutex_enter(&nce->nce_lock); 3345 nce_fp_mp = nce->nce_fp_mp; 3346 } 3347 } 3348 3349 /* Matched - install mp as the fastpath mp */ 3350 if (nce_fp_mp == NULL) { 3351 fp_mp = dupb(mp->b_cont); 3352 nce->nce_fp_mp = fp_mp; 3353 } else { 3354 fp_mp = mp->b_cont; 3355 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); 3356 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr 3357 + MBLKL(fp_mp); 3358 } 3359 mutex_exit(&nce->nce_lock); 3360 nce_refrele(nce); 3361 } 3362 3363 /* 3364 * Return a pointer to a given option in the packet. 3365 * Assumes that option part of the packet have already been validated. 3366 */ 3367 nd_opt_hdr_t * 3368 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) 3369 { 3370 while (optlen > 0) { 3371 if (opt->nd_opt_type == opt_type) 3372 return (opt); 3373 optlen -= 8 * opt->nd_opt_len; 3374 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3375 } 3376 return (NULL); 3377 } 3378 3379 /* 3380 * Verify all option lengths present are > 0, also check to see 3381 * if the option lengths and packet length are consistent. 3382 */ 3383 boolean_t 3384 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) 3385 { 3386 ASSERT(opt != NULL); 3387 while (optlen > 0) { 3388 if (opt->nd_opt_len == 0) 3389 return (B_FALSE); 3390 optlen -= 8 * opt->nd_opt_len; 3391 if (optlen < 0) 3392 return (B_FALSE); 3393 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3394 } 3395 return (B_TRUE); 3396 } 3397 3398 /* 3399 * ncec_walk function. 3400 * Free a fraction of the NCE cache entries. 3401 * 3402 * A possible optimization here would be to use ncec_last where possible, and 3403 * delete the least-frequently used entry, which would require more complex 3404 * computation as we walk through the ncec's (e.g., track ncec entries by 3405 * order of ncec_last and/or maintain state) 3406 */ 3407 static void 3408 ncec_cache_reclaim(ncec_t *ncec, char *arg) 3409 { 3410 ip_stack_t *ipst = ncec->ncec_ipst; 3411 uint_t fraction = *(uint_t *)arg; 3412 uint_t rand; 3413 3414 if ((ncec->ncec_flags & 3415 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { 3416 return; 3417 } 3418 3419 rand = (uint_t)ddi_get_lbolt() + 3420 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); 3421 if ((rand/fraction)*fraction == rand) { 3422 IP_STAT(ipst, ip_nce_reclaim_deleted); 3423 ncec_delete(ncec); 3424 } 3425 } 3426 3427 /* 3428 * kmem_cache callback to free up memory. 3429 * 3430 * For now we just delete a fixed fraction. 3431 */ 3432 static void 3433 ip_nce_reclaim_stack(ip_stack_t *ipst) 3434 { 3435 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction; 3436 3437 IP_STAT(ipst, ip_nce_reclaim_calls); 3438 3439 ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst); 3440 3441 /* 3442 * Walk all CONNs that can have a reference on an ire, ncec or dce. 3443 * Get them to update any stale references to drop any refholds they 3444 * have. 3445 */ 3446 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 3447 } 3448 3449 /* 3450 * Called by the memory allocator subsystem directly, when the system 3451 * is running low on memory. 3452 */ 3453 /* ARGSUSED */ 3454 void 3455 ip_nce_reclaim(void *args) 3456 { 3457 netstack_handle_t nh; 3458 netstack_t *ns; 3459 ip_stack_t *ipst; 3460 3461 netstack_next_init(&nh); 3462 while ((ns = netstack_next(&nh)) != NULL) { 3463 /* 3464 * netstack_next() can return a netstack_t with a NULL 3465 * netstack_ip at boot time. 3466 */ 3467 if ((ipst = ns->netstack_ip) == NULL) { 3468 netstack_rele(ns); 3469 continue; 3470 } 3471 ip_nce_reclaim_stack(ipst); 3472 netstack_rele(ns); 3473 } 3474 netstack_next_fini(&nh); 3475 } 3476 3477 #ifdef DEBUG 3478 void 3479 ncec_trace_ref(ncec_t *ncec) 3480 { 3481 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3482 3483 if (ncec->ncec_trace_disable) 3484 return; 3485 3486 if (!th_trace_ref(ncec, ncec->ncec_ipst)) { 3487 ncec->ncec_trace_disable = B_TRUE; 3488 ncec_trace_cleanup(ncec); 3489 } 3490 } 3491 3492 void 3493 ncec_untrace_ref(ncec_t *ncec) 3494 { 3495 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3496 3497 if (!ncec->ncec_trace_disable) 3498 th_trace_unref(ncec); 3499 } 3500 3501 static void 3502 ncec_trace_cleanup(const ncec_t *ncec) 3503 { 3504 th_trace_cleanup(ncec, ncec->ncec_trace_disable); 3505 } 3506 #endif 3507 3508 /* 3509 * Called when address resolution fails due to a timeout. 3510 * Send an ICMP unreachable in response to all queued packets. 3511 */ 3512 void 3513 arp_resolv_failed(ncec_t *ncec) 3514 { 3515 mblk_t *mp, *nxt_mp; 3516 char buf[INET6_ADDRSTRLEN]; 3517 struct in_addr ipv4addr; 3518 ill_t *ill = ncec->ncec_ill; 3519 ip_stack_t *ipst = ncec->ncec_ipst; 3520 ip_recv_attr_t iras; 3521 3522 bzero(&iras, sizeof (iras)); 3523 iras.ira_flags = IRAF_IS_IPV4; 3524 /* 3525 * we are setting the ira_rill to the ipmp_ill (instead of 3526 * the actual ill on which the packet was received), but this 3527 * is ok because we don't actually need the real ira_rill. 3528 * to send the icmp unreachable to the sender. 3529 */ 3530 iras.ira_ill = iras.ira_rill = ill; 3531 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3532 iras.ira_rifindex = iras.ira_ruifindex; 3533 3534 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr); 3535 ip3dbg(("arp_resolv_failed: dst %s\n", 3536 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); 3537 mutex_enter(&ncec->ncec_lock); 3538 mp = ncec->ncec_qd_mp; 3539 ncec->ncec_qd_mp = NULL; 3540 ncec->ncec_nprobes = 0; 3541 mutex_exit(&ncec->ncec_lock); 3542 while (mp != NULL) { 3543 nxt_mp = mp->b_next; 3544 mp->b_next = NULL; 3545 3546 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3547 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 3548 mp, ill); 3549 if (ipst->ips_ip_arp_icmp_error) { 3550 ip3dbg(("arp_resolv_failed: " 3551 "Calling icmp_unreachable\n")); 3552 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 3553 } else { 3554 freemsg(mp); 3555 } 3556 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3557 mp = nxt_mp; 3558 } 3559 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 3560 } 3561 3562 /* 3563 * if ill is an under_ill, translate it to the ipmp_ill and add the 3564 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and 3565 * one on the underlying in_ill) will be created for the 3566 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill. 3567 */ 3568 int 3569 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3570 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3571 { 3572 int err; 3573 in6_addr_t addr6; 3574 ip_stack_t *ipst = ill->ill_ipst; 3575 nce_t *nce, *upper_nce = NULL; 3576 ill_t *in_ill = ill, *under = NULL; 3577 boolean_t need_ill_refrele = B_FALSE; 3578 3579 if (flags & NCE_F_MCAST) { 3580 /* 3581 * hw_addr will be figured out in nce_set_multicast_v4; 3582 * caller needs to pass in the cast_ill for ipmp 3583 */ 3584 ASSERT(hw_addr == NULL); 3585 ASSERT(!IS_IPMP(ill)); 3586 err = nce_set_multicast_v4(ill, addr, flags, newnce); 3587 return (err); 3588 } 3589 3590 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 3591 ill = ipmp_ill_hold_ipmp_ill(ill); 3592 if (ill == NULL) 3593 return (ENXIO); 3594 need_ill_refrele = B_TRUE; 3595 } 3596 if ((flags & NCE_F_BCAST) != 0) { 3597 /* 3598 * IPv4 broadcast ncec: compute the hwaddr. 3599 */ 3600 if (IS_IPMP(ill)) { 3601 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE); 3602 if (under == NULL) { 3603 if (need_ill_refrele) 3604 ill_refrele(ill); 3605 return (ENETDOWN); 3606 } 3607 hw_addr = under->ill_bcast_mp->b_rptr + 3608 NCE_LL_ADDR_OFFSET(under); 3609 hw_addr_len = under->ill_phys_addr_length; 3610 } else { 3611 hw_addr = ill->ill_bcast_mp->b_rptr + 3612 NCE_LL_ADDR_OFFSET(ill), 3613 hw_addr_len = ill->ill_phys_addr_length; 3614 } 3615 } 3616 3617 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3618 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3619 nce = nce_lookup_addr(ill, &addr6); 3620 if (nce == NULL) { 3621 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, 3622 state, &nce); 3623 } else { 3624 err = EEXIST; 3625 } 3626 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3627 if (err == 0) 3628 err = nce_add_v4_postprocess(nce); 3629 3630 if (in_ill != ill && nce != NULL) { 3631 nce_t *under_nce = NULL; 3632 3633 /* 3634 * in_ill was the under_ill. Try to create the under_nce. 3635 * Hold the ill_g_lock to prevent changes to group membership 3636 * until we are done. 3637 */ 3638 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3639 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 3640 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 3641 ill_t *, ill); 3642 rw_exit(&ipst->ips_ill_g_lock); 3643 err = ENXIO; 3644 nce_refrele(nce); 3645 nce = NULL; 3646 goto bail; 3647 } 3648 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 3649 if (under_nce == NULL) { 3650 rw_exit(&ipst->ips_ill_g_lock); 3651 err = EINVAL; 3652 nce_refrele(nce); 3653 nce = NULL; 3654 goto bail; 3655 } 3656 rw_exit(&ipst->ips_ill_g_lock); 3657 upper_nce = nce; 3658 nce = under_nce; /* will be returned to caller */ 3659 if (NCE_ISREACHABLE(nce->nce_common)) 3660 nce_fastpath_trigger(under_nce); 3661 } 3662 if (nce != NULL) { 3663 if (newnce != NULL) 3664 *newnce = nce; 3665 else 3666 nce_refrele(nce); 3667 } 3668 bail: 3669 if (under != NULL) 3670 ill_refrele(under); 3671 if (upper_nce != NULL) 3672 nce_refrele(upper_nce); 3673 if (need_ill_refrele) 3674 ill_refrele(ill); 3675 3676 return (err); 3677 } 3678 3679 /* 3680 * NDP Cache Entry creation routine for IPv4. 3681 * This routine must always be called with ndp4->ndp_g_lock held. 3682 * Prior to return, ncec_refcnt is incremented. 3683 * 3684 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 3685 * are always added pointing at the ipmp_ill. Thus, when the ill passed 3686 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 3687 * entries will be created, both pointing at the same ncec_t. The nce_t 3688 * entries will have their nce_ill set to the ipmp_ill and the under_ill 3689 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 3690 * Local addresses are always created on the ill passed to nce_add_v4. 3691 */ 3692 int 3693 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3694 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3695 { 3696 int err; 3697 boolean_t is_multicast = (flags & NCE_F_MCAST); 3698 struct in6_addr addr6; 3699 nce_t *nce; 3700 3701 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 3702 ASSERT(!ill->ill_isv6); 3703 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); 3704 3705 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3706 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, 3707 &nce); 3708 ASSERT(newnce != NULL); 3709 *newnce = nce; 3710 return (err); 3711 } 3712 3713 /* 3714 * Post-processing routine to be executed after nce_add_v4(). This function 3715 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 3716 * and must be called without any locks held. 3717 * 3718 * Always returns 0, but we return an int to keep this symmetric with the 3719 * IPv6 counter-part. 3720 */ 3721 int 3722 nce_add_v4_postprocess(nce_t *nce) 3723 { 3724 ncec_t *ncec = nce->nce_common; 3725 uint16_t flags = ncec->ncec_flags; 3726 boolean_t ndp_need_dad = B_FALSE; 3727 boolean_t dropped; 3728 clock_t delay; 3729 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; 3730 uchar_t *hw_addr = ncec->ncec_lladdr; 3731 boolean_t trigger_fastpath = B_TRUE; 3732 3733 /* 3734 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 3735 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 3736 * We call nce_fastpath from nce_update if the link layer address of 3737 * the peer changes from nce_update 3738 */ 3739 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && 3740 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) 3741 trigger_fastpath = B_FALSE; 3742 3743 if (trigger_fastpath) 3744 nce_fastpath_trigger(nce); 3745 3746 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 3747 /* 3748 * Either the caller (by passing in ND_PROBE) 3749 * or nce_add_common() (by the internally computed state 3750 * based on ncec_addr and ill_net_type) has determined 3751 * that this unicast entry needs DAD. Trigger DAD. 3752 */ 3753 ndp_need_dad = B_TRUE; 3754 } else if (flags & NCE_F_UNSOL_ADV) { 3755 /* 3756 * We account for the transmit below by assigning one 3757 * less than the ndd variable. Subsequent decrements 3758 * are done in nce_timer. 3759 */ 3760 mutex_enter(&ncec->ncec_lock); 3761 ncec->ncec_unsolicit_count = 3762 ipst->ips_ip_arp_publish_count - 1; 3763 mutex_exit(&ncec->ncec_lock); 3764 dropped = arp_announce(ncec); 3765 mutex_enter(&ncec->ncec_lock); 3766 if (dropped) 3767 ncec->ncec_unsolicit_count++; 3768 else 3769 ncec->ncec_last_time_defended = ddi_get_lbolt(); 3770 if (ncec->ncec_unsolicit_count != 0) { 3771 nce_start_timer(ncec, 3772 ipst->ips_ip_arp_publish_interval); 3773 } 3774 mutex_exit(&ncec->ncec_lock); 3775 } 3776 3777 /* 3778 * If ncec_xmit_interval is 0, user has configured us to send the first 3779 * probe right away. Do so, and set up for the subsequent probes. 3780 */ 3781 if (ndp_need_dad) { 3782 mutex_enter(&ncec->ncec_lock); 3783 if (ncec->ncec_pcnt == 0) { 3784 /* 3785 * DAD probes and announce can be 3786 * administratively disabled by setting the 3787 * probe_count to zero. Restart the timer in 3788 * this case to mark the ipif as ready. 3789 */ 3790 ncec->ncec_unsolicit_count = 0; 3791 mutex_exit(&ncec->ncec_lock); 3792 nce_restart_timer(ncec, 0); 3793 } else { 3794 mutex_exit(&ncec->ncec_lock); 3795 delay = ((ncec->ncec_flags & NCE_F_FAST) ? 3796 ipst->ips_arp_probe_delay : 3797 ipst->ips_arp_fastprobe_delay); 3798 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); 3799 } 3800 } 3801 return (0); 3802 } 3803 3804 /* 3805 * ncec_walk routine to update all entries that have a given destination or 3806 * gateway address and cached link layer (MAC) address. This is used when ARP 3807 * informs us that a network-to-link-layer mapping may have changed. 3808 */ 3809 void 3810 nce_update_hw_changed(ncec_t *ncec, void *arg) 3811 { 3812 nce_hw_map_t *hwm = arg; 3813 ipaddr_t ncec_addr; 3814 3815 if (ncec->ncec_state != ND_REACHABLE) 3816 return; 3817 3818 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 3819 if (ncec_addr != hwm->hwm_addr) 3820 return; 3821 3822 mutex_enter(&ncec->ncec_lock); 3823 if (hwm->hwm_flags != 0) 3824 ncec->ncec_flags = hwm->hwm_flags; 3825 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); 3826 mutex_exit(&ncec->ncec_lock); 3827 } 3828 3829 void 3830 ncec_refhold(ncec_t *ncec) 3831 { 3832 mutex_enter(&(ncec)->ncec_lock); 3833 (ncec)->ncec_refcnt++; 3834 ASSERT((ncec)->ncec_refcnt != 0); 3835 #ifdef DEBUG 3836 ncec_trace_ref(ncec); 3837 #endif 3838 mutex_exit(&(ncec)->ncec_lock); 3839 } 3840 3841 void 3842 ncec_refhold_notr(ncec_t *ncec) 3843 { 3844 mutex_enter(&(ncec)->ncec_lock); 3845 (ncec)->ncec_refcnt++; 3846 ASSERT((ncec)->ncec_refcnt != 0); 3847 mutex_exit(&(ncec)->ncec_lock); 3848 } 3849 3850 static void 3851 ncec_refhold_locked(ncec_t *ncec) 3852 { 3853 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); 3854 (ncec)->ncec_refcnt++; 3855 #ifdef DEBUG 3856 ncec_trace_ref(ncec); 3857 #endif 3858 } 3859 3860 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */ 3861 void 3862 ncec_refrele(ncec_t *ncec) 3863 { 3864 mutex_enter(&(ncec)->ncec_lock); 3865 #ifdef DEBUG 3866 ncec_untrace_ref(ncec); 3867 #endif 3868 ASSERT((ncec)->ncec_refcnt != 0); 3869 if (--(ncec)->ncec_refcnt == 0) { 3870 ncec_inactive(ncec); 3871 } else { 3872 mutex_exit(&(ncec)->ncec_lock); 3873 } 3874 } 3875 3876 void 3877 ncec_refrele_notr(ncec_t *ncec) 3878 { 3879 mutex_enter(&(ncec)->ncec_lock); 3880 ASSERT((ncec)->ncec_refcnt != 0); 3881 if (--(ncec)->ncec_refcnt == 0) { 3882 ncec_inactive(ncec); 3883 } else { 3884 mutex_exit(&(ncec)->ncec_lock); 3885 } 3886 } 3887 3888 /* 3889 * Common to IPv4 and IPv6. 3890 */ 3891 void 3892 nce_restart_timer(ncec_t *ncec, uint_t ms) 3893 { 3894 timeout_id_t tid; 3895 3896 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock)); 3897 3898 /* First cancel any running timer */ 3899 mutex_enter(&ncec->ncec_lock); 3900 tid = ncec->ncec_timeout_id; 3901 ncec->ncec_timeout_id = 0; 3902 if (tid != 0) { 3903 mutex_exit(&ncec->ncec_lock); 3904 (void) untimeout(tid); 3905 mutex_enter(&ncec->ncec_lock); 3906 } 3907 3908 /* Restart timer */ 3909 nce_start_timer(ncec, ms); 3910 mutex_exit(&ncec->ncec_lock); 3911 } 3912 3913 static void 3914 nce_start_timer(ncec_t *ncec, uint_t ms) 3915 { 3916 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3917 /* 3918 * Don't start the timer if the ncec has been deleted, or if the timer 3919 * is already running 3920 */ 3921 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) { 3922 ncec->ncec_timeout_id = timeout(nce_timer, ncec, 3923 MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms)); 3924 } 3925 } 3926 3927 int 3928 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 3929 uint16_t flags, nce_t **newnce) 3930 { 3931 uchar_t *hw_addr; 3932 int err = 0; 3933 ip_stack_t *ipst = ill->ill_ipst; 3934 in6_addr_t dst6; 3935 nce_t *nce; 3936 3937 ASSERT(!ill->ill_isv6); 3938 3939 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6); 3940 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3941 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) { 3942 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3943 goto done; 3944 } 3945 if (ill->ill_net_type == IRE_IF_RESOLVER) { 3946 /* 3947 * For IRE_IF_RESOLVER a hardware mapping can be 3948 * generated, for IRE_IF_NORESOLVER, resolution cookie 3949 * in the ill is copied in nce_add_v4(). 3950 */ 3951 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP); 3952 if (hw_addr == NULL) { 3953 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3954 return (ENOMEM); 3955 } 3956 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 3957 } else { 3958 /* 3959 * IRE_IF_NORESOLVER type simply copies the resolution 3960 * cookie passed in. So no hw_addr is needed. 3961 */ 3962 hw_addr = NULL; 3963 } 3964 ASSERT(flags & NCE_F_MCAST); 3965 ASSERT(flags & NCE_F_NONUD); 3966 /* nce_state will be computed by nce_add_common() */ 3967 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 3968 ND_UNCHANGED, &nce); 3969 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3970 if (err == 0) 3971 err = nce_add_v4_postprocess(nce); 3972 if (hw_addr != NULL) 3973 kmem_free(hw_addr, ill->ill_phys_addr_length); 3974 if (err != 0) { 3975 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); 3976 return (err); 3977 } 3978 done: 3979 if (newnce != NULL) 3980 *newnce = nce; 3981 else 3982 nce_refrele(nce); 3983 return (0); 3984 } 3985 3986 /* 3987 * This is used when scanning for "old" (least recently broadcast) NCEs. We 3988 * don't want to have to walk the list for every single one, so we gather up 3989 * batches at a time. 3990 */ 3991 #define NCE_RESCHED_LIST_LEN 8 3992 3993 typedef struct { 3994 ill_t *ncert_ill; 3995 uint_t ncert_num; 3996 ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN]; 3997 } nce_resched_t; 3998 3999 /* 4000 * Pick the longest waiting NCEs for defense. 4001 */ 4002 /* ARGSUSED */ 4003 static int 4004 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg) 4005 { 4006 nce_resched_t *ncert = arg; 4007 ncec_t **ncecs; 4008 ncec_t **ncec_max; 4009 ncec_t *ncec_temp; 4010 ncec_t *ncec = nce->nce_common; 4011 4012 ASSERT(ncec->ncec_ill == ncert->ncert_ill); 4013 /* 4014 * Only reachable entries that are ready for announcement are eligible. 4015 */ 4016 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE) 4017 return (0); 4018 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) { 4019 ncec_refhold(ncec); 4020 ncert->ncert_nces[ncert->ncert_num++] = ncec; 4021 } else { 4022 ncecs = ncert->ncert_nces; 4023 ncec_max = ncecs + NCE_RESCHED_LIST_LEN; 4024 ncec_refhold(ncec); 4025 for (; ncecs < ncec_max; ncecs++) { 4026 ASSERT(ncec != NULL); 4027 if ((*ncecs)->ncec_last_time_defended > 4028 ncec->ncec_last_time_defended) { 4029 ncec_temp = *ncecs; 4030 *ncecs = ncec; 4031 ncec = ncec_temp; 4032 } 4033 } 4034 ncec_refrele(ncec); 4035 } 4036 return (0); 4037 } 4038 4039 /* 4040 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this 4041 * doesn't happen very often (if at all), and thus it needn't be highly 4042 * optimized. (Note, though, that it's actually O(N) complexity, because the 4043 * outer loop is bounded by a constant rather than by the length of the list.) 4044 */ 4045 static void 4046 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert) 4047 { 4048 ncec_t *ncec; 4049 ip_stack_t *ipst = ill->ill_ipst; 4050 uint_t i, defend_rate; 4051 4052 i = ill->ill_defend_count; 4053 ill->ill_defend_count = 0; 4054 if (ill->ill_isv6) 4055 defend_rate = ipst->ips_ndp_defend_rate; 4056 else 4057 defend_rate = ipst->ips_arp_defend_rate; 4058 /* If none could be sitting around, then don't reschedule */ 4059 if (i < defend_rate) { 4060 DTRACE_PROBE1(reschedule_none, ill_t *, ill); 4061 return; 4062 } 4063 ncert->ncert_ill = ill; 4064 while (ill->ill_defend_count < defend_rate) { 4065 nce_walk_common(ill, ncec_reschedule, ncert); 4066 for (i = 0; i < ncert->ncert_num; i++) { 4067 4068 ncec = ncert->ncert_nces[i]; 4069 mutex_enter(&ncec->ncec_lock); 4070 ncec->ncec_flags |= NCE_F_DELAYED; 4071 mutex_exit(&ncec->ncec_lock); 4072 /* 4073 * we plan to schedule this ncec, so incr the 4074 * defend_count in anticipation. 4075 */ 4076 if (++ill->ill_defend_count >= defend_rate) 4077 break; 4078 } 4079 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) 4080 break; 4081 } 4082 } 4083 4084 /* 4085 * Check if the current rate-limiting parameters permit the sending 4086 * of another address defense announcement for both IPv4 and IPv6. 4087 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not 4088 * permitted), and B_FALSE otherwise. The `defend_rate' parameter 4089 * determines how many address defense announcements are permitted 4090 * in any `defense_perio' interval. 4091 */ 4092 static boolean_t 4093 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec) 4094 { 4095 clock_t now = ddi_get_lbolt(); 4096 ip_stack_t *ipst = ill->ill_ipst; 4097 clock_t start = ill->ill_defend_start; 4098 uint32_t elapsed, defend_period, defend_rate; 4099 nce_resched_t ncert; 4100 boolean_t ret; 4101 int i; 4102 4103 if (ill->ill_isv6) { 4104 defend_period = ipst->ips_ndp_defend_period; 4105 defend_rate = ipst->ips_ndp_defend_rate; 4106 } else { 4107 defend_period = ipst->ips_arp_defend_period; 4108 defend_rate = ipst->ips_arp_defend_rate; 4109 } 4110 if (defend_rate == 0) 4111 return (B_TRUE); 4112 bzero(&ncert, sizeof (ncert)); 4113 mutex_enter(&ill->ill_lock); 4114 if (start > 0) { 4115 elapsed = now - start; 4116 if (elapsed > SEC_TO_TICK(defend_period)) { 4117 ill->ill_defend_start = now; 4118 /* 4119 * nce_ill_reschedule will attempt to 4120 * prevent starvation by reschduling the 4121 * oldest entries, which are marked with 4122 * the NCE_F_DELAYED flag. 4123 */ 4124 nce_ill_reschedule(ill, &ncert); 4125 } 4126 } else { 4127 ill->ill_defend_start = now; 4128 } 4129 ASSERT(ill->ill_defend_count <= defend_rate); 4130 mutex_enter(&ncec->ncec_lock); 4131 if (ncec->ncec_flags & NCE_F_DELAYED) { 4132 /* 4133 * This ncec was rescheduled as one of the really old 4134 * entries needing on-going defense. The 4135 * ill_defend_count was already incremented in 4136 * nce_ill_reschedule. Go ahead and send the announce. 4137 */ 4138 ncec->ncec_flags &= ~NCE_F_DELAYED; 4139 mutex_exit(&ncec->ncec_lock); 4140 ret = B_FALSE; 4141 goto done; 4142 } 4143 mutex_exit(&ncec->ncec_lock); 4144 if (ill->ill_defend_count < defend_rate) 4145 ill->ill_defend_count++; 4146 if (ill->ill_defend_count == defend_rate) { 4147 /* 4148 * we are no longer allowed to send unbidden defense 4149 * messages. Wait for rescheduling. 4150 */ 4151 ret = B_TRUE; 4152 } else { 4153 ret = B_FALSE; 4154 } 4155 done: 4156 mutex_exit(&ill->ill_lock); 4157 /* 4158 * After all the locks have been dropped we can restart nce timer, 4159 * and refrele the delayed ncecs 4160 */ 4161 for (i = 0; i < ncert.ncert_num; i++) { 4162 clock_t xmit_interval; 4163 ncec_t *tmp; 4164 4165 tmp = ncert.ncert_nces[i]; 4166 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval, 4167 B_FALSE); 4168 nce_restart_timer(tmp, xmit_interval); 4169 ncec_refrele(tmp); 4170 } 4171 return (ret); 4172 } 4173 4174 boolean_t 4175 ndp_announce(ncec_t *ncec) 4176 { 4177 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr, 4178 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast, 4179 nce_advert_flags(ncec))); 4180 } 4181 4182 ill_t * 4183 nce_resolve_src(ncec_t *ncec, in6_addr_t *src) 4184 { 4185 mblk_t *mp; 4186 in6_addr_t src6; 4187 ipaddr_t src4; 4188 ill_t *ill = ncec->ncec_ill; 4189 ill_t *src_ill = NULL; 4190 ipif_t *ipif = NULL; 4191 boolean_t is_myaddr = NCE_MYADDR(ncec); 4192 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 4193 4194 ASSERT(src != NULL); 4195 ASSERT(IN6_IS_ADDR_UNSPECIFIED(src)); 4196 src6 = *src; 4197 if (is_myaddr) { 4198 src6 = ncec->ncec_addr; 4199 if (!isv6) 4200 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4); 4201 } else { 4202 /* 4203 * try to find one from the outgoing packet. 4204 */ 4205 mutex_enter(&ncec->ncec_lock); 4206 mp = ncec->ncec_qd_mp; 4207 if (mp != NULL) { 4208 if (isv6) { 4209 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 4210 4211 src6 = ip6h->ip6_src; 4212 } else { 4213 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4214 4215 src4 = ipha->ipha_src; 4216 IN6_IPADDR_TO_V4MAPPED(src4, &src6); 4217 } 4218 } 4219 mutex_exit(&ncec->ncec_lock); 4220 } 4221 4222 /* 4223 * For outgoing packets, if the src of outgoing packet is one 4224 * of the assigned interface addresses use it, otherwise we 4225 * will pick the source address below. 4226 * For local addresses (is_myaddr) doing DAD, NDP announce 4227 * messages are mcast. So we use the (IPMP) cast_ill or the 4228 * (non-IPMP) ncec_ill for these message types. The only case 4229 * of unicast DAD messages are for IPv6 ND probes, for which 4230 * we find the ipif_bound_ill corresponding to the ncec_addr. 4231 */ 4232 if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) { 4233 if (isv6) { 4234 ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES, 4235 ill->ill_ipst); 4236 } else { 4237 ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES, 4238 ill->ill_ipst); 4239 } 4240 4241 /* 4242 * If no relevant ipif can be found, then it's not one of our 4243 * addresses. Reset to :: and try to find a src for the NS or 4244 * ARP request using ipif_select_source_v[4,6] below. 4245 * If an ipif can be found, but it's not yet done with 4246 * DAD verification, and we are not being invoked for 4247 * DAD (i.e., !is_myaddr), then just postpone this 4248 * transmission until later. 4249 */ 4250 if (ipif == NULL) { 4251 src6 = ipv6_all_zeros; 4252 src4 = INADDR_ANY; 4253 } else if (!ipif->ipif_addr_ready && !is_myaddr) { 4254 DTRACE_PROBE2(nce__resolve__ipif__not__ready, 4255 ncec_t *, ncec, ipif_t *, ipif); 4256 ipif_refrele(ipif); 4257 return (NULL); 4258 } 4259 } 4260 4261 if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) { 4262 /* 4263 * Pick a source address for this solicitation, but 4264 * restrict the selection to addresses assigned to the 4265 * output interface. We do this because the destination will 4266 * create a neighbor cache entry for the source address of 4267 * this packet, so the source address had better be a valid 4268 * neighbor. 4269 */ 4270 if (isv6) { 4271 ipif = ipif_select_source_v6(ill, &ncec->ncec_addr, 4272 B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4273 B_FALSE, NULL); 4274 } else { 4275 ipaddr_t nce_addr; 4276 4277 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr); 4278 ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES, 4279 B_FALSE, NULL); 4280 } 4281 if (ipif == NULL && IS_IPMP(ill)) { 4282 ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE); 4283 4284 if (send_ill != NULL) { 4285 if (isv6) { 4286 ipif = ipif_select_source_v6(send_ill, 4287 &ncec->ncec_addr, B_TRUE, 4288 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4289 B_FALSE, NULL); 4290 } else { 4291 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 4292 src4); 4293 ipif = ipif_select_source_v4(send_ill, 4294 src4, ALL_ZONES, B_TRUE, NULL); 4295 } 4296 ill_refrele(send_ill); 4297 } 4298 } 4299 4300 if (ipif == NULL) { 4301 char buf[INET6_ADDRSTRLEN]; 4302 4303 ip1dbg(("nce_resolve_src: No source ipif for dst %s\n", 4304 inet_ntop((isv6 ? AF_INET6 : AF_INET), 4305 (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 4306 DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec); 4307 return (NULL); 4308 } 4309 src6 = ipif->ipif_v6lcl_addr; 4310 } 4311 *src = src6; 4312 if (ipif != NULL) { 4313 src_ill = ipif->ipif_ill; 4314 if (IS_IPMP(src_ill)) 4315 src_ill = ipmp_ipif_hold_bound_ill(ipif); 4316 else 4317 ill_refhold(src_ill); 4318 ipif_refrele(ipif); 4319 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec, 4320 ill_t *, src_ill); 4321 } 4322 return (src_ill); 4323 } 4324 4325 void 4326 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst, 4327 uchar_t *hwaddr, int hwaddr_len, int flags) 4328 { 4329 ill_t *ill; 4330 ncec_t *ncec; 4331 nce_t *nce; 4332 uint16_t new_state; 4333 4334 ill = (ipif ? ipif->ipif_ill : NULL); 4335 if (ill != NULL) { 4336 /* 4337 * only one ncec is possible 4338 */ 4339 nce = nce_lookup_v4(ill, addr); 4340 if (nce != NULL) { 4341 ncec = nce->nce_common; 4342 mutex_enter(&ncec->ncec_lock); 4343 if (NCE_ISREACHABLE(ncec)) 4344 new_state = ND_UNCHANGED; 4345 else 4346 new_state = ND_STALE; 4347 ncec->ncec_flags = flags; 4348 nce_update(ncec, new_state, hwaddr); 4349 mutex_exit(&ncec->ncec_lock); 4350 nce_refrele(nce); 4351 return; 4352 } 4353 } else { 4354 /* 4355 * ill is wildcard; clean up all ncec's and ire's 4356 * that match on addr. 4357 */ 4358 nce_hw_map_t hwm; 4359 4360 hwm.hwm_addr = *addr; 4361 hwm.hwm_hwlen = hwaddr_len; 4362 hwm.hwm_hwaddr = hwaddr; 4363 hwm.hwm_flags = flags; 4364 4365 ncec_walk_common(ipst->ips_ndp4, NULL, 4366 (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE); 4367 } 4368 } 4369 4370 /* 4371 * Common function to add ncec entries. 4372 * we always add the ncec with ncec_ill == ill, and always create 4373 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the 4374 * ncec is !reachable. 4375 * 4376 * When the caller passes in an nce_state of ND_UNCHANGED, 4377 * nce_add_common() will determine the state of the created nce based 4378 * on the ill_net_type and nce_flags used. Otherwise, the nce will 4379 * be created with state set to the passed in nce_state. 4380 */ 4381 static int 4382 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 4383 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) 4384 { 4385 static ncec_t nce_nil; 4386 uchar_t *template = NULL; 4387 int err; 4388 ncec_t *ncec; 4389 ncec_t **ncep; 4390 ip_stack_t *ipst = ill->ill_ipst; 4391 uint16_t state; 4392 boolean_t fastprobe = B_FALSE; 4393 struct ndp_g_s *ndp; 4394 nce_t *nce = NULL; 4395 mblk_t *dlur_mp = NULL; 4396 4397 if (ill->ill_isv6) 4398 ndp = ill->ill_ipst->ips_ndp6; 4399 else 4400 ndp = ill->ill_ipst->ips_ndp4; 4401 4402 *retnce = NULL; 4403 4404 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 4405 4406 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 4407 ip0dbg(("nce_add_common: no addr\n")); 4408 return (EINVAL); 4409 } 4410 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 4411 ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); 4412 return (EINVAL); 4413 } 4414 4415 if (ill->ill_isv6) { 4416 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 4417 } else { 4418 ipaddr_t v4addr; 4419 4420 IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 4421 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); 4422 } 4423 4424 /* 4425 * The caller has ensured that there is no nce on ill, but there could 4426 * still be an nce_common_t for the address, so that we find exisiting 4427 * ncec_t strucutures first, and atomically add a new nce_t if 4428 * one is found. The ndp_g_lock ensures that we don't cross threads 4429 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not 4430 * compare for matches across the illgrp because this function is 4431 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, 4432 * with the nce_lookup_then_add_v* passing in the ipmp_ill where 4433 * appropriate. 4434 */ 4435 ncec = *ncep; 4436 for (; ncec != NULL; ncec = ncec->ncec_next) { 4437 if (ncec->ncec_ill == ill) { 4438 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 4439 /* 4440 * We should never find *retnce to be 4441 * MYADDR, since the caller may then 4442 * incorrectly restart a DAD timer that's 4443 * already running. However, if we are in 4444 * forwarding mode, and the interface is 4445 * moving in/out of groups, the data 4446 * path ire lookup (e.g., ire_revalidate_nce) 4447 * may have determined that some destination 4448 * is offlink while the control path is adding 4449 * that address as a local address. 4450 * Recover from this case by failing the 4451 * lookup 4452 */ 4453 if (NCE_MYADDR(ncec)) 4454 return (ENXIO); 4455 *retnce = nce_ill_lookup_then_add(ill, ncec); 4456 if (*retnce != NULL) 4457 break; 4458 } 4459 } 4460 } 4461 if (*retnce != NULL) /* caller must trigger fastpath on nce */ 4462 return (0); 4463 4464 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); 4465 if (ncec == NULL) 4466 return (ENOMEM); 4467 *ncec = nce_nil; 4468 ncec->ncec_ill = ill; 4469 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 4470 ncec->ncec_flags = flags; 4471 ncec->ncec_ipst = ipst; /* No netstack_hold */ 4472 4473 if (!ill->ill_isv6) { 4474 ipaddr_t addr4; 4475 4476 /* 4477 * DAD probe interval and probe count are set based on 4478 * fast/slow probe settings. If the underlying link doesn't 4479 * have reliably up/down notifications or if we're working 4480 * with IPv4 169.254.0.0/16 Link Local Address space, then 4481 * don't use the fast timers. Otherwise, use them. 4482 */ 4483 ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 4484 IN6_V4MAPPED_TO_IPADDR(addr, addr4); 4485 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) { 4486 fastprobe = B_TRUE; 4487 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) && 4488 !IS_IPV4_LL_SPACE(&addr4)) { 4489 ill_t *hwaddr_ill; 4490 4491 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr, 4492 hw_addr_len); 4493 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link) 4494 fastprobe = B_TRUE; 4495 } 4496 if (fastprobe) { 4497 ncec->ncec_xmit_interval = 4498 ipst->ips_arp_fastprobe_interval; 4499 ncec->ncec_pcnt = 4500 ipst->ips_arp_fastprobe_count; 4501 ncec->ncec_flags |= NCE_F_FAST; 4502 } else { 4503 ncec->ncec_xmit_interval = 4504 ipst->ips_arp_probe_interval; 4505 ncec->ncec_pcnt = 4506 ipst->ips_arp_probe_count; 4507 } 4508 if (NCE_PUBLISH(ncec)) { 4509 ncec->ncec_unsolicit_count = 4510 ipst->ips_ip_arp_publish_count; 4511 } 4512 } else { 4513 /* 4514 * probe interval is constant: ILL_PROBE_INTERVAL 4515 * probe count is constant: ND_MAX_UNICAST_SOLICIT 4516 */ 4517 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 4518 if (NCE_PUBLISH(ncec)) { 4519 ncec->ncec_unsolicit_count = 4520 ipst->ips_ip_ndp_unsolicit_count; 4521 } 4522 } 4523 ncec->ncec_rcnt = ill->ill_xmit_count; 4524 ncec->ncec_addr = *addr; 4525 ncec->ncec_qd_mp = NULL; 4526 ncec->ncec_refcnt = 1; /* for ncec getting created */ 4527 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); 4528 ncec->ncec_trace_disable = B_FALSE; 4529 4530 /* 4531 * ncec_lladdr holds link layer address 4532 */ 4533 if (hw_addr_len > 0) { 4534 template = kmem_alloc(hw_addr_len, KM_NOSLEEP); 4535 if (template == NULL) { 4536 err = ENOMEM; 4537 goto err_ret; 4538 } 4539 ncec->ncec_lladdr = template; 4540 ncec->ncec_lladdr_length = hw_addr_len; 4541 bzero(ncec->ncec_lladdr, hw_addr_len); 4542 } 4543 if ((flags & NCE_F_BCAST) != 0) { 4544 state = ND_REACHABLE; 4545 ASSERT(hw_addr_len > 0); 4546 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 4547 state = ND_INITIAL; 4548 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 4549 /* 4550 * NORESOLVER entries are always created in the REACHABLE 4551 * state. 4552 */ 4553 state = ND_REACHABLE; 4554 if (ill->ill_phys_addr_length == IP_ADDR_LEN && 4555 ill->ill_mactype != DL_IPV4 && 4556 ill->ill_mactype != DL_6TO4) { 4557 /* 4558 * We create a nce_res_mp with the IP nexthop address 4559 * as the destination address if the physical length 4560 * is exactly 4 bytes for point-to-multipoint links 4561 * that do their own resolution from IP to link-layer 4562 * address (e.g. IP over X.25). 4563 */ 4564 bcopy((uchar_t *)addr, 4565 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4566 } 4567 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && 4568 ill->ill_mactype != DL_IPV6) { 4569 /* 4570 * We create a nce_res_mp with the IP nexthop address 4571 * as the destination address if the physical legnth 4572 * is exactly 16 bytes for point-to-multipoint links 4573 * that do their own resolution from IP to link-layer 4574 * address. 4575 */ 4576 bcopy((uchar_t *)addr, 4577 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4578 } 4579 /* 4580 * Since NUD is not part of the base IPv4 protocol definition, 4581 * IPv4 neighbor entries on NORESOLVER interfaces will never 4582 * age, and are marked NCE_F_NONUD. 4583 */ 4584 if (!ill->ill_isv6) 4585 ncec->ncec_flags |= NCE_F_NONUD; 4586 } else if (ill->ill_net_type == IRE_LOOPBACK) { 4587 state = ND_REACHABLE; 4588 } 4589 4590 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { 4591 /* 4592 * We are adding an ncec with a deterministic hw_addr, 4593 * so the state can only be one of {REACHABLE, STALE, PROBE}. 4594 * 4595 * if we are adding a unicast ncec for the local address 4596 * it would be REACHABLE; we would be adding a ND_STALE entry 4597 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own 4598 * addresses are added in PROBE to trigger DAD. 4599 */ 4600 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || 4601 ill->ill_net_type == IRE_IF_NORESOLVER) 4602 state = ND_REACHABLE; 4603 else if (!NCE_PUBLISH(ncec)) 4604 state = ND_STALE; 4605 else 4606 state = ND_PROBE; 4607 if (hw_addr != NULL) 4608 nce_set_ll(ncec, hw_addr); 4609 } 4610 /* caller overrides internally computed state */ 4611 if (nce_state != ND_UNCHANGED) 4612 state = nce_state; 4613 4614 if (state == ND_PROBE) 4615 ncec->ncec_flags |= NCE_F_UNVERIFIED; 4616 4617 ncec->ncec_state = state; 4618 4619 if (state == ND_REACHABLE) { 4620 ncec->ncec_last = ncec->ncec_init_time = 4621 TICK_TO_MSEC(ddi_get_lbolt64()); 4622 } else { 4623 ncec->ncec_last = 0; 4624 if (state == ND_INITIAL) 4625 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64()); 4626 } 4627 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), 4628 offsetof(ncec_cb_t, ncec_cb_node)); 4629 /* 4630 * have all the memory allocations out of the way before taking locks 4631 * and adding the nce. 4632 */ 4633 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4634 if (nce == NULL) { 4635 err = ENOMEM; 4636 goto err_ret; 4637 } 4638 if (ncec->ncec_lladdr != NULL || 4639 ill->ill_net_type == IRE_IF_NORESOLVER) { 4640 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4641 ill->ill_phys_addr_length, ill->ill_sap, 4642 ill->ill_sap_length); 4643 if (dlur_mp == NULL) { 4644 err = ENOMEM; 4645 goto err_ret; 4646 } 4647 } 4648 4649 /* 4650 * Atomically ensure that the ill is not CONDEMNED, before 4651 * adding the NCE. 4652 */ 4653 mutex_enter(&ill->ill_lock); 4654 if (ill->ill_state_flags & ILL_CONDEMNED) { 4655 mutex_exit(&ill->ill_lock); 4656 err = EINVAL; 4657 goto err_ret; 4658 } 4659 if (!NCE_MYADDR(ncec) && 4660 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { 4661 mutex_exit(&ill->ill_lock); 4662 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); 4663 err = EINVAL; 4664 goto err_ret; 4665 } 4666 /* 4667 * Acquire the ncec_lock even before adding the ncec to the list 4668 * so that it cannot get deleted after the ncec is added, but 4669 * before we add the nce. 4670 */ 4671 mutex_enter(&ncec->ncec_lock); 4672 if ((ncec->ncec_next = *ncep) != NULL) 4673 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; 4674 *ncep = ncec; 4675 ncec->ncec_ptpn = ncep; 4676 4677 /* Bump up the number of ncec's referencing this ill */ 4678 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4679 (char *), "ncec", (void *), ncec); 4680 ill->ill_ncec_cnt++; 4681 /* 4682 * Since we hold the ncec_lock at this time, the ncec cannot be 4683 * condemned, and we can safely add the nce. 4684 */ 4685 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); 4686 mutex_exit(&ncec->ncec_lock); 4687 mutex_exit(&ill->ill_lock); 4688 4689 /* caller must trigger fastpath on *retnce */ 4690 return (0); 4691 4692 err_ret: 4693 if (ncec != NULL) 4694 kmem_cache_free(ncec_cache, ncec); 4695 if (nce != NULL) 4696 kmem_cache_free(nce_cache, nce); 4697 freemsg(dlur_mp); 4698 if (template != NULL) 4699 kmem_free(template, ill->ill_phys_addr_length); 4700 return (err); 4701 } 4702 4703 /* 4704 * take a ref on the nce 4705 */ 4706 void 4707 nce_refhold(nce_t *nce) 4708 { 4709 mutex_enter(&nce->nce_lock); 4710 nce->nce_refcnt++; 4711 ASSERT((nce)->nce_refcnt != 0); 4712 mutex_exit(&nce->nce_lock); 4713 } 4714 4715 /* 4716 * release a ref on the nce; In general, this 4717 * cannot be called with locks held because nce_inactive 4718 * may result in nce_inactive which will take the ill_lock, 4719 * do ipif_ill_refrele_tail etc. Thus the one exception 4720 * where this can be called with locks held is when the caller 4721 * is certain that the nce_refcnt is sufficient to prevent 4722 * the invocation of nce_inactive. 4723 */ 4724 void 4725 nce_refrele(nce_t *nce) 4726 { 4727 ASSERT((nce)->nce_refcnt != 0); 4728 mutex_enter(&nce->nce_lock); 4729 if (--nce->nce_refcnt == 0) 4730 nce_inactive(nce); /* destroys the mutex */ 4731 else 4732 mutex_exit(&nce->nce_lock); 4733 } 4734 4735 /* 4736 * free the nce after all refs have gone away. 4737 */ 4738 static void 4739 nce_inactive(nce_t *nce) 4740 { 4741 ill_t *ill = nce->nce_ill; 4742 4743 ASSERT(nce->nce_refcnt == 0); 4744 4745 ncec_refrele_notr(nce->nce_common); 4746 nce->nce_common = NULL; 4747 freemsg(nce->nce_fp_mp); 4748 freemsg(nce->nce_dlur_mp); 4749 4750 mutex_enter(&ill->ill_lock); 4751 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 4752 (char *), "nce", (void *), nce); 4753 ill->ill_nce_cnt--; 4754 nce->nce_ill = NULL; 4755 /* 4756 * If the number of ncec's associated with this ill have dropped 4757 * to zero, check whether we need to restart any operation that 4758 * is waiting for this to happen. 4759 */ 4760 if (ILL_DOWN_OK(ill)) { 4761 /* ipif_ill_refrele_tail drops the ill_lock */ 4762 ipif_ill_refrele_tail(ill); 4763 } else { 4764 mutex_exit(&ill->ill_lock); 4765 } 4766 4767 mutex_destroy(&nce->nce_lock); 4768 kmem_cache_free(nce_cache, nce); 4769 } 4770 4771 /* 4772 * Add an nce to the ill_nce list. 4773 */ 4774 static nce_t * 4775 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp) 4776 { 4777 bzero(nce, sizeof (*nce)); 4778 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); 4779 nce->nce_common = ncec; 4780 nce->nce_addr = ncec->ncec_addr; 4781 nce->nce_ill = ill; 4782 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4783 (char *), "nce", (void *), nce); 4784 ill->ill_nce_cnt++; 4785 4786 nce->nce_refcnt = 1; /* for the thread */ 4787 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */ 4788 nce->nce_dlur_mp = dlur_mp; 4789 4790 /* add nce to the ill's fastpath list. */ 4791 nce->nce_refcnt++; /* for the list */ 4792 list_insert_head(&ill->ill_nce, nce); 4793 return (nce); 4794 } 4795 4796 static nce_t * 4797 nce_add(ill_t *ill, ncec_t *ncec) 4798 { 4799 nce_t *nce; 4800 mblk_t *dlur_mp = NULL; 4801 4802 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4803 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 4804 4805 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4806 if (nce == NULL) 4807 return (NULL); 4808 if (ncec->ncec_lladdr != NULL || 4809 ill->ill_net_type == IRE_IF_NORESOLVER) { 4810 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4811 ill->ill_phys_addr_length, ill->ill_sap, 4812 ill->ill_sap_length); 4813 if (dlur_mp == NULL) { 4814 kmem_cache_free(nce_cache, nce); 4815 return (NULL); 4816 } 4817 } 4818 return (nce_add_impl(ill, ncec, nce, dlur_mp)); 4819 } 4820 4821 /* 4822 * remove the nce from the ill_faspath list 4823 */ 4824 void 4825 nce_delete(nce_t *nce) 4826 { 4827 ill_t *ill = nce->nce_ill; 4828 4829 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4830 4831 mutex_enter(&nce->nce_lock); 4832 if (nce->nce_is_condemned) { 4833 /* 4834 * some other thread has removed this nce from the ill_nce list 4835 */ 4836 mutex_exit(&nce->nce_lock); 4837 return; 4838 } 4839 nce->nce_is_condemned = B_TRUE; 4840 mutex_exit(&nce->nce_lock); 4841 4842 list_remove(&ill->ill_nce, nce); 4843 /* 4844 * even though we are holding the ill_lock, it is ok to 4845 * call nce_refrele here because we know that we should have 4846 * at least 2 refs on the nce: one for the thread, and one 4847 * for the list. The refrele below will release the one for 4848 * the list. 4849 */ 4850 nce_refrele(nce); 4851 } 4852 4853 nce_t * 4854 nce_lookup(ill_t *ill, const in6_addr_t *addr) 4855 { 4856 nce_t *nce = NULL; 4857 4858 ASSERT(ill != NULL); 4859 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4860 4861 for (nce = list_head(&ill->ill_nce); nce != NULL; 4862 nce = list_next(&ill->ill_nce, nce)) { 4863 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr)) 4864 break; 4865 } 4866 4867 /* 4868 * if we found the nce on the ill_nce list while holding 4869 * the ill_lock, then it cannot be condemned yet. 4870 */ 4871 if (nce != NULL) { 4872 ASSERT(!nce->nce_is_condemned); 4873 nce_refhold(nce); 4874 } 4875 return (nce); 4876 } 4877 4878 /* 4879 * Walk the ill_nce list on ill. The callback function func() cannot perform 4880 * any destructive actions. 4881 */ 4882 static void 4883 nce_walk_common(ill_t *ill, pfi_t func, void *arg) 4884 { 4885 nce_t *nce = NULL, *nce_next; 4886 4887 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4888 for (nce = list_head(&ill->ill_nce); nce != NULL; ) { 4889 nce_next = list_next(&ill->ill_nce, nce); 4890 if (func(ill, nce, arg) != 0) 4891 break; 4892 nce = nce_next; 4893 } 4894 } 4895 4896 void 4897 nce_walk(ill_t *ill, pfi_t func, void *arg) 4898 { 4899 mutex_enter(&ill->ill_lock); 4900 nce_walk_common(ill, func, arg); 4901 mutex_exit(&ill->ill_lock); 4902 } 4903 4904 void 4905 nce_flush(ill_t *ill, boolean_t flushall) 4906 { 4907 nce_t *nce, *nce_next; 4908 list_t dead; 4909 4910 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); 4911 mutex_enter(&ill->ill_lock); 4912 for (nce = list_head(&ill->ill_nce); nce != NULL; ) { 4913 nce_next = list_next(&ill->ill_nce, nce); 4914 if (!flushall && NCE_PUBLISH(nce->nce_common)) { 4915 nce = nce_next; 4916 continue; 4917 } 4918 /* 4919 * nce_delete requires that the caller should either not 4920 * be holding locks, or should hold a ref to ensure that 4921 * we wont hit ncec_inactive. So take a ref and clean up 4922 * after the list is flushed. 4923 */ 4924 nce_refhold(nce); 4925 nce_delete(nce); 4926 list_insert_tail(&dead, nce); 4927 nce = nce_next; 4928 } 4929 mutex_exit(&ill->ill_lock); 4930 while ((nce = list_head(&dead)) != NULL) { 4931 list_remove(&dead, nce); 4932 nce_refrele(nce); 4933 } 4934 ASSERT(list_is_empty(&dead)); 4935 list_destroy(&dead); 4936 } 4937 4938 /* Return an interval that is anywhere in the [1 .. intv] range */ 4939 static clock_t 4940 nce_fuzz_interval(clock_t intv, boolean_t initial_time) 4941 { 4942 clock_t rnd, frac; 4943 4944 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); 4945 /* Note that clock_t is signed; must chop off bits */ 4946 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; 4947 if (initial_time) { 4948 if (intv <= 0) 4949 intv = 1; 4950 else 4951 intv = (rnd % intv) + 1; 4952 } else { 4953 /* Compute 'frac' as 20% of the configured interval */ 4954 if ((frac = intv / 5) <= 1) 4955 frac = 2; 4956 /* Set intv randomly in the range [intv-frac .. intv+frac] */ 4957 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) 4958 intv = 1; 4959 } 4960 return (intv); 4961 } 4962 4963 void 4964 nce_resolv_ipmp_ok(ncec_t *ncec) 4965 { 4966 mblk_t *mp; 4967 uint_t pkt_len; 4968 iaflags_t ixaflags = IXAF_NO_TRACE; 4969 nce_t *under_nce; 4970 ill_t *ill = ncec->ncec_ill; 4971 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 4972 ipif_t *src_ipif = NULL; 4973 ip_stack_t *ipst = ill->ill_ipst; 4974 ill_t *send_ill; 4975 uint_t nprobes; 4976 4977 ASSERT(IS_IPMP(ill)); 4978 4979 mutex_enter(&ncec->ncec_lock); 4980 nprobes = ncec->ncec_nprobes; 4981 mp = ncec->ncec_qd_mp; 4982 ncec->ncec_qd_mp = NULL; 4983 ncec->ncec_nprobes = 0; 4984 mutex_exit(&ncec->ncec_lock); 4985 4986 while (mp != NULL) { 4987 mblk_t *nxt_mp; 4988 4989 nxt_mp = mp->b_next; 4990 mp->b_next = NULL; 4991 if (isv6) { 4992 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 4993 4994 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 4995 src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src, 4996 ill, ALL_ZONES, ipst); 4997 } else { 4998 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4999 5000 ixaflags |= IXAF_IS_IPV4; 5001 pkt_len = ntohs(ipha->ipha_length); 5002 src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src, 5003 ill, ALL_ZONES, ipst); 5004 } 5005 5006 /* 5007 * find a new nce based on an under_ill. The first IPMP probe 5008 * packet gets queued, so we could still find a src_ipif that 5009 * matches an IPMP test address. 5010 */ 5011 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) { 5012 /* 5013 * if src_ipif is null, this could be either a 5014 * forwarded packet or a probe whose src got deleted. 5015 * We identify the former case by looking for the 5016 * ncec_nprobes: the first ncec_nprobes packets are 5017 * probes; 5018 */ 5019 if (src_ipif == NULL && nprobes > 0) 5020 goto drop_pkt; 5021 5022 /* 5023 * For forwarded packets, we use the ipmp rotor 5024 * to find send_ill. 5025 */ 5026 send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill, 5027 B_TRUE); 5028 } else { 5029 send_ill = src_ipif->ipif_ill; 5030 ill_refhold(send_ill); 5031 } 5032 5033 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp, 5034 (ncec_t *), ncec, (ipif_t *), 5035 src_ipif, (ill_t *), send_ill); 5036 5037 if (send_ill == NULL) { 5038 if (src_ipif != NULL) 5039 ipif_refrele(src_ipif); 5040 goto drop_pkt; 5041 } 5042 /* create an under_nce on send_ill */ 5043 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5044 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill)) 5045 under_nce = nce_fastpath_create(send_ill, ncec); 5046 else 5047 under_nce = NULL; 5048 rw_exit(&ipst->ips_ill_g_lock); 5049 if (under_nce != NULL && NCE_ISREACHABLE(ncec)) 5050 nce_fastpath_trigger(under_nce); 5051 5052 ill_refrele(send_ill); 5053 if (src_ipif != NULL) 5054 ipif_refrele(src_ipif); 5055 5056 if (under_nce != NULL) { 5057 (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0, 5058 ALL_ZONES, 0, NULL); 5059 nce_refrele(under_nce); 5060 if (nprobes > 0) 5061 nprobes--; 5062 mp = nxt_mp; 5063 continue; 5064 } 5065 drop_pkt: 5066 if (isv6) { 5067 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 5068 } else { 5069 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 5070 } 5071 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL); 5072 freemsg(mp); 5073 if (nprobes > 0) 5074 nprobes--; 5075 mp = nxt_mp; 5076 } 5077 ncec_cb_dispatch(ncec); /* complete callbacks */ 5078 } 5079