1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/types.h> 26 #include <sys/stream.h> 27 #include <sys/stropts.h> 28 #include <sys/strsun.h> 29 #include <sys/sysmacros.h> 30 #include <sys/errno.h> 31 #include <sys/dlpi.h> 32 #include <sys/socket.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/cmn_err.h> 36 #include <sys/debug.h> 37 #include <sys/vtrace.h> 38 #include <sys/kmem.h> 39 #include <sys/zone.h> 40 #include <sys/ethernet.h> 41 #include <sys/sdt.h> 42 #include <sys/mac.h> 43 44 #include <net/if.h> 45 #include <net/if_types.h> 46 #include <net/if_dl.h> 47 #include <net/route.h> 48 #include <netinet/in.h> 49 #include <netinet/ip6.h> 50 #include <netinet/icmp6.h> 51 52 #include <inet/common.h> 53 #include <inet/mi.h> 54 #include <inet/mib2.h> 55 #include <inet/nd.h> 56 #include <inet/ip.h> 57 #include <inet/ip_impl.h> 58 #include <inet/ipclassifier.h> 59 #include <inet/ip_if.h> 60 #include <inet/ip_ire.h> 61 #include <inet/ip_rts.h> 62 #include <inet/ip6.h> 63 #include <inet/ip_ndp.h> 64 #include <inet/sctp_ip.h> 65 #include 
<inet/ip_arp.h> 66 #include <inet/ip2mac_impl.h> 67 68 #define ANNOUNCE_INTERVAL(isv6) \ 69 (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \ 70 ipst->ips_ip_arp_publish_interval) 71 72 #define DEFENSE_INTERVAL(isv6) \ 73 (isv6 ? ipst->ips_ndp_defend_interval : \ 74 ipst->ips_arp_defend_interval) 75 76 /* Non-tunable probe interval, based on link capabilities */ 77 #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) 78 79 /* 80 * The IPv4 Link Local address space is special; we do extra duplicate checking 81 * there, as the entire assignment mechanism rests on random numbers. 82 */ 83 #define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ 84 ((uchar_t *)ptr)[1] == 254) 85 86 /* 87 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed 88 * in to the ncec*add* functions. 89 * 90 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that 91 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means 92 * that we will respond to requests for the protocol address. 93 */ 94 #define NCE_EXTERNAL_FLAGS_MASK \ 95 (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \ 96 NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \ 97 NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC) 98 99 /* 100 * Lock ordering: 101 * 102 * ndp_g_lock -> ill_lock -> ncec_lock 103 * 104 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and 105 * ncec_next. ncec_lock protects the contents of the NCE (particularly 106 * ncec_refcnt). 
107 */ 108 109 static void nce_cleanup_list(ncec_t *ncec); 110 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr); 111 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *, 112 ncec_t *); 113 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *); 114 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr, 115 uint16_t ncec_flags, nce_t **newnce); 116 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 117 uint16_t ncec_flags, nce_t **newnce); 118 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation, 119 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender, 120 const in6_addr_t *target, int flag); 121 static void ncec_refhold_locked(ncec_t *); 122 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); 123 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); 124 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 125 uint16_t, uint16_t, nce_t **); 126 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *); 127 static nce_t *nce_add(ill_t *, ncec_t *); 128 static void nce_inactive(nce_t *); 129 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); 130 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); 131 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 132 uint16_t, uint16_t, nce_t **); 133 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *, 134 uint16_t, uint16_t, nce_t **); 135 static int nce_add_v6_postprocess(nce_t *); 136 static int nce_add_v4_postprocess(nce_t *); 137 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *); 138 static clock_t nce_fuzz_interval(clock_t, boolean_t); 139 static void nce_resolv_ipmp_ok(ncec_t *); 140 static void nce_walk_common(ill_t *, pfi_t, void *); 141 static void nce_start_timer(ncec_t *, uint_t); 142 static nce_t *nce_fastpath_create(ill_t *, ncec_t *); 143 static void nce_fastpath_trigger(nce_t *); 144 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *); 145 146 #ifdef DEBUG 
147 static void ncec_trace_cleanup(const ncec_t *); 148 #endif 149 150 #define NCE_HASH_PTR_V4(ipst, addr) \ 151 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])) 152 153 #define NCE_HASH_PTR_V6(ipst, addr) \ 154 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ 155 NCE_TABLE_SIZE)])) 156 157 extern kmem_cache_t *ncec_cache; 158 extern kmem_cache_t *nce_cache; 159 160 /* 161 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe 162 * If src_ill is not null, the ncec_addr is bound to src_ill. The 163 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where 164 * the probe is sent on the ncec_ill (in the non-IPMP case) or the 165 * IPMP cast_ill (in the IPMP case). 166 * 167 * Note that the probe interval is based on the src_ill for IPv6, and 168 * the ncec_xmit_interval for IPv4. 169 */ 170 static void 171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) 172 { 173 boolean_t dropped; 174 uint32_t probe_interval; 175 176 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST)); 177 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST)); 178 if (ncec->ncec_ipversion == IPV6_VERSION) { 179 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 180 ncec->ncec_lladdr, ncec->ncec_lladdr_length, 181 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); 182 probe_interval = ILL_PROBE_INTERVAL(src_ill); 183 } else { 184 /* IPv4 DAD delay the initial probe. */ 185 if (send_probe) 186 dropped = arp_probe(ncec); 187 else 188 dropped = B_TRUE; 189 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval, 190 !send_probe); 191 } 192 if (!dropped) { 193 mutex_enter(&ncec->ncec_lock); 194 ncec->ncec_pcnt--; 195 mutex_exit(&ncec->ncec_lock); 196 } 197 nce_restart_timer(ncec, probe_interval); 198 } 199 200 /* 201 * Compute default flags to use for an advertisement of this ncec's address. 
202 */ 203 static int 204 nce_advert_flags(const ncec_t *ncec) 205 { 206 int flag = 0; 207 208 if (ncec->ncec_flags & NCE_F_ISROUTER) 209 flag |= NDP_ISROUTER; 210 if (!(ncec->ncec_flags & NCE_F_ANYCAST)) 211 flag |= NDP_ORIDE; 212 213 return (flag); 214 } 215 216 /* 217 * NDP Cache Entry creation routine. 218 * This routine must always be called with ndp6->ndp_g_lock held. 219 */ 220 int 221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 222 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 223 { 224 int err; 225 nce_t *nce; 226 227 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 228 ASSERT(ill != NULL && ill->ill_isv6); 229 230 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state, 231 &nce); 232 if (err != 0) 233 return (err); 234 ASSERT(newnce != NULL); 235 *newnce = nce; 236 return (err); 237 } 238 239 /* 240 * Post-processing routine to be executed after nce_add_v6(). This function 241 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 242 * and must be called without any locks held. 243 */ 244 int 245 nce_add_v6_postprocess(nce_t *nce) 246 { 247 ncec_t *ncec = nce->nce_common; 248 boolean_t dropped = B_FALSE; 249 uchar_t *hw_addr = ncec->ncec_lladdr; 250 uint_t hw_addr_len = ncec->ncec_lladdr_length; 251 ill_t *ill = ncec->ncec_ill; 252 int err = 0; 253 uint16_t flags = ncec->ncec_flags; 254 ip_stack_t *ipst = ill->ill_ipst; 255 boolean_t trigger_fastpath = B_TRUE; 256 257 /* 258 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 259 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
260 * We call nce_fastpath from nce_update if the link layer address of 261 * the peer changes from nce_update 262 */ 263 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || 264 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER)) 265 trigger_fastpath = B_FALSE; 266 267 if (trigger_fastpath) 268 nce_fastpath_trigger(nce); 269 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 270 ill_t *hwaddr_ill; 271 /* 272 * Unicast entry that needs DAD. 273 */ 274 if (IS_IPMP(ill)) { 275 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 276 hw_addr, hw_addr_len); 277 } else { 278 hwaddr_ill = ill; 279 } 280 nce_dad(ncec, hwaddr_ill, B_TRUE); 281 err = EINPROGRESS; 282 } else if (flags & NCE_F_UNSOL_ADV) { 283 /* 284 * We account for the transmit below by assigning one 285 * less than the ndd variable. Subsequent decrements 286 * are done in nce_timer. 287 */ 288 mutex_enter(&ncec->ncec_lock); 289 ncec->ncec_unsolicit_count = 290 ipst->ips_ip_ndp_unsolicit_count - 1; 291 mutex_exit(&ncec->ncec_lock); 292 dropped = ndp_xmit(ill, 293 ND_NEIGHBOR_ADVERT, 294 hw_addr, 295 hw_addr_len, 296 &ncec->ncec_addr, /* Source and target of the adv */ 297 &ipv6_all_hosts_mcast, /* Destination of the packet */ 298 nce_advert_flags(ncec)); 299 mutex_enter(&ncec->ncec_lock); 300 if (dropped) 301 ncec->ncec_unsolicit_count++; 302 else 303 ncec->ncec_last_time_defended = ddi_get_lbolt(); 304 if (ncec->ncec_unsolicit_count != 0) { 305 nce_start_timer(ncec, 306 ipst->ips_ip_ndp_unsolicit_interval); 307 } 308 mutex_exit(&ncec->ncec_lock); 309 } 310 return (err); 311 } 312 313 /* 314 * Atomically lookup and add (if needed) Neighbor Cache information for 315 * an address. 316 * 317 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 318 * are always added pointing at the ipmp_ill. Thus, when the ill passed 319 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 320 * entries will be created, both pointing at the same ncec_t. 
The nce_t 321 * entries will have their nce_ill set to the ipmp_ill and the under_ill 322 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 323 * Local addresses are always created on the ill passed to nce_add_v6. 324 */ 325 int 326 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 327 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 328 { 329 int err = 0; 330 ip_stack_t *ipst = ill->ill_ipst; 331 nce_t *nce, *upper_nce = NULL; 332 ill_t *in_ill = ill; 333 boolean_t need_ill_refrele = B_FALSE; 334 335 if (flags & NCE_F_MCAST) { 336 /* 337 * hw_addr will be figured out in nce_set_multicast_v6; 338 * caller has to select the cast_ill 339 */ 340 ASSERT(hw_addr == NULL); 341 ASSERT(!IS_IPMP(ill)); 342 err = nce_set_multicast_v6(ill, addr, flags, newnce); 343 return (err); 344 } 345 ASSERT(ill->ill_isv6); 346 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 347 ill = ipmp_ill_hold_ipmp_ill(ill); 348 if (ill == NULL) 349 return (ENXIO); 350 need_ill_refrele = B_TRUE; 351 } 352 353 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 354 nce = nce_lookup_addr(ill, addr); 355 if (nce == NULL) { 356 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, 357 &nce); 358 } else { 359 err = EEXIST; 360 } 361 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 362 if (err == 0) 363 err = nce_add_v6_postprocess(nce); 364 if (in_ill != ill && nce != NULL) { 365 nce_t *under_nce = NULL; 366 367 /* 368 * in_ill was the under_ill. Try to create the under_nce. 369 * Hold the ill_g_lock to prevent changes to group membership 370 * until we are done. 
371 */ 372 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 373 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 374 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 375 ill_t *, ill); 376 rw_exit(&ipst->ips_ill_g_lock); 377 err = ENXIO; 378 nce_refrele(nce); 379 nce = NULL; 380 goto bail; 381 } 382 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 383 if (under_nce == NULL) { 384 rw_exit(&ipst->ips_ill_g_lock); 385 err = EINVAL; 386 nce_refrele(nce); 387 nce = NULL; 388 goto bail; 389 } 390 rw_exit(&ipst->ips_ill_g_lock); 391 upper_nce = nce; 392 nce = under_nce; /* will be returned to caller */ 393 if (NCE_ISREACHABLE(nce->nce_common)) 394 nce_fastpath_trigger(under_nce); 395 } 396 /* nce_refrele is deferred until the lock is dropped */ 397 if (nce != NULL) { 398 if (newnce != NULL) 399 *newnce = nce; 400 else 401 nce_refrele(nce); 402 } 403 bail: 404 if (upper_nce != NULL) 405 nce_refrele(upper_nce); 406 if (need_ill_refrele) 407 ill_refrele(ill); 408 return (err); 409 } 410 411 /* 412 * Remove all the CONDEMNED nces from the appropriate hash table. 413 * We create a private list of NCEs, these may have ires pointing 414 * to them, so the list will be passed through to clean up dependent 415 * ires and only then we can do ncec_refrele() which can make NCE inactive. 416 */ 417 static void 418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) 419 { 420 ncec_t *ncec1; 421 ncec_t **ptpn; 422 423 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 424 ASSERT(ndp->ndp_g_walker == 0); 425 for (; ncec; ncec = ncec1) { 426 ncec1 = ncec->ncec_next; 427 mutex_enter(&ncec->ncec_lock); 428 if (NCE_ISCONDEMNED(ncec)) { 429 ptpn = ncec->ncec_ptpn; 430 ncec1 = ncec->ncec_next; 431 if (ncec1 != NULL) 432 ncec1->ncec_ptpn = ptpn; 433 *ptpn = ncec1; 434 ncec->ncec_ptpn = NULL; 435 ncec->ncec_next = NULL; 436 ncec->ncec_next = *free_nce_list; 437 *free_nce_list = ncec; 438 } 439 mutex_exit(&ncec->ncec_lock); 440 } 441 } 442 443 /* 444 * 1. Mark the entry CONDEMNED. 
This ensures that no new nce_lookup() 445 * will return this NCE. Also no new timeouts will 446 * be started (See nce_restart_timer). 447 * 2. Cancel any currently running timeouts. 448 * 3. If there is an ndp walker, return. The walker will do the cleanup. 449 * This ensures that walkers see a consistent list of NCEs while walking. 450 * 4. Otherwise remove the NCE from the list of NCEs 451 */ 452 void 453 ncec_delete(ncec_t *ncec) 454 { 455 ncec_t **ptpn; 456 ncec_t *ncec1; 457 int ipversion = ncec->ncec_ipversion; 458 ndp_g_t *ndp; 459 ip_stack_t *ipst = ncec->ncec_ipst; 460 461 if (ipversion == IPV4_VERSION) 462 ndp = ipst->ips_ndp4; 463 else 464 ndp = ipst->ips_ndp6; 465 466 /* Serialize deletes */ 467 mutex_enter(&ncec->ncec_lock); 468 if (NCE_ISCONDEMNED(ncec)) { 469 /* Some other thread is doing the delete */ 470 mutex_exit(&ncec->ncec_lock); 471 return; 472 } 473 /* 474 * Caller has a refhold. Also 1 ref for being in the list. Thus 475 * refcnt has to be >= 2 476 */ 477 ASSERT(ncec->ncec_refcnt >= 2); 478 ncec->ncec_flags |= NCE_F_CONDEMNED; 479 mutex_exit(&ncec->ncec_lock); 480 481 /* Count how many condemned ires for kmem_cache callback */ 482 atomic_add_32(&ipst->ips_num_nce_condemned, 1); 483 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 484 485 /* Complete any waiting callbacks */ 486 ncec_cb_dispatch(ncec); 487 488 /* 489 * Cancel any running timer. Timeout can't be restarted 490 * since CONDEMNED is set. Can't hold ncec_lock across untimeout. 491 * Passing invalid timeout id is fine. 492 */ 493 if (ncec->ncec_timeout_id != 0) { 494 (void) untimeout(ncec->ncec_timeout_id); 495 ncec->ncec_timeout_id = 0; 496 } 497 498 mutex_enter(&ndp->ndp_g_lock); 499 if (ncec->ncec_ptpn == NULL) { 500 /* 501 * The last ndp walker has already removed this ncec from 502 * the list after we marked the ncec CONDEMNED and before 503 * we grabbed the global lock. 
504 */ 505 mutex_exit(&ndp->ndp_g_lock); 506 return; 507 } 508 if (ndp->ndp_g_walker > 0) { 509 /* 510 * Can't unlink. The walker will clean up 511 */ 512 ndp->ndp_g_walker_cleanup = B_TRUE; 513 mutex_exit(&ndp->ndp_g_lock); 514 return; 515 } 516 517 /* 518 * Now remove the ncec from the list. nce_restart_timer won't restart 519 * the timer since it is marked CONDEMNED. 520 */ 521 ptpn = ncec->ncec_ptpn; 522 ncec1 = ncec->ncec_next; 523 if (ncec1 != NULL) 524 ncec1->ncec_ptpn = ptpn; 525 *ptpn = ncec1; 526 ncec->ncec_ptpn = NULL; 527 ncec->ncec_next = NULL; 528 mutex_exit(&ndp->ndp_g_lock); 529 530 /* Removed from ncec_ptpn/ncec_next list */ 531 ncec_refrele_notr(ncec); 532 } 533 534 void 535 ncec_inactive(ncec_t *ncec) 536 { 537 mblk_t **mpp; 538 ill_t *ill = ncec->ncec_ill; 539 ip_stack_t *ipst = ncec->ncec_ipst; 540 541 ASSERT(ncec->ncec_refcnt == 0); 542 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 543 544 /* Count how many condemned nces for kmem_cache callback */ 545 if (NCE_ISCONDEMNED(ncec)) 546 atomic_add_32(&ipst->ips_num_nce_condemned, -1); 547 548 /* Free all allocated messages */ 549 mpp = &ncec->ncec_qd_mp; 550 while (*mpp != NULL) { 551 mblk_t *mp; 552 553 mp = *mpp; 554 *mpp = mp->b_next; 555 556 inet_freemsg(mp); 557 } 558 /* 559 * must have been cleaned up in ncec_delete 560 */ 561 ASSERT(list_is_empty(&ncec->ncec_cb)); 562 list_destroy(&ncec->ncec_cb); 563 /* 564 * free the ncec_lladdr if one was allocated in nce_add_common() 565 */ 566 if (ncec->ncec_lladdr_length > 0) 567 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length); 568 569 #ifdef DEBUG 570 ncec_trace_cleanup(ncec); 571 #endif 572 573 mutex_enter(&ill->ill_lock); 574 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 575 (char *), "ncec", (void *), ncec); 576 ill->ill_ncec_cnt--; 577 ncec->ncec_ill = NULL; 578 /* 579 * If the number of ncec's associated with this ill have dropped 580 * to zero, check whether we need to restart any operation that 581 * is waiting for this to happen. 
582 */ 583 if (ILL_DOWN_OK(ill)) { 584 /* ipif_ill_refrele_tail drops the ill_lock */ 585 ipif_ill_refrele_tail(ill); 586 } else { 587 mutex_exit(&ill->ill_lock); 588 } 589 590 mutex_destroy(&ncec->ncec_lock); 591 kmem_cache_free(ncec_cache, ncec); 592 } 593 594 /* 595 * ncec_walk routine. Delete the ncec if it is associated with the ill 596 * that is going away. Always called as a writer. 597 */ 598 void 599 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg) 600 { 601 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) { 602 ncec_delete(ncec); 603 } 604 } 605 606 /* 607 * Neighbor Cache cleanup logic for a list of ncec_t entries. 608 */ 609 static void 610 nce_cleanup_list(ncec_t *ncec) 611 { 612 ncec_t *ncec_next; 613 614 ASSERT(ncec != NULL); 615 while (ncec != NULL) { 616 ncec_next = ncec->ncec_next; 617 ncec->ncec_next = NULL; 618 619 /* 620 * It is possible for the last ndp walker (this thread) 621 * to come here after ncec_delete has marked the ncec CONDEMNED 622 * and before it has removed the ncec from the fastpath list 623 * or called untimeout. So we need to do it here. It is safe 624 * for both ncec_delete and this thread to do it twice or 625 * even simultaneously since each of the threads has a 626 * reference on the ncec. 627 */ 628 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 629 /* 630 * Cancel any running timer. Timeout can't be restarted 631 * since CONDEMNED is set. The ncec_lock can't be 632 * held across untimeout though passing invalid timeout 633 * id is fine. 634 */ 635 if (ncec->ncec_timeout_id != 0) { 636 (void) untimeout(ncec->ncec_timeout_id); 637 ncec->ncec_timeout_id = 0; 638 } 639 /* Removed from ncec_ptpn/ncec_next list */ 640 ncec_refrele_notr(ncec); 641 ncec = ncec_next; 642 } 643 } 644 645 /* 646 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 
647 */ 648 boolean_t 649 nce_restart_dad(ncec_t *ncec) 650 { 651 boolean_t started; 652 ill_t *ill, *hwaddr_ill; 653 654 if (ncec == NULL) 655 return (B_FALSE); 656 ill = ncec->ncec_ill; 657 mutex_enter(&ncec->ncec_lock); 658 if (ncec->ncec_state == ND_PROBE) { 659 mutex_exit(&ncec->ncec_lock); 660 started = B_TRUE; 661 } else if (ncec->ncec_state == ND_REACHABLE) { 662 ASSERT(ncec->ncec_lladdr != NULL); 663 ncec->ncec_state = ND_PROBE; 664 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 665 /* 666 * Slight cheat here: we don't use the initial probe delay 667 * for IPv4 in this obscure case. 668 */ 669 mutex_exit(&ncec->ncec_lock); 670 if (IS_IPMP(ill)) { 671 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 672 ncec->ncec_lladdr, ncec->ncec_lladdr_length); 673 } else { 674 hwaddr_ill = ill; 675 } 676 nce_dad(ncec, hwaddr_ill, B_TRUE); 677 started = B_TRUE; 678 } else { 679 mutex_exit(&ncec->ncec_lock); 680 started = B_FALSE; 681 } 682 return (started); 683 } 684 685 /* 686 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. 687 * If one is found, the refcnt on the ncec will be incremented. 688 */ 689 ncec_t * 690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) 691 { 692 ncec_t *ncec; 693 ip_stack_t *ipst = ill->ill_ipst; 694 695 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 696 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 697 698 /* Get head of v6 hash table */ 699 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 700 ncec = ncec_lookup_illgrp(ill, addr, ncec); 701 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 702 rw_exit(&ipst->ips_ill_g_lock); 703 return (ncec); 704 } 705 /* 706 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. 707 * If one is found, the refcnt on the ncec will be incremented. 
708 */ 709 ncec_t * 710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) 711 { 712 ncec_t *ncec = NULL; 713 in6_addr_t addr6; 714 ip_stack_t *ipst = ill->ill_ipst; 715 716 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 717 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 718 719 /* Get head of v4 hash table */ 720 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); 721 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 722 ncec = ncec_lookup_illgrp(ill, &addr6, ncec); 723 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 724 rw_exit(&ipst->ips_ill_g_lock); 725 return (ncec); 726 } 727 728 /* 729 * Cache entry lookup. Try to find an ncec matching the parameters passed. 730 * If an ncec is found, increment the hold count on that ncec. 731 * The caller passes in the start of the appropriate hash table, and must 732 * be holding the appropriate global lock (ndp_g_lock). In addition, since 733 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock 734 * must be held as reader. 735 * 736 * This function always matches across the ipmp group. 737 */ 738 ncec_t * 739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) 740 { 741 ndp_g_t *ndp; 742 ip_stack_t *ipst = ill->ill_ipst; 743 744 if (ill->ill_isv6) 745 ndp = ipst->ips_ndp6; 746 else 747 ndp = ipst->ips_ndp4; 748 749 ASSERT(ill != NULL); 750 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 751 if (IN6_IS_ADDR_UNSPECIFIED(addr)) 752 return (NULL); 753 for (; ncec != NULL; ncec = ncec->ncec_next) { 754 if (ncec->ncec_ill == ill || 755 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { 756 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 757 mutex_enter(&ncec->ncec_lock); 758 if (!NCE_ISCONDEMNED(ncec)) { 759 ncec_refhold_locked(ncec); 760 mutex_exit(&ncec->ncec_lock); 761 break; 762 } 763 mutex_exit(&ncec->ncec_lock); 764 } 765 } 766 } 767 return (ncec); 768 } 769 770 /* 771 * Find an nce_t on ill with nce_addr == addr. 
Lookup the nce_t 772 * entries for ill only, i.e., when ill is part of an ipmp group, 773 * nce_lookup_v4 will never try to match across the group. 774 */ 775 nce_t * 776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr) 777 { 778 nce_t *nce; 779 in6_addr_t addr6; 780 ip_stack_t *ipst = ill->ill_ipst; 781 782 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 783 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 784 nce = nce_lookup_addr(ill, &addr6); 785 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 786 return (nce); 787 } 788 789 /* 790 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t 791 * entries for ill only, i.e., when ill is part of an ipmp group, 792 * nce_lookup_v6 will never try to match across the group. 793 */ 794 nce_t * 795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6) 796 { 797 nce_t *nce; 798 ip_stack_t *ipst = ill->ill_ipst; 799 800 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 801 nce = nce_lookup_addr(ill, addr6); 802 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 803 return (nce); 804 } 805 806 static nce_t * 807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr) 808 { 809 nce_t *nce; 810 811 ASSERT(ill != NULL); 812 #ifdef DEBUG 813 if (ill->ill_isv6) 814 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 815 else 816 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 817 #endif 818 mutex_enter(&ill->ill_lock); 819 nce = nce_lookup(ill, addr); 820 mutex_exit(&ill->ill_lock); 821 return (nce); 822 } 823 824 825 /* 826 * Router turned to host. We need to make sure that cached copies of the ncec 827 * are not used for forwarding packets if they were derived from the default 828 * route, and that the default route itself is removed, as required by 829 * section 7.2.5 of RFC 2461. 830 * 831 * Note that the ncec itself probably has valid link-layer information for the 832 * nexthop, so that there is no reason to delete the ncec, as long as the 833 * ISROUTER flag is turned off. 
834 */ 835 static void 836 ncec_router_to_host(ncec_t *ncec) 837 { 838 ire_t *ire; 839 ip_stack_t *ipst = ncec->ncec_ipst; 840 841 mutex_enter(&ncec->ncec_lock); 842 ncec->ncec_flags &= ~NCE_F_ISROUTER; 843 mutex_exit(&ncec->ncec_lock); 844 845 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, 846 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL, 847 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL); 848 if (ire != NULL) { 849 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 850 ire_delete(ire); 851 ire_refrele(ire); 852 } 853 } 854 855 /* 856 * Process passed in parameters either from an incoming packet or via 857 * user ioctl. 858 */ 859 void 860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) 861 { 862 ill_t *ill = ncec->ncec_ill; 863 uint32_t hw_addr_len = ill->ill_phys_addr_length; 864 boolean_t ll_updated = B_FALSE; 865 boolean_t ll_changed; 866 nce_t *nce; 867 868 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 869 /* 870 * No updates of link layer address or the neighbor state is 871 * allowed, when the cache is in NONUD state. This still 872 * allows for responding to reachability solicitation. 873 */ 874 mutex_enter(&ncec->ncec_lock); 875 if (ncec->ncec_state == ND_INCOMPLETE) { 876 if (hw_addr == NULL) { 877 mutex_exit(&ncec->ncec_lock); 878 return; 879 } 880 nce_set_ll(ncec, hw_addr); 881 /* 882 * Update ncec state and send the queued packets 883 * back to ip this time ire will be added. 
884 */ 885 if (flag & ND_NA_FLAG_SOLICITED) { 886 nce_update(ncec, ND_REACHABLE, NULL); 887 } else { 888 nce_update(ncec, ND_STALE, NULL); 889 } 890 mutex_exit(&ncec->ncec_lock); 891 nce = nce_fastpath(ncec, B_TRUE, NULL); 892 nce_resolv_ok(ncec); 893 if (nce != NULL) 894 nce_refrele(nce); 895 return; 896 } 897 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len); 898 if (!is_adv) { 899 /* If this is a SOLICITATION request only */ 900 if (ll_changed) 901 nce_update(ncec, ND_STALE, hw_addr); 902 mutex_exit(&ncec->ncec_lock); 903 ncec_cb_dispatch(ncec); 904 return; 905 } 906 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { 907 /* If in any other state than REACHABLE, ignore */ 908 if (ncec->ncec_state == ND_REACHABLE) { 909 nce_update(ncec, ND_STALE, NULL); 910 } 911 mutex_exit(&ncec->ncec_lock); 912 ncec_cb_dispatch(ncec); 913 return; 914 } else { 915 if (ll_changed) { 916 nce_update(ncec, ND_UNCHANGED, hw_addr); 917 ll_updated = B_TRUE; 918 } 919 if (flag & ND_NA_FLAG_SOLICITED) { 920 nce_update(ncec, ND_REACHABLE, NULL); 921 } else { 922 if (ll_updated) { 923 nce_update(ncec, ND_STALE, NULL); 924 } 925 } 926 mutex_exit(&ncec->ncec_lock); 927 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags & 928 NCE_F_ISROUTER)) { 929 ncec_router_to_host(ncec); 930 } else { 931 ncec_cb_dispatch(ncec); 932 } 933 } 934 } 935 936 /* 937 * Pass arg1 to the pfi supplied, along with each ncec in existence. 938 * ncec_walk() places a REFHOLD on the ncec and drops the lock when 939 * walking the hash list. 
940 */ 941 void 942 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, 943 boolean_t trace) 944 { 945 ncec_t *ncec; 946 ncec_t *ncec1; 947 ncec_t **ncep; 948 ncec_t *free_nce_list = NULL; 949 950 mutex_enter(&ndp->ndp_g_lock); 951 /* Prevent ncec_delete from unlink and free of NCE */ 952 ndp->ndp_g_walker++; 953 mutex_exit(&ndp->ndp_g_lock); 954 for (ncep = ndp->nce_hash_tbl; 955 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 956 for (ncec = *ncep; ncec != NULL; ncec = ncec1) { 957 ncec1 = ncec->ncec_next; 958 if (ill == NULL || ncec->ncec_ill == ill) { 959 if (trace) { 960 ncec_refhold(ncec); 961 (*pfi)(ncec, arg1); 962 ncec_refrele(ncec); 963 } else { 964 ncec_refhold_notr(ncec); 965 (*pfi)(ncec, arg1); 966 ncec_refrele_notr(ncec); 967 } 968 } 969 } 970 } 971 mutex_enter(&ndp->ndp_g_lock); 972 ndp->ndp_g_walker--; 973 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { 974 /* Time to delete condemned entries */ 975 for (ncep = ndp->nce_hash_tbl; 976 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 977 ncec = *ncep; 978 if (ncec != NULL) { 979 nce_remove(ndp, ncec, &free_nce_list); 980 } 981 } 982 ndp->ndp_g_walker_cleanup = B_FALSE; 983 } 984 985 mutex_exit(&ndp->ndp_g_lock); 986 987 if (free_nce_list != NULL) { 988 nce_cleanup_list(free_nce_list); 989 } 990 } 991 992 /* 993 * Walk everything. 994 * Note that ill can be NULL hence can't derive the ipst from it. 995 */ 996 void 997 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) 998 { 999 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); 1000 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); 1001 } 1002 1003 /* 1004 * For each interface an entry is added for the unspecified multicast group. 1005 * Here that mapping is used to form the multicast cache entry for a particular 1006 * multicast destination. 
1007 */ 1008 static int 1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, 1010 uint16_t flags, nce_t **newnce) 1011 { 1012 uchar_t *hw_addr; 1013 int err = 0; 1014 ip_stack_t *ipst = ill->ill_ipst; 1015 nce_t *nce; 1016 1017 ASSERT(ill != NULL); 1018 ASSERT(ill->ill_isv6); 1019 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); 1020 1021 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1022 nce = nce_lookup_addr(ill, dst); 1023 if (nce != NULL) { 1024 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1025 goto done; 1026 } 1027 if (ill->ill_net_type == IRE_IF_RESOLVER) { 1028 /* 1029 * For IRE_IF_RESOLVER a hardware mapping can be 1030 * generated. 1031 */ 1032 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); 1033 if (hw_addr == NULL) { 1034 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1035 return (ENOMEM); 1036 } 1037 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 1038 } else { 1039 /* No hw_addr is needed for IRE_IF_NORESOLVER. */ 1040 hw_addr = NULL; 1041 } 1042 ASSERT((flags & NCE_F_MCAST) != 0); 1043 ASSERT((flags & NCE_F_NONUD) != 0); 1044 /* nce_state will be computed by nce_add_common() */ 1045 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 1046 ND_UNCHANGED, &nce); 1047 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1048 if (err == 0) 1049 err = nce_add_v6_postprocess(nce); 1050 if (hw_addr != NULL) 1051 kmem_free(hw_addr, ill->ill_nd_lla_len); 1052 if (err != 0) { 1053 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); 1054 return (err); 1055 } 1056 done: 1057 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); 1058 if (newnce != NULL) 1059 *newnce = nce; 1060 else 1061 nce_refrele(nce); 1062 return (0); 1063 } 1064 1065 /* 1066 * Return the link layer address, and any flags of a ncec. 
1067 */ 1068 int 1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1070 { 1071 ncec_t *ncec; 1072 in6_addr_t *addr; 1073 sin6_t *sin6; 1074 1075 ASSERT(ill != NULL && ill->ill_isv6); 1076 sin6 = (sin6_t *)&lnr->lnr_addr; 1077 addr = &sin6->sin6_addr; 1078 1079 /* 1080 * NOTE: if the ill is an IPMP interface, then match against the whole 1081 * illgrp. This e.g. allows in.ndpd to retrieve the link layer 1082 * addresses for the data addresses on an IPMP interface even though 1083 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. 1084 */ 1085 ncec = ncec_lookup_illgrp_v6(ill, addr); 1086 if (ncec == NULL) 1087 return (ESRCH); 1088 /* If no link layer address is available yet, return ESRCH */ 1089 if (!NCE_ISREACHABLE(ncec)) { 1090 ncec_refrele(ncec); 1091 return (ESRCH); 1092 } 1093 lnr->lnr_hdw_len = ill->ill_phys_addr_length; 1094 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, 1095 lnr->lnr_hdw_len); 1096 if (ncec->ncec_flags & NCE_F_ISROUTER) 1097 lnr->lnr_flags = NDF_ISROUTER_ON; 1098 if (ncec->ncec_flags & NCE_F_ANYCAST) 1099 lnr->lnr_flags |= NDF_ANYCAST_ON; 1100 ncec_refrele(ncec); 1101 return (0); 1102 } 1103 1104 /* 1105 * Finish setting up the Enable/Disable multicast for the driver. 
1106 */ 1107 mblk_t * 1108 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1109 uint32_t hw_addr_offset, mblk_t *mp) 1110 { 1111 uchar_t *hw_addr; 1112 ipaddr_t v4group; 1113 uchar_t *addr; 1114 1115 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1116 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1117 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1118 1119 ASSERT(CLASSD(v4group)); 1120 ASSERT(!(ill->ill_isv6)); 1121 1122 addr = (uchar_t *)&v4group; 1123 } else { 1124 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1125 ASSERT(ill->ill_isv6); 1126 1127 addr = (uchar_t *)v6group; 1128 } 1129 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1130 if (hw_addr == NULL) { 1131 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1132 freemsg(mp); 1133 return (NULL); 1134 } 1135 1136 ip_mcast_mapping(ill, addr, hw_addr); 1137 return (mp); 1138 } 1139 1140 void 1141 ip_ndp_resolve(ncec_t *ncec) 1142 { 1143 in_addr_t sender4 = INADDR_ANY; 1144 in6_addr_t sender6 = ipv6_all_zeros; 1145 ill_t *src_ill; 1146 uint32_t ms; 1147 1148 src_ill = nce_resolve_src(ncec, &sender6); 1149 if (src_ill == NULL) { 1150 /* Make sure we try again later */ 1151 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1152 nce_restart_timer(ncec, (clock_t)ms); 1153 return; 1154 } 1155 if (ncec->ncec_ipversion == IPV4_VERSION) 1156 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1157 mutex_enter(&ncec->ncec_lock); 1158 if (ncec->ncec_ipversion == IPV6_VERSION) 1159 ms = ndp_solicit(ncec, sender6, src_ill); 1160 else 1161 ms = arp_request(ncec, sender4, src_ill); 1162 mutex_exit(&ncec->ncec_lock); 1163 if (ms == 0) { 1164 if (ncec->ncec_state != ND_REACHABLE) { 1165 if (ncec->ncec_ipversion == IPV6_VERSION) 1166 ndp_resolv_failed(ncec); 1167 else 1168 arp_resolv_failed(ncec); 1169 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1170 nce_make_unreachable(ncec); 1171 ncec_delete(ncec); 1172 } 1173 } else { 1174 nce_restart_timer(ncec, (clock_t)ms); 1175 } 1176 done: 1177 ill_refrele(src_ill); 1178 } 1179 
1180 /* 1181 * Send an IPv6 neighbor solicitation. 1182 * Returns number of milliseconds after which we should either rexmit or abort. 1183 * Return of zero means we should abort. 1184 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt. 1185 * The optional source address is used as a hint to ndp_solicit for 1186 * which source to use in the packet. 1187 * 1188 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending 1189 * the packet. 1190 */ 1191 uint32_t 1192 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill) 1193 { 1194 in6_addr_t dst; 1195 boolean_t dropped = B_FALSE; 1196 1197 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 1198 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 1199 1200 if (ncec->ncec_rcnt == 0) 1201 return (0); 1202 1203 dst = ncec->ncec_addr; 1204 ncec->ncec_rcnt--; 1205 mutex_exit(&ncec->ncec_lock); 1206 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr, 1207 ill->ill_phys_addr_length, &src, &dst, 0); 1208 mutex_enter(&ncec->ncec_lock); 1209 if (dropped) 1210 ncec->ncec_rcnt++; 1211 return (ncec->ncec_ill->ill_reachable_retrans_time); 1212 } 1213 1214 /* 1215 * Attempt to recover an address on an interface that's been marked as a 1216 * duplicate. Because NCEs are destroyed when the interface goes down, there's 1217 * no easy way to just probe the address and have the right thing happen if 1218 * it's no longer in use. Instead, we just bring it up normally and allow the 1219 * regular interface start-up logic to probe for a remaining duplicate and take 1220 * us back down if necessary. 1221 * Neither DHCP nor temporary addresses arrive here; they're excluded by 1222 * ip_ndp_excl. 
1223 */ 1224 /* ARGSUSED */ 1225 void 1226 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1227 { 1228 ill_t *ill = rq->q_ptr; 1229 ipif_t *ipif; 1230 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr; 1231 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr; 1232 boolean_t addr_equal; 1233 1234 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1235 /* 1236 * We do not support recovery of proxy ARP'd interfaces, 1237 * because the system lacks a complete proxy ARP mechanism. 1238 */ 1239 if (ill->ill_isv6) { 1240 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 1241 addr6); 1242 } else { 1243 addr_equal = (ipif->ipif_lcl_addr == *addr4); 1244 } 1245 1246 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal) 1247 continue; 1248 1249 /* 1250 * If we have already recovered or if the interface is going 1251 * away, then ignore. 1252 */ 1253 mutex_enter(&ill->ill_lock); 1254 if (!(ipif->ipif_flags & IPIF_DUPLICATE) || 1255 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1256 mutex_exit(&ill->ill_lock); 1257 continue; 1258 } 1259 1260 ipif->ipif_flags &= ~IPIF_DUPLICATE; 1261 ill->ill_ipif_dup_count--; 1262 mutex_exit(&ill->ill_lock); 1263 ipif->ipif_was_dup = B_TRUE; 1264 1265 if (ill->ill_isv6) { 1266 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); 1267 (void) ipif_up_done_v6(ipif); 1268 } else { 1269 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) != 1270 EINPROGRESS); 1271 (void) ipif_up_done(ipif); 1272 } 1273 } 1274 freeb(mp); 1275 } 1276 1277 /* 1278 * Attempt to recover an IPv6 interface that's been shut down as a duplicate. 1279 * As long as someone else holds the address, the interface will stay down. 1280 * When that conflict goes away, the interface is brought back up. This is 1281 * done so that accidental shutdowns of addresses aren't made permanent. Your 1282 * server will recover from a failure. 1283 * 1284 * For DHCP and temporary addresses, recovery is not done in the kernel. 
1285 * Instead, it's handled by user space processes (dhcpagent and in.ndpd). 1286 * 1287 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 1288 */ 1289 void 1290 ipif_dup_recovery(void *arg) 1291 { 1292 ipif_t *ipif = arg; 1293 1294 ipif->ipif_recovery_id = 0; 1295 if (!(ipif->ipif_flags & IPIF_DUPLICATE)) 1296 return; 1297 1298 /* 1299 * No lock, because this is just an optimization. 1300 */ 1301 if (ipif->ipif_state_flags & IPIF_CONDEMNED) 1302 return; 1303 1304 /* If the link is down, we'll retry this later */ 1305 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 1306 return; 1307 1308 ipif_do_recovery(ipif); 1309 } 1310 1311 /* 1312 * Perform interface recovery by forcing the duplicate interfaces up and 1313 * allowing the system to determine which ones should stay up. 1314 * 1315 * Called both by recovery timer expiry and link-up notification. 1316 */ 1317 void 1318 ipif_do_recovery(ipif_t *ipif) 1319 { 1320 ill_t *ill = ipif->ipif_ill; 1321 mblk_t *mp; 1322 ip_stack_t *ipst = ill->ill_ipst; 1323 size_t mp_size; 1324 1325 if (ipif->ipif_isv6) 1326 mp_size = sizeof (ipif->ipif_v6lcl_addr); 1327 else 1328 mp_size = sizeof (ipif->ipif_lcl_addr); 1329 mp = allocb(mp_size, BPRI_MED); 1330 if (mp == NULL) { 1331 mutex_enter(&ill->ill_lock); 1332 if (ipst->ips_ip_dup_recovery > 0 && 1333 ipif->ipif_recovery_id == 0 && 1334 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1335 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1336 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1337 } 1338 mutex_exit(&ill->ill_lock); 1339 } else { 1340 /* 1341 * A recovery timer may still be running if we got here from 1342 * ill_restart_dad(); cancel that timer. 
1343 */ 1344 if (ipif->ipif_recovery_id != 0) 1345 (void) untimeout(ipif->ipif_recovery_id); 1346 ipif->ipif_recovery_id = 0; 1347 1348 if (ipif->ipif_isv6) { 1349 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, 1350 sizeof (ipif->ipif_v6lcl_addr)); 1351 } else { 1352 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr, 1353 sizeof (ipif->ipif_lcl_addr)); 1354 } 1355 ill_refhold(ill); 1356 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP, 1357 B_FALSE); 1358 } 1359 } 1360 1361 /* 1362 * Find the MAC and IP addresses in an NA/NS message. 1363 */ 1364 static void 1365 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill, 1366 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp) 1367 { 1368 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1369 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 1370 uchar_t *addr; 1371 int alen; 1372 1373 /* icmp_inbound_v6 ensures this */ 1374 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1375 1376 addr = ira->ira_l2src; 1377 alen = ill->ill_phys_addr_length; 1378 if (alen > 0) { 1379 *haddr = addr; 1380 *haddrlenp = alen; 1381 } else { 1382 *haddr = NULL; 1383 *haddrlenp = 0; 1384 } 1385 1386 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ 1387 *targp = ns->nd_ns_target; 1388 } 1389 1390 /* 1391 * This is for exclusive changes due to NDP duplicate address detection 1392 * failure. 
1393 */ 1394 /* ARGSUSED */ 1395 static void 1396 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1397 { 1398 ill_t *ill = rq->q_ptr; 1399 ipif_t *ipif; 1400 uchar_t *haddr; 1401 uint_t haddrlen; 1402 ip_stack_t *ipst = ill->ill_ipst; 1403 in6_addr_t targ; 1404 ip_recv_attr_t iras; 1405 mblk_t *attrmp; 1406 1407 attrmp = mp; 1408 mp = mp->b_cont; 1409 attrmp->b_cont = NULL; 1410 if (!ip_recv_attr_from_mblk(attrmp, &iras)) { 1411 /* The ill or ip_stack_t disappeared on us */ 1412 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1413 ip_drop_input("ip_recv_attr_from_mblk", mp, ill); 1414 freemsg(mp); 1415 ira_cleanup(&iras, B_TRUE); 1416 return; 1417 } 1418 1419 ASSERT(ill == iras.ira_rill); 1420 1421 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); 1422 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { 1423 /* 1424 * Ignore conflicts generated by misbehaving switches that 1425 * just reflect our own messages back to us. For IPMP, we may 1426 * see reflections across any ill in the illgrp. 1427 * 1428 * RFC2462 and revisions tried to detect both the case 1429 * when a statically configured IPv6 address is a duplicate, 1430 * and the case when the L2 address itself is a duplicate. The 1431 * later is important because, with stateles address autoconf, 1432 * if the L2 address is a duplicate, the resulting IPv6 1433 * address(es) would also be duplicates. We rely on DAD of the 1434 * IPv6 address itself to detect the latter case. 1435 */ 1436 /* For an under ill_grp can change under lock */ 1437 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1438 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || 1439 IS_UNDER_IPMP(ill) && 1440 ipmp_illgrp_find_ill(ill->ill_grp, haddr, 1441 haddrlen) != NULL) { 1442 rw_exit(&ipst->ips_ill_g_lock); 1443 goto ignore_conflict; 1444 } 1445 rw_exit(&ipst->ips_ill_g_lock); 1446 } 1447 1448 /* 1449 * Look up the appropriate ipif. 
1450 */ 1451 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); 1452 if (ipif == NULL) 1453 goto ignore_conflict; 1454 1455 /* Reload the ill to match the ipif */ 1456 ill = ipif->ipif_ill; 1457 1458 /* If it's already duplicate or ineligible, then don't do anything. */ 1459 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { 1460 ipif_refrele(ipif); 1461 goto ignore_conflict; 1462 } 1463 1464 /* 1465 * If this is a failure during duplicate recovery, then don't 1466 * complain. It may take a long time to recover. 1467 */ 1468 if (!ipif->ipif_was_dup) { 1469 char ibuf[LIFNAMSIZ]; 1470 char hbuf[MAC_STR_LEN]; 1471 char sbuf[INET6_ADDRSTRLEN]; 1472 1473 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 1474 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" 1475 " disabled", ibuf, 1476 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), 1477 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); 1478 } 1479 mutex_enter(&ill->ill_lock); 1480 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1481 ipif->ipif_flags |= IPIF_DUPLICATE; 1482 ill->ill_ipif_dup_count++; 1483 mutex_exit(&ill->ill_lock); 1484 (void) ipif_down(ipif, NULL, NULL); 1485 (void) ipif_down_tail(ipif); 1486 mutex_enter(&ill->ill_lock); 1487 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1488 ill->ill_net_type == IRE_IF_RESOLVER && 1489 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 1490 ipst->ips_ip_dup_recovery > 0) { 1491 ASSERT(ipif->ipif_recovery_id == 0); 1492 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1493 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1494 } 1495 mutex_exit(&ill->ill_lock); 1496 ipif_refrele(ipif); 1497 1498 ignore_conflict: 1499 freemsg(mp); 1500 ira_cleanup(&iras, B_TRUE); 1501 } 1502 1503 /* 1504 * Handle failure by tearing down the ipifs with the specified address. Note 1505 * that tearing down the ipif also means deleting the ncec through ipif_down, so 1506 * it's not possible to do recovery by just restarting the ncec timer. 
Instead, 1507 * we start a timer on the ipif. 1508 * Caller has to free mp; 1509 */ 1510 static void 1511 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) 1512 { 1513 const uchar_t *haddr; 1514 ill_t *ill = ira->ira_rill; 1515 1516 /* 1517 * Ignore conflicts generated by misbehaving switches that just 1518 * reflect our own messages back to us. 1519 */ 1520 1521 /* icmp_inbound_v6 ensures this */ 1522 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1523 haddr = ira->ira_l2src; 1524 if (haddr != NULL && 1525 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { 1526 return; 1527 } 1528 1529 if ((mp = copymsg(mp)) != NULL) { 1530 mblk_t *attrmp; 1531 1532 attrmp = ip_recv_attr_to_mblk(ira); 1533 if (attrmp == NULL) { 1534 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1535 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1536 freemsg(mp); 1537 } else { 1538 ASSERT(attrmp->b_cont == NULL); 1539 attrmp->b_cont = mp; 1540 mp = attrmp; 1541 ill_refhold(ill); 1542 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, 1543 B_FALSE); 1544 } 1545 } 1546 } 1547 1548 /* 1549 * Handle a discovered conflict: some other system is advertising that it owns 1550 * one of our IP addresses. We need to defend ourselves, or just shut down the 1551 * interface. 1552 * 1553 * Handles both IPv4 and IPv6 1554 */ 1555 boolean_t 1556 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) 1557 { 1558 ipif_t *ipif; 1559 clock_t now; 1560 uint_t maxdefense; 1561 uint_t defs; 1562 ill_t *ill = ira->ira_ill; 1563 ip_stack_t *ipst = ill->ill_ipst; 1564 uint32_t elapsed; 1565 boolean_t isv6 = ill->ill_isv6; 1566 ipaddr_t ncec_addr; 1567 1568 if (isv6) { 1569 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, 1570 ipst); 1571 } else { 1572 if (arp_no_defense) { 1573 /* 1574 * Yes, there is a conflict, but no, we do not 1575 * defend ourself. 
1576 */ 1577 return (B_TRUE); 1578 } 1579 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 1580 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, 1581 ipst); 1582 } 1583 if (ipif == NULL) 1584 return (B_FALSE); 1585 1586 /* 1587 * First, figure out if this address is disposable. 1588 */ 1589 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) 1590 maxdefense = ipst->ips_ip_max_temp_defend; 1591 else 1592 maxdefense = ipst->ips_ip_max_defend; 1593 1594 /* 1595 * Now figure out how many times we've defended ourselves. Ignore 1596 * defenses that happened long in the past. 1597 */ 1598 now = ddi_get_lbolt(); 1599 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; 1600 mutex_enter(&ncec->ncec_lock); 1601 if ((defs = ncec->ncec_defense_count) > 0 && 1602 elapsed > ipst->ips_ip_defend_interval) { 1603 /* 1604 * ip_defend_interval has elapsed. 1605 * reset the defense count. 1606 */ 1607 ncec->ncec_defense_count = defs = 0; 1608 } 1609 ncec->ncec_defense_count++; 1610 ncec->ncec_last_time_defended = now; 1611 mutex_exit(&ncec->ncec_lock); 1612 ipif_refrele(ipif); 1613 1614 /* 1615 * If we've defended ourselves too many times already, then give up and 1616 * tear down the interface(s) using this address. 1617 * Otherwise, caller has to defend by sending out an announce. 1618 */ 1619 if (defs >= maxdefense) { 1620 if (isv6) 1621 ndp_failure(mp, ira); 1622 else 1623 arp_failure(mp, ira); 1624 } else { 1625 return (B_TRUE); /* caller must defend this address */ 1626 } 1627 return (B_FALSE); 1628 } 1629 1630 /* 1631 * Handle reception of Neighbor Solicitation messages. 
1632 */ 1633 static void 1634 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) 1635 { 1636 ill_t *ill = ira->ira_ill, *under_ill; 1637 nd_neighbor_solicit_t *ns; 1638 uint32_t hlen = ill->ill_phys_addr_length; 1639 uchar_t *haddr = NULL; 1640 icmp6_t *icmp_nd; 1641 ip6_t *ip6h; 1642 ncec_t *our_ncec = NULL; 1643 in6_addr_t target; 1644 in6_addr_t src; 1645 int len; 1646 int flag = 0; 1647 nd_opt_hdr_t *opt = NULL; 1648 boolean_t bad_solicit = B_FALSE; 1649 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1650 boolean_t need_ill_refrele = B_FALSE; 1651 1652 ip6h = (ip6_t *)mp->b_rptr; 1653 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1654 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1655 src = ip6h->ip6_src; 1656 ns = (nd_neighbor_solicit_t *)icmp_nd; 1657 target = ns->nd_ns_target; 1658 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 1659 IN6_IS_ADDR_LOOPBACK(&target)) { 1660 if (ip_debug > 2) { 1661 /* ip1dbg */ 1662 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 1663 AF_INET6, &target); 1664 } 1665 bad_solicit = B_TRUE; 1666 goto done; 1667 } 1668 if (len > sizeof (nd_neighbor_solicit_t)) { 1669 /* Options present */ 1670 opt = (nd_opt_hdr_t *)&ns[1]; 1671 len -= sizeof (nd_neighbor_solicit_t); 1672 if (!ndp_verify_optlen(opt, len)) { 1673 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1674 bad_solicit = B_TRUE; 1675 goto done; 1676 } 1677 } 1678 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1679 /* Check to see if this is a valid DAD solicitation */ 1680 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1681 if (ip_debug > 2) { 1682 /* ip1dbg */ 1683 pr_addr_dbg("ndp_input_solicit: IPv6 " 1684 "Destination is not solicited node " 1685 "multicast %s\n", AF_INET6, 1686 &ip6h->ip6_dst); 1687 } 1688 bad_solicit = B_TRUE; 1689 goto done; 1690 } 1691 } 1692 1693 /* 1694 * NOTE: with IPMP, it's possible the nominated multicast ill (which 1695 * received this packet if it's multicast) is not the ill tied to 1696 * e.g. 
the IPMP ill's data link-local. So we match across the illgrp 1697 * to ensure we find the associated NCE. 1698 */ 1699 our_ncec = ncec_lookup_illgrp_v6(ill, &target); 1700 /* 1701 * If this is a valid Solicitation for an address we are publishing, 1702 * then a PUBLISH entry should exist in the cache 1703 */ 1704 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { 1705 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1706 "ifname=%s ", ill->ill_name)); 1707 if (ip_debug > 2) { 1708 /* ip1dbg */ 1709 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1710 } 1711 if (our_ncec == NULL) 1712 bad_solicit = B_TRUE; 1713 goto done; 1714 } 1715 1716 /* At this point we should have a verified NS per spec */ 1717 if (opt != NULL) { 1718 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1719 if (opt != NULL) { 1720 haddr = (uchar_t *)&opt[1]; 1721 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1722 hlen == 0) { 1723 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1724 bad_solicit = B_TRUE; 1725 goto done; 1726 } 1727 } 1728 } 1729 1730 /* If sending directly to peer, set the unicast flag */ 1731 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1732 flag |= NDP_UNICAST; 1733 1734 /* 1735 * Create/update the entry for the soliciting node on the ipmp_ill. 1736 * or respond to outstanding queries, don't if 1737 * the source is unspecified address. 1738 */ 1739 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1740 int err; 1741 nce_t *nnce; 1742 1743 ASSERT(ill->ill_isv6); 1744 /* 1745 * Regular solicitations *must* include the Source Link-Layer 1746 * Address option. Ignore messages that do not. 1747 */ 1748 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1749 ip1dbg(("ndp_input_solicit: source link-layer address " 1750 "option missing with a specified source.\n")); 1751 bad_solicit = B_TRUE; 1752 goto done; 1753 } 1754 1755 /* 1756 * This is a regular solicitation. 
If we're still in the 1757 * process of verifying the address, then don't respond at all 1758 * and don't keep track of the sender. 1759 */ 1760 if (our_ncec->ncec_state == ND_PROBE) 1761 goto done; 1762 1763 /* 1764 * If the solicitation doesn't have sender hardware address 1765 * (legal for unicast solicitation), then process without 1766 * installing the return NCE. Either we already know it, or 1767 * we'll be forced to look it up when (and if) we reply to the 1768 * packet. 1769 */ 1770 if (haddr == NULL) 1771 goto no_source; 1772 1773 under_ill = ill; 1774 if (IS_UNDER_IPMP(under_ill)) { 1775 ill = ipmp_ill_hold_ipmp_ill(under_ill); 1776 if (ill == NULL) 1777 ill = under_ill; 1778 else 1779 need_ill_refrele = B_TRUE; 1780 } 1781 err = nce_lookup_then_add_v6(ill, 1782 haddr, hlen, 1783 &src, /* Soliciting nodes address */ 1784 0, 1785 ND_STALE, 1786 &nnce); 1787 1788 if (need_ill_refrele) { 1789 ill_refrele(ill); 1790 ill = under_ill; 1791 need_ill_refrele = B_FALSE; 1792 } 1793 switch (err) { 1794 case 0: 1795 /* done with this entry */ 1796 nce_refrele(nnce); 1797 break; 1798 case EEXIST: 1799 /* 1800 * B_FALSE indicates this is not an an advertisement. 1801 */ 1802 nce_process(nnce->nce_common, haddr, 0, B_FALSE); 1803 nce_refrele(nnce); 1804 break; 1805 default: 1806 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 1807 err)); 1808 goto done; 1809 } 1810 no_source: 1811 flag |= NDP_SOLICITED; 1812 } else { 1813 /* 1814 * No source link layer address option should be present in a 1815 * valid DAD request. 1816 */ 1817 if (haddr != NULL) { 1818 ip1dbg(("ndp_input_solicit: source link-layer address " 1819 "option present with an unspecified source.\n")); 1820 bad_solicit = B_TRUE; 1821 goto done; 1822 } 1823 if (our_ncec->ncec_state == ND_PROBE) { 1824 /* 1825 * Internally looped-back probes will have 1826 * IRAF_L2SRC_LOOPBACK set so we can ignore our own 1827 * transmissions. 
1828 */ 1829 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { 1830 /* 1831 * If someone else is probing our address, then 1832 * we've crossed wires. Declare failure. 1833 */ 1834 ndp_failure(mp, ira); 1835 } 1836 goto done; 1837 } 1838 /* 1839 * This is a DAD probe. Multicast the advertisement to the 1840 * all-nodes address. 1841 */ 1842 src = ipv6_all_hosts_mcast; 1843 } 1844 flag |= nce_advert_flags(our_ncec); 1845 (void) ndp_xmit(ill, 1846 ND_NEIGHBOR_ADVERT, 1847 our_ncec->ncec_lladdr, 1848 our_ncec->ncec_lladdr_length, 1849 &target, /* Source and target of the advertisement pkt */ 1850 &src, /* IP Destination (source of original pkt) */ 1851 flag); 1852 done: 1853 if (bad_solicit) 1854 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); 1855 if (our_ncec != NULL) 1856 ncec_refrele(our_ncec); 1857 } 1858 1859 /* 1860 * Handle reception of Neighbor Solicitation messages 1861 */ 1862 void 1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) 1864 { 1865 ill_t *ill = ira->ira_ill; 1866 nd_neighbor_advert_t *na; 1867 uint32_t hlen = ill->ill_phys_addr_length; 1868 uchar_t *haddr = NULL; 1869 icmp6_t *icmp_nd; 1870 ip6_t *ip6h; 1871 ncec_t *dst_ncec = NULL; 1872 in6_addr_t target; 1873 nd_opt_hdr_t *opt = NULL; 1874 int len; 1875 ip_stack_t *ipst = ill->ill_ipst; 1876 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1877 1878 ip6h = (ip6_t *)mp->b_rptr; 1879 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1880 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1881 na = (nd_neighbor_advert_t *)icmp_nd; 1882 1883 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && 1884 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { 1885 ip1dbg(("ndp_input_advert: Target is multicast but the " 1886 "solicited flag is not zero\n")); 1887 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1888 return; 1889 } 1890 target = na->nd_na_target; 1891 if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) || 1892 IN6_IS_ADDR_LOOPBACK(&target)) { 1893 if (ip_debug > 2) { 1894 /* 
ip1dbg */ 1895 pr_addr_dbg("ndp_input_solicit: Martian Target %s\n", 1896 AF_INET6, &target); 1897 } 1898 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1899 return; 1900 } 1901 if (len > sizeof (nd_neighbor_advert_t)) { 1902 opt = (nd_opt_hdr_t *)&na[1]; 1903 if (!ndp_verify_optlen(opt, 1904 len - sizeof (nd_neighbor_advert_t))) { 1905 ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); 1906 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1907 return; 1908 } 1909 /* At this point we have a verified NA per spec */ 1910 len -= sizeof (nd_neighbor_advert_t); 1911 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); 1912 if (opt != NULL) { 1913 haddr = (uchar_t *)&opt[1]; 1914 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1915 hlen == 0) { 1916 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1917 BUMP_MIB(mib, 1918 ipv6IfIcmpInBadNeighborAdvertisements); 1919 return; 1920 } 1921 } 1922 } 1923 1924 /* 1925 * NOTE: we match across the illgrp since we need to do DAD for all of 1926 * our local addresses, and those are spread across all the active 1927 * ills in the group. 1928 */ 1929 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) 1930 return; 1931 1932 if (NCE_PUBLISH(dst_ncec)) { 1933 /* 1934 * Someone just advertised an addresses that we publish. First, 1935 * check it it was us -- if so, we can safely ignore it. 1936 * We don't get the haddr from the ira_l2src because, in the 1937 * case that the packet originated from us, on an IPMP group, 1938 * the ira_l2src may would be the link-layer address of the 1939 * cast_ill used to send the packet, which may not be the same 1940 * as the dst_ncec->ncec_lladdr of the address. 1941 */ 1942 if (haddr != NULL) { 1943 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) 1944 goto out; 1945 1946 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) 1947 goto out; /* from us -- no conflict */ 1948 1949 /* 1950 * If we're in an IPMP group, check if this is an echo 1951 * from another ill in the group. 
Use the double- 1952 * checked locking pattern to avoid grabbing 1953 * ill_g_lock in the non-IPMP case. 1954 */ 1955 if (IS_UNDER_IPMP(ill)) { 1956 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1957 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( 1958 ill->ill_grp, haddr, hlen) != NULL) { 1959 rw_exit(&ipst->ips_ill_g_lock); 1960 goto out; 1961 } 1962 rw_exit(&ipst->ips_ill_g_lock); 1963 } 1964 } 1965 1966 /* 1967 * This appears to be a real conflict. If we're trying to 1968 * configure this NCE (ND_PROBE), then shut it down. 1969 * Otherwise, handle the discovered conflict. 1970 */ 1971 if (dst_ncec->ncec_state == ND_PROBE) { 1972 ndp_failure(mp, ira); 1973 } else { 1974 if (ip_nce_conflict(mp, ira, dst_ncec)) { 1975 char hbuf[MAC_STR_LEN]; 1976 char sbuf[INET6_ADDRSTRLEN]; 1977 1978 cmn_err(CE_WARN, 1979 "node '%s' is using %s on %s", 1980 inet_ntop(AF_INET6, &target, sbuf, 1981 sizeof (sbuf)), 1982 haddr == NULL ? "<none>" : 1983 mac_colon_addr(haddr, hlen, hbuf, 1984 sizeof (hbuf)), ill->ill_name); 1985 /* 1986 * RFC 4862, Section 5.4.4 does not mandate 1987 * any specific behavior when an NA matches 1988 * a non-tentative address assigned to the 1989 * receiver. We make the choice of defending 1990 * our address, based on the assumption that 1991 * the sender has not detected the Duplicate. 1992 * 1993 * ncec_last_time_defended has been adjusted 1994 * in ip_nce_conflict() 1995 */ 1996 (void) ndp_announce(dst_ncec); 1997 } 1998 } 1999 } else { 2000 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) 2001 dst_ncec->ncec_flags |= NCE_F_ISROUTER; 2002 2003 /* B_TRUE indicates this an advertisement */ 2004 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); 2005 } 2006 out: 2007 ncec_refrele(dst_ncec); 2008 } 2009 2010 /* 2011 * Process NDP neighbor solicitation/advertisement messages. 2012 * The checksum has already checked o.k before reaching here. 
2013 * Information about the datalink header is contained in ira_l2src, but 2014 * that should be ignored for loopback packets. 2015 */ 2016 void 2017 ndp_input(mblk_t *mp, ip_recv_attr_t *ira) 2018 { 2019 ill_t *ill = ira->ira_rill; 2020 icmp6_t *icmp_nd; 2021 ip6_t *ip6h; 2022 int len; 2023 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2024 ill_t *orig_ill = NULL; 2025 2026 /* 2027 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill 2028 * and make it be the IPMP upper so avoid being confused by a packet 2029 * addressed to a unicast address on a different ill. 2030 */ 2031 if (IS_UNDER_IPMP(ill)) { 2032 orig_ill = ill; 2033 ill = ipmp_ill_hold_ipmp_ill(orig_ill); 2034 if (ill == NULL) { 2035 ill = orig_ill; 2036 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2037 ip_drop_input("ipIfStatsInDiscards - IPMP ill", 2038 mp, ill); 2039 freemsg(mp); 2040 return; 2041 } 2042 ASSERT(ill != orig_ill); 2043 orig_ill = ira->ira_ill; 2044 ira->ira_ill = ill; 2045 mib = ill->ill_icmp6_mib; 2046 } 2047 if (!pullupmsg(mp, -1)) { 2048 ip1dbg(("ndp_input: pullupmsg failed\n")); 2049 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2050 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill); 2051 goto done; 2052 } 2053 ip6h = (ip6_t *)mp->b_rptr; 2054 if (ip6h->ip6_hops != IPV6_MAX_HOPS) { 2055 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); 2056 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill); 2057 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); 2058 goto done; 2059 } 2060 /* 2061 * NDP does not accept any extension headers between the 2062 * IP header and the ICMP header since e.g. a routing 2063 * header could be dangerous. 2064 * This assumes that any AH or ESP headers are removed 2065 * by ip prior to passing the packet to ndp_input. 
2066 */ 2067 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { 2068 ip1dbg(("ndp_input: Wrong next header 0x%x\n", 2069 ip6h->ip6_nxt)); 2070 ip_drop_input("Wrong next header", mp, ill); 2071 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2072 goto done; 2073 } 2074 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2075 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT || 2076 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); 2077 if (icmp_nd->icmp6_code != 0) { 2078 ip1dbg(("ndp_input: icmp6 code != 0 \n")); 2079 ip_drop_input("code non-zero", mp, ill); 2080 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2081 goto done; 2082 } 2083 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2084 /* 2085 * Make sure packet length is large enough for either 2086 * a NS or a NA icmp packet. 2087 */ 2088 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { 2089 ip1dbg(("ndp_input: packet too short\n")); 2090 ip_drop_input("packet too short", mp, ill); 2091 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2092 goto done; 2093 } 2094 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { 2095 ndp_input_solicit(mp, ira); 2096 } else { 2097 ndp_input_advert(mp, ira); 2098 } 2099 done: 2100 freemsg(mp); 2101 if (orig_ill != NULL) { 2102 ill_refrele(ill); 2103 ira->ira_ill = orig_ill; 2104 } 2105 } 2106 2107 /* 2108 * ndp_xmit is called to form and transmit a ND solicitation or 2109 * advertisement ICMP packet. 2110 * 2111 * If the source address is unspecified and this isn't a probe (used for 2112 * duplicate address detection), an appropriate source address and link layer 2113 * address will be chosen here. The link layer address option is included if 2114 * the source is specified (i.e., all non-probe packets), and omitted (per the 2115 * specification) otherwise. 2116 * 2117 * It returns B_FALSE only if it does a successful put() to the 2118 * corresponding ill's ill_wq otherwise returns B_TRUE. 
2119 */ 2120 static boolean_t 2121 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, 2122 const in6_addr_t *sender, const in6_addr_t *target, int flag) 2123 { 2124 uint32_t len; 2125 icmp6_t *icmp6; 2126 mblk_t *mp; 2127 ip6_t *ip6h; 2128 nd_opt_hdr_t *opt; 2129 uint_t plen; 2130 zoneid_t zoneid = GLOBAL_ZONEID; 2131 ill_t *hwaddr_ill = ill; 2132 ip_xmit_attr_t ixas; 2133 ip_stack_t *ipst = ill->ill_ipst; 2134 boolean_t need_refrele = B_FALSE; 2135 boolean_t probe = B_FALSE; 2136 2137 if (IS_UNDER_IPMP(ill)) { 2138 probe = ipif_lookup_testaddr_v6(ill, sender, NULL); 2139 /* 2140 * We send non-probe packets on the upper IPMP interface. 2141 * ip_output_simple() will use cast_ill for sending any 2142 * multicast packets. Note that we can't follow the same 2143 * logic for probe packets because all interfaces in the ipmp 2144 * group may have failed, so that we really want to only try 2145 * to send the ND packet on the ill corresponding to the src 2146 * address. 2147 */ 2148 if (!probe) { 2149 ill = ipmp_ill_hold_ipmp_ill(ill); 2150 if (ill != NULL) 2151 need_refrele = B_TRUE; 2152 else 2153 ill = hwaddr_ill; 2154 } 2155 } 2156 2157 /* 2158 * If we have a unspecified source(sender) address, select a 2159 * proper source address for the solicitation here itself so 2160 * that we can initialize the h/w address correctly. 2161 * 2162 * If the sender is specified then we use this address in order 2163 * to lookup the zoneid before calling ip_output_v6(). This is to 2164 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly 2165 * by IP (we cannot guarantee that the global zone has an interface 2166 * route to the destination). 2167 * 2168 * Note that the NA never comes here with the unspecified source 2169 * address. 2170 */ 2171 2172 /* 2173 * Probes will have unspec src at this point. 
2174 */ 2175 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { 2176 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); 2177 /* 2178 * It's possible for ipif_lookup_addr_zoneid_v6() to return 2179 * ALL_ZONES if it cannot find a matching ipif for the address 2180 * we are trying to use. In this case we err on the side of 2181 * trying to send the packet by defaulting to the GLOBAL_ZONEID. 2182 */ 2183 if (zoneid == ALL_ZONES) 2184 zoneid = GLOBAL_ZONEID; 2185 } 2186 2187 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; 2188 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; 2189 mp = allocb(len, BPRI_LO); 2190 if (mp == NULL) { 2191 if (need_refrele) 2192 ill_refrele(ill); 2193 return (B_TRUE); 2194 } 2195 2196 bzero((char *)mp->b_rptr, len); 2197 mp->b_wptr = mp->b_rptr + len; 2198 2199 bzero(&ixas, sizeof (ixas)); 2200 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM; 2201 2202 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 2203 ixas.ixa_ipst = ipst; 2204 ixas.ixa_cred = kcred; 2205 ixas.ixa_cpid = NOPID; 2206 ixas.ixa_tsl = NULL; 2207 ixas.ixa_zoneid = zoneid; 2208 2209 ip6h = (ip6_t *)mp->b_rptr; 2210 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2211 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2212 ip6h->ip6_nxt = IPPROTO_ICMPV6; 2213 ip6h->ip6_hops = IPV6_MAX_HOPS; 2214 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 2215 ip6h->ip6_dst = *target; 2216 icmp6 = (icmp6_t *)&ip6h[1]; 2217 2218 if (hw_addr_len != 0) { 2219 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + 2220 sizeof (nd_neighbor_advert_t)); 2221 } else { 2222 opt = NULL; 2223 } 2224 if (operation == ND_NEIGHBOR_SOLICIT) { 2225 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 2226 2227 if (opt != NULL && !(flag & NDP_PROBE)) { 2228 /* 2229 * Note that we don't send out SLLA for ND probes 2230 * per RFC 4862, even though we do send out the src 2231 * haddr for IPv4 DAD probes, even though both IPv4 2232 * and IPv6 go out with the unspecified/INADDR_ANY 2233 * src IP 
addr. 2234 */ 2235 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; 2236 } 2237 ip6h->ip6_src = *sender; 2238 ns->nd_ns_target = *target; 2239 if (!(flag & NDP_UNICAST)) { 2240 /* Form multicast address of the target */ 2241 ip6h->ip6_dst = ipv6_solicited_node_mcast; 2242 ip6h->ip6_dst.s6_addr32[3] |= 2243 ns->nd_ns_target.s6_addr32[3]; 2244 } 2245 } else { 2246 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; 2247 2248 ASSERT(!(flag & NDP_PROBE)); 2249 if (opt != NULL) 2250 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; 2251 ip6h->ip6_src = *sender; 2252 na->nd_na_target = *sender; 2253 if (flag & NDP_ISROUTER) 2254 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; 2255 if (flag & NDP_SOLICITED) 2256 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; 2257 if (flag & NDP_ORIDE) 2258 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; 2259 } 2260 2261 if (!(flag & NDP_PROBE)) { 2262 if (hw_addr != NULL && opt != NULL) { 2263 /* Fill in link layer address and option len */ 2264 opt->nd_opt_len = (uint8_t)plen; 2265 bcopy(hw_addr, &opt[1], hw_addr_len); 2266 } 2267 } 2268 if (opt != NULL && opt->nd_opt_type == 0) { 2269 /* If there's no link layer address option, then strip it. */ 2270 len -= plen * 8; 2271 mp->b_wptr = mp->b_rptr + len; 2272 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2273 } 2274 2275 icmp6->icmp6_type = (uint8_t)operation; 2276 icmp6->icmp6_code = 0; 2277 /* 2278 * Prepare for checksum by putting icmp length in the icmp 2279 * checksum field. The checksum is calculated in ip_output.c. 2280 */ 2281 icmp6->icmp6_cksum = ip6h->ip6_plen; 2282 2283 (void) ip_output_simple(mp, &ixas); 2284 ixa_cleanup(&ixas); 2285 if (need_refrele) 2286 ill_refrele(ill); 2287 return (B_FALSE); 2288 } 2289 2290 /* 2291 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 2292 * The datapath uses this as an indication that there 2293 * is a problem (as opposed to a NCE that was just 2294 * reclaimed due to lack of memory. 
2295 * Note that static ARP entries never become unreachable. 2296 */ 2297 void 2298 nce_make_unreachable(ncec_t *ncec) 2299 { 2300 mutex_enter(&ncec->ncec_lock); 2301 ncec->ncec_state = ND_UNREACHABLE; 2302 mutex_exit(&ncec->ncec_lock); 2303 } 2304 2305 /* 2306 * NCE retransmit timer. Common to IPv4 and IPv6. 2307 * This timer goes off when: 2308 * a. It is time to retransmit a resolution for resolver. 2309 * b. It is time to send reachability probes. 2310 */ 2311 void 2312 nce_timer(void *arg) 2313 { 2314 ncec_t *ncec = arg; 2315 ill_t *ill = ncec->ncec_ill, *src_ill; 2316 char addrbuf[INET6_ADDRSTRLEN]; 2317 boolean_t dropped = B_FALSE; 2318 ip_stack_t *ipst = ncec->ncec_ipst; 2319 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2320 in_addr_t sender4 = INADDR_ANY; 2321 in6_addr_t sender6 = ipv6_all_zeros; 2322 2323 /* 2324 * The timer has to be cancelled by ncec_delete before doing the final 2325 * refrele. So the NCE is guaranteed to exist when the timer runs 2326 * until it clears the timeout_id. Before clearing the timeout_id 2327 * bump up the refcnt so that we can continue to use the ncec 2328 */ 2329 ASSERT(ncec != NULL); 2330 mutex_enter(&ncec->ncec_lock); 2331 ncec_refhold_locked(ncec); 2332 ncec->ncec_timeout_id = 0; 2333 mutex_exit(&ncec->ncec_lock); 2334 2335 src_ill = nce_resolve_src(ncec, &sender6); 2336 /* if we could not find a sender address, return */ 2337 if (src_ill == NULL) { 2338 if (!isv6) { 2339 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); 2340 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, 2341 &sender4, addrbuf, sizeof (addrbuf)))); 2342 } else { 2343 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, 2344 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2345 } 2346 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2347 ncec_refrele(ncec); 2348 return; 2349 } 2350 if (!isv6) 2351 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 2352 2353 mutex_enter(&ncec->ncec_lock); 2354 /* 2355 * Check the reachability state. 
2356 */ 2357 switch (ncec->ncec_state) { 2358 case ND_DELAY: 2359 ASSERT(ncec->ncec_lladdr != NULL); 2360 ncec->ncec_state = ND_PROBE; 2361 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2362 if (isv6) { 2363 mutex_exit(&ncec->ncec_lock); 2364 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 2365 src_ill->ill_phys_addr, 2366 src_ill->ill_phys_addr_length, 2367 &sender6, &ncec->ncec_addr, 2368 NDP_UNICAST); 2369 } else { 2370 dropped = (arp_request(ncec, sender4, src_ill) == 0); 2371 mutex_exit(&ncec->ncec_lock); 2372 } 2373 if (!dropped) { 2374 mutex_enter(&ncec->ncec_lock); 2375 ncec->ncec_pcnt--; 2376 mutex_exit(&ncec->ncec_lock); 2377 } 2378 if (ip_debug > 3) { 2379 /* ip2dbg */ 2380 pr_addr_dbg("nce_timer: state for %s changed " 2381 "to PROBE\n", AF_INET6, &ncec->ncec_addr); 2382 } 2383 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2384 break; 2385 case ND_PROBE: 2386 /* must be retransmit timer */ 2387 ASSERT(ncec->ncec_pcnt >= -1); 2388 if (ncec->ncec_pcnt > 0) { 2389 /* 2390 * As per RFC2461, the ncec gets deleted after 2391 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. 2392 * Note that the first unicast solicitation is sent 2393 * during the DELAY state. 2394 */ 2395 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2396 ncec->ncec_pcnt, 2397 inet_ntop((isv6? 
AF_INET6 : AF_INET), 2398 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2399 if (NCE_PUBLISH(ncec)) { 2400 mutex_exit(&ncec->ncec_lock); 2401 /* 2402 * send out a probe; note that src_ill 2403 * is ignored by nce_dad() for all 2404 * DAD message types other than IPv6 2405 * unicast probes 2406 */ 2407 nce_dad(ncec, src_ill, B_TRUE); 2408 } else { 2409 ASSERT(src_ill != NULL); 2410 if (isv6) { 2411 mutex_exit(&ncec->ncec_lock); 2412 dropped = ndp_xmit(src_ill, 2413 ND_NEIGHBOR_SOLICIT, 2414 src_ill->ill_phys_addr, 2415 src_ill->ill_phys_addr_length, 2416 &sender6, &ncec->ncec_addr, 2417 NDP_UNICAST); 2418 } else { 2419 /* 2420 * since the nce is REACHABLE, 2421 * the ARP request will be sent out 2422 * as a link-layer unicast. 2423 */ 2424 dropped = (arp_request(ncec, sender4, 2425 src_ill) == 0); 2426 mutex_exit(&ncec->ncec_lock); 2427 } 2428 if (!dropped) { 2429 mutex_enter(&ncec->ncec_lock); 2430 ncec->ncec_pcnt--; 2431 mutex_exit(&ncec->ncec_lock); 2432 } 2433 nce_restart_timer(ncec, 2434 ill->ill_reachable_retrans_time); 2435 } 2436 } else if (ncec->ncec_pcnt < 0) { 2437 /* No hope, delete the ncec */ 2438 /* Tell datapath it went bad */ 2439 ncec->ncec_state = ND_UNREACHABLE; 2440 mutex_exit(&ncec->ncec_lock); 2441 if (ip_debug > 2) { 2442 /* ip1dbg */ 2443 pr_addr_dbg("nce_timer: Delete NCE for" 2444 " dst %s\n", (isv6? AF_INET6: AF_INET), 2445 &ncec->ncec_addr); 2446 } 2447 /* if static ARP can't delete. */ 2448 if ((ncec->ncec_flags & NCE_F_STATIC) == 0) 2449 ncec_delete(ncec); 2450 2451 } else if (!NCE_PUBLISH(ncec)) { 2452 /* 2453 * Probe count is 0 for a dynamic entry (one that we 2454 * ourselves are not publishing). We should never get 2455 * here if NONUD was requested, hence the ASSERT below. 
2456 */ 2457 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); 2458 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2459 ncec->ncec_pcnt, inet_ntop(AF_INET6, 2460 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2461 ncec->ncec_pcnt--; 2462 mutex_exit(&ncec->ncec_lock); 2463 /* Wait one interval before killing */ 2464 nce_restart_timer(ncec, 2465 ill->ill_reachable_retrans_time); 2466 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { 2467 ipif_t *ipif; 2468 ipaddr_t ncec_addr; 2469 2470 /* 2471 * We're done probing, and we can now declare this 2472 * address to be usable. Let IP know that it's ok to 2473 * use. 2474 */ 2475 ncec->ncec_state = ND_REACHABLE; 2476 ncec->ncec_flags &= ~NCE_F_UNVERIFIED; 2477 mutex_exit(&ncec->ncec_lock); 2478 if (isv6) { 2479 ipif = ipif_lookup_addr_exact_v6( 2480 &ncec->ncec_addr, ill, ipst); 2481 } else { 2482 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 2483 ncec_addr); 2484 ipif = ipif_lookup_addr_exact(ncec_addr, ill, 2485 ipst); 2486 } 2487 if (ipif != NULL) { 2488 if (ipif->ipif_was_dup) { 2489 char ibuf[LIFNAMSIZ]; 2490 char sbuf[INET6_ADDRSTRLEN]; 2491 2492 ipif->ipif_was_dup = B_FALSE; 2493 (void) inet_ntop(AF_INET6, 2494 &ipif->ipif_v6lcl_addr, 2495 sbuf, sizeof (sbuf)); 2496 ipif_get_name(ipif, ibuf, 2497 sizeof (ibuf)); 2498 cmn_err(CE_NOTE, "recovered address " 2499 "%s on %s", sbuf, ibuf); 2500 } 2501 if ((ipif->ipif_flags & IPIF_UP) && 2502 !ipif->ipif_addr_ready) 2503 ipif_up_notify(ipif); 2504 ipif->ipif_addr_ready = 1; 2505 ipif_refrele(ipif); 2506 } 2507 if (!isv6 && arp_no_defense) 2508 break; 2509 /* Begin defending our new address */ 2510 if (ncec->ncec_unsolicit_count > 0) { 2511 ncec->ncec_unsolicit_count--; 2512 if (isv6) { 2513 dropped = ndp_announce(ncec); 2514 } else { 2515 dropped = arp_announce(ncec); 2516 } 2517 2518 if (dropped) 2519 ncec->ncec_unsolicit_count++; 2520 else 2521 ncec->ncec_last_time_defended = 2522 ddi_get_lbolt(); 2523 } 2524 if (ncec->ncec_unsolicit_count > 0) { 2525 nce_restart_timer(ncec, 
2526 ANNOUNCE_INTERVAL(isv6)); 2527 } else if (DEFENSE_INTERVAL(isv6) != 0) { 2528 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2529 } 2530 } else { 2531 /* 2532 * This is an address we're probing to be our own, but 2533 * the ill is down. Wait until it comes back before 2534 * doing anything, but switch to reachable state so 2535 * that the restart will work. 2536 */ 2537 ncec->ncec_state = ND_REACHABLE; 2538 mutex_exit(&ncec->ncec_lock); 2539 } 2540 break; 2541 case ND_INCOMPLETE: { 2542 mblk_t *mp, *nextmp; 2543 mblk_t **prevmpp; 2544 2545 /* 2546 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp 2547 * for any IPMP probe packets, and toss them. IPMP probe 2548 * packets will always be at the head of ncec_qd_mp, so that 2549 * we can stop at the first queued ND packet that is 2550 * not a probe packet. 2551 */ 2552 prevmpp = &ncec->ncec_qd_mp; 2553 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { 2554 nextmp = mp->b_next; 2555 2556 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { 2557 inet_freemsg(mp); 2558 ncec->ncec_nprobes--; 2559 *prevmpp = nextmp; 2560 } else { 2561 prevmpp = &mp->b_next; 2562 } 2563 } 2564 2565 /* 2566 * Must be resolver's retransmit timer. 
2567 */ 2568 mutex_exit(&ncec->ncec_lock); 2569 ip_ndp_resolve(ncec); 2570 break; 2571 } 2572 case ND_REACHABLE: 2573 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && 2574 ncec->ncec_unsolicit_count != 0) || 2575 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { 2576 if (ncec->ncec_unsolicit_count > 0) { 2577 ncec->ncec_unsolicit_count--; 2578 mutex_exit(&ncec->ncec_lock); 2579 /* 2580 * When we get to zero announcements left, 2581 * switch to address defense 2582 */ 2583 } else { 2584 boolean_t rate_limit; 2585 2586 mutex_exit(&ncec->ncec_lock); 2587 rate_limit = ill_defend_rate_limit(ill, ncec); 2588 if (rate_limit) { 2589 nce_restart_timer(ncec, 2590 DEFENSE_INTERVAL(isv6)); 2591 break; 2592 } 2593 } 2594 if (isv6) { 2595 dropped = ndp_announce(ncec); 2596 } else { 2597 dropped = arp_announce(ncec); 2598 } 2599 mutex_enter(&ncec->ncec_lock); 2600 if (dropped) { 2601 ncec->ncec_unsolicit_count++; 2602 } else { 2603 ncec->ncec_last_time_defended = 2604 ddi_get_lbolt(); 2605 } 2606 mutex_exit(&ncec->ncec_lock); 2607 if (ncec->ncec_unsolicit_count != 0) { 2608 nce_restart_timer(ncec, 2609 ANNOUNCE_INTERVAL(isv6)); 2610 } else { 2611 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2612 } 2613 } else { 2614 mutex_exit(&ncec->ncec_lock); 2615 } 2616 break; 2617 default: 2618 mutex_exit(&ncec->ncec_lock); 2619 break; 2620 } 2621 done: 2622 ncec_refrele(ncec); 2623 ill_refrele(src_ill); 2624 } 2625 2626 /* 2627 * Set a link layer address from the ll_addr passed in. 2628 * Copy SAP from ill. 2629 */ 2630 static void 2631 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) 2632 { 2633 ill_t *ill = ncec->ncec_ill; 2634 2635 ASSERT(ll_addr != NULL); 2636 if (ill->ill_phys_addr_length > 0) { 2637 /* 2638 * The bcopy() below used to be called for the physical address 2639 * length rather than the link layer address length. For 2640 * ethernet and many other media, the phys_addr and lla are 2641 * identical. 
2642 * 2643 * The phys_addr and lla may not be the same for devices that 2644 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently 2645 * no known instances of these. 2646 * 2647 * For PPP or other interfaces with a zero length 2648 * physical address, don't do anything here. 2649 * The bcopy() with a zero phys_addr length was previously 2650 * a no-op for interfaces with a zero-length physical address. 2651 * Using the lla for them would change the way they operate. 2652 * Doing nothing in such cases preserves expected behavior. 2653 */ 2654 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); 2655 } 2656 } 2657 2658 boolean_t 2659 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, 2660 uint32_t ll_addr_len) 2661 { 2662 ASSERT(ncec->ncec_lladdr != NULL); 2663 if (ll_addr == NULL) 2664 return (B_FALSE); 2665 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) 2666 return (B_TRUE); 2667 return (B_FALSE); 2668 } 2669 2670 /* 2671 * Updates the link layer address or the reachability state of 2672 * a cache entry. Reset probe counter if needed. 2673 */ 2674 void 2675 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) 2676 { 2677 ill_t *ill = ncec->ncec_ill; 2678 boolean_t need_stop_timer = B_FALSE; 2679 boolean_t need_fastpath_update = B_FALSE; 2680 nce_t *nce = NULL; 2681 timeout_id_t tid; 2682 2683 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2684 /* 2685 * If this interface does not do NUD, there is no point 2686 * in allowing an update to the cache entry. Although 2687 * we will respond to NS. 2688 * The only time we accept an update for a resolver when 2689 * NUD is turned off is when it has just been created. 2690 * Non-Resolvers will always be created as REACHABLE. 
2691 */ 2692 if (new_state != ND_UNCHANGED) { 2693 if ((ncec->ncec_flags & NCE_F_NONUD) && 2694 (ncec->ncec_state != ND_INCOMPLETE)) 2695 return; 2696 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); 2697 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); 2698 need_stop_timer = B_TRUE; 2699 if (new_state == ND_REACHABLE) 2700 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64()); 2701 else { 2702 /* We force NUD in this case */ 2703 ncec->ncec_last = 0; 2704 } 2705 ncec->ncec_state = new_state; 2706 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2707 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || 2708 new_state == ND_INCOMPLETE); 2709 } 2710 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2711 tid = ncec->ncec_timeout_id; 2712 ncec->ncec_timeout_id = 0; 2713 } 2714 /* 2715 * Re-trigger fastpath probe and 2716 * overwrite the DL_UNITDATA_REQ data, noting we'll lose 2717 * whatever packets that happens to be transmitting at the time. 2718 */ 2719 if (new_ll_addr != NULL) { 2720 bcopy(new_ll_addr, ncec->ncec_lladdr, 2721 ill->ill_phys_addr_length); 2722 need_fastpath_update = B_TRUE; 2723 } 2724 mutex_exit(&ncec->ncec_lock); 2725 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2726 if (tid != 0) 2727 (void) untimeout(tid); 2728 } 2729 if (need_fastpath_update) { 2730 /* 2731 * Delete any existing existing dlur_mp and fp_mp information. 2732 * For IPMP interfaces, all underlying ill's must be checked 2733 * and purged. 
2734 */ 2735 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 2736 /* 2737 * add the new dlur_mp and fp_mp 2738 */ 2739 nce = nce_fastpath(ncec, B_TRUE, NULL); 2740 if (nce != NULL) 2741 nce_refrele(nce); 2742 } 2743 mutex_enter(&ncec->ncec_lock); 2744 } 2745 2746 static void 2747 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2748 { 2749 uint_t count = 0; 2750 mblk_t **mpp, *tmp; 2751 2752 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2753 2754 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { 2755 if (++count > ncec->ncec_ill->ill_max_buf) { 2756 tmp = ncec->ncec_qd_mp->b_next; 2757 ncec->ncec_qd_mp->b_next = NULL; 2758 /* 2759 * if we never create data addrs on the under_ill 2760 * does this matter? 2761 */ 2762 BUMP_MIB(ncec->ncec_ill->ill_ip_mib, 2763 ipIfStatsOutDiscards); 2764 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, 2765 ncec->ncec_ill); 2766 freemsg(ncec->ncec_qd_mp); 2767 ncec->ncec_qd_mp = tmp; 2768 } 2769 } 2770 2771 if (head_insert) { 2772 ncec->ncec_nprobes++; 2773 mp->b_next = ncec->ncec_qd_mp; 2774 ncec->ncec_qd_mp = mp; 2775 } else { 2776 *mpp = mp; 2777 } 2778 } 2779 2780 /* 2781 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be 2782 * queued at the head or tail of the queue based on the input argument 2783 * 'head_insert'. The caller should specify this argument as B_TRUE if this 2784 * packet is an IPMP probe packet, in which case the following happens: 2785 * 2786 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal 2787 * (non-ipmp_probe) load-speading case where the source address of the ND 2788 * packet is not tied to ncec_ill. If the ill bound to the source address 2789 * cannot receive, the response to the ND packet will not be received. 
2790 * However, if ND packets for ncec_ill's probes are queued behind that ND 2791 * packet, those probes will also fail to be sent, and thus in.mpathd will 2792 * erroneously conclude that ncec_ill has also failed. 2793 * 2794 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on 2795 * the first attempt. This ensures that ND problems do not manifest as 2796 * probe RTT spikes. 2797 * 2798 * We achieve this by inserting ipmp_probe() packets at the head of the 2799 * nce_queue. 2800 * 2801 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, 2802 * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 2803 */ 2804 void 2805 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2806 { 2807 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2808 nce_queue_mp_common(ncec, mp, head_insert); 2809 } 2810 2811 /* 2812 * Called when address resolution failed due to a timeout. 2813 * Send an ICMP unreachable in response to all queued packets. 2814 */ 2815 void 2816 ndp_resolv_failed(ncec_t *ncec) 2817 { 2818 mblk_t *mp, *nxt_mp; 2819 char buf[INET6_ADDRSTRLEN]; 2820 ill_t *ill = ncec->ncec_ill; 2821 ip_recv_attr_t iras; 2822 2823 bzero(&iras, sizeof (iras)); 2824 iras.ira_flags = 0; 2825 /* 2826 * we are setting the ira_rill to the ipmp_ill (instead of 2827 * the actual ill on which the packet was received), but this 2828 * is ok because we don't actually need the real ira_rill. 2829 * to send the icmp unreachable to the sender. 
2830 */ 2831 iras.ira_ill = iras.ira_rill = ill; 2832 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2833 iras.ira_rifindex = iras.ira_ruifindex; 2834 2835 ip1dbg(("ndp_resolv_failed: dst %s\n", 2836 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 2837 mutex_enter(&ncec->ncec_lock); 2838 mp = ncec->ncec_qd_mp; 2839 ncec->ncec_qd_mp = NULL; 2840 ncec->ncec_nprobes = 0; 2841 mutex_exit(&ncec->ncec_lock); 2842 while (mp != NULL) { 2843 nxt_mp = mp->b_next; 2844 mp->b_next = NULL; 2845 2846 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2847 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 2848 mp, ill); 2849 icmp_unreachable_v6(mp, 2850 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); 2851 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2852 mp = nxt_mp; 2853 } 2854 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 2855 } 2856 2857 /* 2858 * Handle the completion of NDP and ARP resolution. 2859 */ 2860 void 2861 nce_resolv_ok(ncec_t *ncec) 2862 { 2863 mblk_t *mp; 2864 uint_t pkt_len; 2865 iaflags_t ixaflags = IXAF_NO_TRACE; 2866 nce_t *nce; 2867 ill_t *ill = ncec->ncec_ill; 2868 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2869 ip_stack_t *ipst = ill->ill_ipst; 2870 2871 if (IS_IPMP(ncec->ncec_ill)) { 2872 nce_resolv_ipmp_ok(ncec); 2873 return; 2874 } 2875 /* non IPMP case */ 2876 2877 mutex_enter(&ncec->ncec_lock); 2878 ASSERT(ncec->ncec_nprobes == 0); 2879 mp = ncec->ncec_qd_mp; 2880 ncec->ncec_qd_mp = NULL; 2881 mutex_exit(&ncec->ncec_lock); 2882 2883 while (mp != NULL) { 2884 mblk_t *nxt_mp; 2885 2886 if (ill->ill_isv6) { 2887 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2888 2889 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 2890 } else { 2891 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2892 2893 ixaflags |= IXAF_IS_IPV4; 2894 pkt_len = ntohs(ipha->ipha_length); 2895 } 2896 nxt_mp = mp->b_next; 2897 mp->b_next = NULL; 2898 /* 2899 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no 2900 * longer available, but it's ok to 
drop this flag because TCP 2901 * has its own flow-control in effect, so TCP packets 2902 * are not likely to get here when flow-control is in effect. 2903 */ 2904 mutex_enter(&ill->ill_lock); 2905 nce = nce_lookup(ill, &ncec->ncec_addr); 2906 mutex_exit(&ill->ill_lock); 2907 2908 if (nce == NULL) { 2909 if (isv6) { 2910 BUMP_MIB(&ipst->ips_ip6_mib, 2911 ipIfStatsOutDiscards); 2912 } else { 2913 BUMP_MIB(&ipst->ips_ip_mib, 2914 ipIfStatsOutDiscards); 2915 } 2916 ip_drop_output("ipIfStatsOutDiscards - no nce", 2917 mp, NULL); 2918 freemsg(mp); 2919 } else { 2920 /* 2921 * We don't know the zoneid, but 2922 * ip_xmit does not care since IXAF_NO_TRACE 2923 * is set. (We traced the packet the first 2924 * time through ip_xmit.) 2925 */ 2926 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, 2927 ALL_ZONES, 0, NULL); 2928 nce_refrele(nce); 2929 } 2930 mp = nxt_mp; 2931 } 2932 2933 ncec_cb_dispatch(ncec); /* complete callbacks */ 2934 } 2935 2936 /* 2937 * Called by SIOCSNDP* ioctl to add/change an ncec entry 2938 * and the corresponding attributes. 2939 * Disallow states other than ND_REACHABLE or ND_STALE. 
2940 */ 2941 int 2942 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) 2943 { 2944 sin6_t *sin6; 2945 in6_addr_t *addr; 2946 ncec_t *ncec; 2947 nce_t *nce; 2948 int err = 0; 2949 uint16_t new_flags = 0; 2950 uint16_t old_flags = 0; 2951 int inflags = lnr->lnr_flags; 2952 ip_stack_t *ipst = ill->ill_ipst; 2953 boolean_t do_postprocess = B_FALSE; 2954 2955 ASSERT(ill->ill_isv6); 2956 if ((lnr->lnr_state_create != ND_REACHABLE) && 2957 (lnr->lnr_state_create != ND_STALE)) 2958 return (EINVAL); 2959 2960 sin6 = (sin6_t *)&lnr->lnr_addr; 2961 addr = &sin6->sin6_addr; 2962 2963 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 2964 ASSERT(!IS_UNDER_IPMP(ill)); 2965 nce = nce_lookup_addr(ill, addr); 2966 if (nce != NULL) 2967 new_flags = nce->nce_common->ncec_flags; 2968 2969 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { 2970 case NDF_ISROUTER_ON: 2971 new_flags |= NCE_F_ISROUTER; 2972 break; 2973 case NDF_ISROUTER_OFF: 2974 new_flags &= ~NCE_F_ISROUTER; 2975 break; 2976 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): 2977 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2978 if (nce != NULL) 2979 nce_refrele(nce); 2980 return (EINVAL); 2981 } 2982 if (inflags & NDF_STATIC) 2983 new_flags |= NCE_F_STATIC; 2984 2985 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { 2986 case NDF_ANYCAST_ON: 2987 new_flags |= NCE_F_ANYCAST; 2988 break; 2989 case NDF_ANYCAST_OFF: 2990 new_flags &= ~NCE_F_ANYCAST; 2991 break; 2992 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): 2993 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2994 if (nce != NULL) 2995 nce_refrele(nce); 2996 return (EINVAL); 2997 } 2998 2999 if (nce == NULL) { 3000 err = nce_add_v6(ill, 3001 (uchar_t *)lnr->lnr_hdw_addr, 3002 ill->ill_phys_addr_length, 3003 addr, 3004 new_flags, 3005 lnr->lnr_state_create, 3006 &nce); 3007 if (err != 0) { 3008 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3009 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); 3010 return (err); 3011 } else { 3012 do_postprocess = B_TRUE; 3013 } 3014 } 3015 ncec = nce->nce_common; 
3016 old_flags = ncec->ncec_flags; 3017 if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { 3018 ncec_router_to_host(ncec); 3019 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3020 if (do_postprocess) 3021 err = nce_add_v6_postprocess(nce); 3022 nce_refrele(nce); 3023 return (0); 3024 } 3025 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3026 3027 if (do_postprocess) 3028 err = nce_add_v6_postprocess(nce); 3029 /* 3030 * err cannot be anything other than 0 because we don't support 3031 * proxy arp of static addresses. 3032 */ 3033 ASSERT(err == 0); 3034 3035 mutex_enter(&ncec->ncec_lock); 3036 ncec->ncec_flags = new_flags; 3037 mutex_exit(&ncec->ncec_lock); 3038 /* 3039 * Note that we ignore the state at this point, which 3040 * should be either STALE or REACHABLE. Instead we let 3041 * the link layer address passed in to determine the state 3042 * much like incoming packets. 3043 */ 3044 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); 3045 nce_refrele(nce); 3046 return (0); 3047 } 3048 3049 /* 3050 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up 3051 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must 3052 * be held to ensure that they are in the same group. 3053 */ 3054 static nce_t * 3055 nce_fastpath_create(ill_t *ill, ncec_t *ncec) 3056 { 3057 3058 nce_t *nce; 3059 3060 nce = nce_ill_lookup_then_add(ill, ncec); 3061 3062 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3063 return (nce); 3064 3065 /* 3066 * hold the ncec_lock to synchronize with nce_update() so that, 3067 * at the end of this function, the contents of nce_dlur_mp are 3068 * consistent with ncec->ncec_lladdr, even though some intermediate 3069 * packet may have been sent out with a mangled address, which would 3070 * only be a transient condition. 
3071 */ 3072 mutex_enter(&ncec->ncec_lock); 3073 if (ncec->ncec_lladdr != NULL) { 3074 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr + 3075 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length); 3076 } else { 3077 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap, 3078 ill->ill_sap_length); 3079 } 3080 mutex_exit(&ncec->ncec_lock); 3081 return (nce); 3082 } 3083 3084 /* 3085 * we make nce_fp_mp to have an M_DATA prepend. 3086 * The caller ensures there is hold on ncec for this function. 3087 * Note that since ill_fastpath_probe() copies the mblk there is 3088 * no need to hold the nce or ncec beyond this function. 3089 * 3090 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that 3091 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill 3092 * and will be returned back by this function, so that no extra nce_refrele 3093 * is required for the caller. The calls from nce_add_common() use this 3094 * method. All other callers (that pass in NULL ncec_nce) will have to do a 3095 * nce_refrele of the returned nce (when it is non-null). 3096 */ 3097 nce_t * 3098 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) 3099 { 3100 nce_t *nce; 3101 ill_t *ill = ncec->ncec_ill; 3102 3103 ASSERT(ill != NULL); 3104 3105 if (IS_IPMP(ill) && trigger_fp_req) { 3106 trigger_fp_req = B_FALSE; 3107 ipmp_ncec_refresh_nce(ncec); 3108 } 3109 3110 /* 3111 * If the caller already has the nce corresponding to the ill, use 3112 * that one. Otherwise we have to lookup/add the nce. Calls from 3113 * nce_add_common() fall in the former category, and have just done 3114 * the nce lookup/add that can be reused. 
3115 */ 3116 if (ncec_nce == NULL) 3117 nce = nce_fastpath_create(ill, ncec); 3118 else 3119 nce = ncec_nce; 3120 3121 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3122 return (nce); 3123 3124 if (trigger_fp_req) 3125 nce_fastpath_trigger(nce); 3126 return (nce); 3127 } 3128 3129 /* 3130 * Trigger fastpath on nce. No locks may be held. 3131 */ 3132 static void 3133 nce_fastpath_trigger(nce_t *nce) 3134 { 3135 int res; 3136 ill_t *ill = nce->nce_ill; 3137 ncec_t *ncec = nce->nce_common; 3138 3139 res = ill_fastpath_probe(ill, nce->nce_dlur_mp); 3140 /* 3141 * EAGAIN is an indication of a transient error 3142 * i.e. allocation failure etc. leave the ncec in the list it 3143 * will be updated when another probe happens for another ire 3144 * if not it will be taken out of the list when the ire is 3145 * deleted. 3146 */ 3147 if (res != 0 && res != EAGAIN && res != ENOTSUP) 3148 nce_fastpath_list_delete(ill, ncec, NULL); 3149 } 3150 3151 /* 3152 * Add ncec to the nce fastpath list on ill. 3153 */ 3154 static nce_t * 3155 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec) 3156 { 3157 nce_t *nce = NULL; 3158 3159 ASSERT(MUTEX_HELD(&ill->ill_lock)); 3160 /* 3161 * Atomically ensure that the ill is not CONDEMNED and is not going 3162 * down, before adding the NCE. 3163 */ 3164 if (ill->ill_state_flags & ILL_CONDEMNED) 3165 return (NULL); 3166 mutex_enter(&ncec->ncec_lock); 3167 /* 3168 * if ncec has not been deleted and 3169 * is not already in the list add it. 
3170 */ 3171 if (!NCE_ISCONDEMNED(ncec)) { 3172 nce = nce_lookup(ill, &ncec->ncec_addr); 3173 if (nce != NULL) 3174 goto done; 3175 nce = nce_add(ill, ncec); 3176 } 3177 done: 3178 mutex_exit(&ncec->ncec_lock); 3179 return (nce); 3180 } 3181 3182 nce_t * 3183 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) 3184 { 3185 nce_t *nce; 3186 3187 mutex_enter(&ill->ill_lock); 3188 nce = nce_ill_lookup_then_add_locked(ill, ncec); 3189 mutex_exit(&ill->ill_lock); 3190 return (nce); 3191 } 3192 3193 3194 /* 3195 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted 3196 * nce is added to the 'dead' list, and the caller must nce_refrele() the 3197 * entry after all locks have been dropped. 3198 */ 3199 void 3200 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead) 3201 { 3202 nce_t *nce; 3203 3204 ASSERT(ill != NULL); 3205 3206 /* delete any nces referencing the ncec from underlying ills */ 3207 if (IS_IPMP(ill)) 3208 ipmp_ncec_delete_nce(ncec); 3209 3210 /* now the ill itself */ 3211 mutex_enter(&ill->ill_lock); 3212 for (nce = list_head(&ill->ill_nce); nce != NULL; 3213 nce = list_next(&ill->ill_nce, nce)) { 3214 if (nce->nce_common == ncec) { 3215 nce_refhold(nce); 3216 nce_delete(nce); 3217 break; 3218 } 3219 } 3220 mutex_exit(&ill->ill_lock); 3221 if (nce != NULL) { 3222 if (dead == NULL) 3223 nce_refrele(nce); 3224 else 3225 list_insert_tail(dead, nce); 3226 } 3227 } 3228 3229 /* 3230 * when the fastpath response does not fit in the datab 3231 * associated with the existing nce_fp_mp, we delete and 3232 * add the nce to retrigger fastpath based on the information 3233 * in the ncec_t. 
3234 */ 3235 static nce_t * 3236 nce_delete_then_add(nce_t *nce) 3237 { 3238 ill_t *ill = nce->nce_ill; 3239 nce_t *newnce = NULL; 3240 3241 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3242 (void *)nce, ill->ill_name)); 3243 mutex_enter(&ill->ill_lock); 3244 mutex_enter(&nce->nce_common->ncec_lock); 3245 nce_delete(nce); 3246 /* 3247 * Make sure that ncec is not condemned before adding. We hold the 3248 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3249 * ipmp_ncec_delete_nce() 3250 */ 3251 if (!NCE_ISCONDEMNED(nce->nce_common)) 3252 newnce = nce_add(ill, nce->nce_common); 3253 mutex_exit(&nce->nce_common->ncec_lock); 3254 mutex_exit(&ill->ill_lock); 3255 nce_refrele(nce); 3256 return (newnce); /* could be null if nomem */ 3257 } 3258 3259 typedef struct nce_fp_match_s { 3260 nce_t *nce_fp_match_res; 3261 mblk_t *nce_fp_match_ack_mp; 3262 } nce_fp_match_t; 3263 3264 /* ARGSUSED */ 3265 static int 3266 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3267 { 3268 nce_fp_match_t *nce_fp_marg = arg; 3269 ncec_t *ncec = nce->nce_common; 3270 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3271 uchar_t *mp_rptr, *ud_mp_rptr; 3272 mblk_t *ud_mp = nce->nce_dlur_mp; 3273 ptrdiff_t cmplen; 3274 3275 /* 3276 * mp is the mp associated with the fastpath ack. 3277 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3278 * under consideration. If the contents match, then the 3279 * fastpath ack is used to update the nce. 3280 */ 3281 if (ud_mp == NULL) 3282 return (0); 3283 mp_rptr = mp->b_rptr; 3284 cmplen = mp->b_wptr - mp_rptr; 3285 ASSERT(cmplen >= 0); 3286 3287 ud_mp_rptr = ud_mp->b_rptr; 3288 /* 3289 * The ncec is locked here to prevent any other threads from accessing 3290 * and changing nce_dlur_mp when the address becomes resolved to an 3291 * lla while we're in the middle of looking at and comparing the 3292 * hardware address (lla). 
It is also locked to prevent multiple 3293 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3294 * time. 3295 */ 3296 mutex_enter(&ncec->ncec_lock); 3297 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3298 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3299 nce_fp_marg->nce_fp_match_res = nce; 3300 mutex_exit(&ncec->ncec_lock); 3301 nce_refhold(nce); 3302 return (1); 3303 } 3304 mutex_exit(&ncec->ncec_lock); 3305 return (0); 3306 } 3307 3308 /* 3309 * Update all NCE's that are not in fastpath mode and 3310 * have an nce_fp_mp that matches mp. mp->b_cont contains 3311 * the fastpath header. 3312 * 3313 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3314 */ 3315 void 3316 nce_fastpath_update(ill_t *ill, mblk_t *mp) 3317 { 3318 nce_fp_match_t nce_fp_marg; 3319 nce_t *nce; 3320 mblk_t *nce_fp_mp, *fp_mp; 3321 3322 nce_fp_marg.nce_fp_match_res = NULL; 3323 nce_fp_marg.nce_fp_match_ack_mp = mp; 3324 3325 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); 3326 3327 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) 3328 return; 3329 3330 mutex_enter(&nce->nce_lock); 3331 nce_fp_mp = nce->nce_fp_mp; 3332 3333 if (nce_fp_mp != NULL) { 3334 fp_mp = mp->b_cont; 3335 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > 3336 nce_fp_mp->b_datap->db_lim) { 3337 mutex_exit(&nce->nce_lock); 3338 nce = nce_delete_then_add(nce); 3339 if (nce == NULL) { 3340 return; 3341 } 3342 mutex_enter(&nce->nce_lock); 3343 nce_fp_mp = nce->nce_fp_mp; 3344 } 3345 } 3346 3347 /* Matched - install mp as the fastpath mp */ 3348 if (nce_fp_mp == NULL) { 3349 fp_mp = dupb(mp->b_cont); 3350 nce->nce_fp_mp = fp_mp; 3351 } else { 3352 fp_mp = mp->b_cont; 3353 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); 3354 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr 3355 + MBLKL(fp_mp); 3356 } 3357 mutex_exit(&nce->nce_lock); 3358 nce_refrele(nce); 3359 } 3360 3361 /* 3362 * Return a pointer to a given option in the packet. 
3363 * Assumes that option part of the packet have already been validated. 3364 */ 3365 nd_opt_hdr_t * 3366 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) 3367 { 3368 while (optlen > 0) { 3369 if (opt->nd_opt_type == opt_type) 3370 return (opt); 3371 optlen -= 8 * opt->nd_opt_len; 3372 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3373 } 3374 return (NULL); 3375 } 3376 3377 /* 3378 * Verify all option lengths present are > 0, also check to see 3379 * if the option lengths and packet length are consistent. 3380 */ 3381 boolean_t 3382 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) 3383 { 3384 ASSERT(opt != NULL); 3385 while (optlen > 0) { 3386 if (opt->nd_opt_len == 0) 3387 return (B_FALSE); 3388 optlen -= 8 * opt->nd_opt_len; 3389 if (optlen < 0) 3390 return (B_FALSE); 3391 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3392 } 3393 return (B_TRUE); 3394 } 3395 3396 /* 3397 * ncec_walk function. 3398 * Free a fraction of the NCE cache entries. 3399 * 3400 * A possible optimization here would be to use ncec_last where possible, and 3401 * delete the least-frequently used entry, which would require more complex 3402 * computation as we walk through the ncec's (e.g., track ncec entries by 3403 * order of ncec_last and/or maintain state) 3404 */ 3405 static void 3406 ncec_cache_reclaim(ncec_t *ncec, char *arg) 3407 { 3408 ip_stack_t *ipst = ncec->ncec_ipst; 3409 uint_t fraction = *(uint_t *)arg; 3410 uint_t rand; 3411 3412 if ((ncec->ncec_flags & 3413 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { 3414 return; 3415 } 3416 3417 rand = (uint_t)ddi_get_lbolt() + 3418 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); 3419 if ((rand/fraction)*fraction == rand) { 3420 IP_STAT(ipst, ip_nce_reclaim_deleted); 3421 ncec_delete(ncec); 3422 } 3423 } 3424 3425 /* 3426 * kmem_cache callback to free up memory. 3427 * 3428 * For now we just delete a fixed fraction. 
3429 */ 3430 static void 3431 ip_nce_reclaim_stack(ip_stack_t *ipst) 3432 { 3433 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction; 3434 3435 IP_STAT(ipst, ip_nce_reclaim_calls); 3436 3437 ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst); 3438 3439 /* 3440 * Walk all CONNs that can have a reference on an ire, ncec or dce. 3441 * Get them to update any stale references to drop any refholds they 3442 * have. 3443 */ 3444 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 3445 } 3446 3447 /* 3448 * Called by the memory allocator subsystem directly, when the system 3449 * is running low on memory. 3450 */ 3451 /* ARGSUSED */ 3452 void 3453 ip_nce_reclaim(void *args) 3454 { 3455 netstack_handle_t nh; 3456 netstack_t *ns; 3457 ip_stack_t *ipst; 3458 3459 netstack_next_init(&nh); 3460 while ((ns = netstack_next(&nh)) != NULL) { 3461 /* 3462 * netstack_next() can return a netstack_t with a NULL 3463 * netstack_ip at boot time. 3464 */ 3465 if ((ipst = ns->netstack_ip) == NULL) { 3466 netstack_rele(ns); 3467 continue; 3468 } 3469 ip_nce_reclaim_stack(ipst); 3470 netstack_rele(ns); 3471 } 3472 netstack_next_fini(&nh); 3473 } 3474 3475 #ifdef DEBUG 3476 void 3477 ncec_trace_ref(ncec_t *ncec) 3478 { 3479 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3480 3481 if (ncec->ncec_trace_disable) 3482 return; 3483 3484 if (!th_trace_ref(ncec, ncec->ncec_ipst)) { 3485 ncec->ncec_trace_disable = B_TRUE; 3486 ncec_trace_cleanup(ncec); 3487 } 3488 } 3489 3490 void 3491 ncec_untrace_ref(ncec_t *ncec) 3492 { 3493 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3494 3495 if (!ncec->ncec_trace_disable) 3496 th_trace_unref(ncec); 3497 } 3498 3499 static void 3500 ncec_trace_cleanup(const ncec_t *ncec) 3501 { 3502 th_trace_cleanup(ncec, ncec->ncec_trace_disable); 3503 } 3504 #endif 3505 3506 /* 3507 * Called when address resolution fails due to a timeout. 3508 * Send an ICMP unreachable in response to all queued packets. 
3509 */ 3510 void 3511 arp_resolv_failed(ncec_t *ncec) 3512 { 3513 mblk_t *mp, *nxt_mp; 3514 char buf[INET6_ADDRSTRLEN]; 3515 struct in_addr ipv4addr; 3516 ill_t *ill = ncec->ncec_ill; 3517 ip_stack_t *ipst = ncec->ncec_ipst; 3518 ip_recv_attr_t iras; 3519 3520 bzero(&iras, sizeof (iras)); 3521 iras.ira_flags = IRAF_IS_IPV4; 3522 /* 3523 * we are setting the ira_rill to the ipmp_ill (instead of 3524 * the actual ill on which the packet was received), but this 3525 * is ok because we don't actually need the real ira_rill. 3526 * to send the icmp unreachable to the sender. 3527 */ 3528 iras.ira_ill = iras.ira_rill = ill; 3529 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3530 iras.ira_rifindex = iras.ira_ruifindex; 3531 3532 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr); 3533 ip3dbg(("arp_resolv_failed: dst %s\n", 3534 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); 3535 mutex_enter(&ncec->ncec_lock); 3536 mp = ncec->ncec_qd_mp; 3537 ncec->ncec_qd_mp = NULL; 3538 ncec->ncec_nprobes = 0; 3539 mutex_exit(&ncec->ncec_lock); 3540 while (mp != NULL) { 3541 nxt_mp = mp->b_next; 3542 mp->b_next = NULL; 3543 3544 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3545 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 3546 mp, ill); 3547 if (ipst->ips_ip_arp_icmp_error) { 3548 ip3dbg(("arp_resolv_failed: " 3549 "Calling icmp_unreachable\n")); 3550 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 3551 } else { 3552 freemsg(mp); 3553 } 3554 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3555 mp = nxt_mp; 3556 } 3557 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 3558 } 3559 3560 /* 3561 * if ill is an under_ill, translate it to the ipmp_ill and add the 3562 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and 3563 * one on the underlying in_ill) will be created for the 3564 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill. 
3565 */ 3566 int 3567 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3568 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3569 { 3570 int err; 3571 in6_addr_t addr6; 3572 ip_stack_t *ipst = ill->ill_ipst; 3573 nce_t *nce, *upper_nce = NULL; 3574 ill_t *in_ill = ill, *under = NULL; 3575 boolean_t need_ill_refrele = B_FALSE; 3576 3577 if (flags & NCE_F_MCAST) { 3578 /* 3579 * hw_addr will be figured out in nce_set_multicast_v4; 3580 * caller needs to pass in the cast_ill for ipmp 3581 */ 3582 ASSERT(hw_addr == NULL); 3583 ASSERT(!IS_IPMP(ill)); 3584 err = nce_set_multicast_v4(ill, addr, flags, newnce); 3585 return (err); 3586 } 3587 3588 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 3589 ill = ipmp_ill_hold_ipmp_ill(ill); 3590 if (ill == NULL) 3591 return (ENXIO); 3592 need_ill_refrele = B_TRUE; 3593 } 3594 if ((flags & NCE_F_BCAST) != 0) { 3595 /* 3596 * IPv4 broadcast ncec: compute the hwaddr. 3597 */ 3598 if (IS_IPMP(ill)) { 3599 under = ipmp_ill_hold_xmit_ill(ill, B_FALSE); 3600 if (under == NULL) { 3601 if (need_ill_refrele) 3602 ill_refrele(ill); 3603 return (ENETDOWN); 3604 } 3605 hw_addr = under->ill_bcast_mp->b_rptr + 3606 NCE_LL_ADDR_OFFSET(under); 3607 hw_addr_len = under->ill_phys_addr_length; 3608 } else { 3609 hw_addr = ill->ill_bcast_mp->b_rptr + 3610 NCE_LL_ADDR_OFFSET(ill), 3611 hw_addr_len = ill->ill_phys_addr_length; 3612 } 3613 } 3614 3615 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3616 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3617 nce = nce_lookup_addr(ill, &addr6); 3618 if (nce == NULL) { 3619 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, 3620 state, &nce); 3621 } else { 3622 err = EEXIST; 3623 } 3624 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3625 if (err == 0) 3626 err = nce_add_v4_postprocess(nce); 3627 3628 if (in_ill != ill && nce != NULL) { 3629 nce_t *under_nce = NULL; 3630 3631 /* 3632 * in_ill was the under_ill. Try to create the under_nce. 
3633 * Hold the ill_g_lock to prevent changes to group membership 3634 * until we are done. 3635 */ 3636 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3637 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 3638 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 3639 ill_t *, ill); 3640 rw_exit(&ipst->ips_ill_g_lock); 3641 err = ENXIO; 3642 nce_refrele(nce); 3643 nce = NULL; 3644 goto bail; 3645 } 3646 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 3647 if (under_nce == NULL) { 3648 rw_exit(&ipst->ips_ill_g_lock); 3649 err = EINVAL; 3650 nce_refrele(nce); 3651 nce = NULL; 3652 goto bail; 3653 } 3654 rw_exit(&ipst->ips_ill_g_lock); 3655 upper_nce = nce; 3656 nce = under_nce; /* will be returned to caller */ 3657 if (NCE_ISREACHABLE(nce->nce_common)) 3658 nce_fastpath_trigger(under_nce); 3659 } 3660 if (nce != NULL) { 3661 if (newnce != NULL) 3662 *newnce = nce; 3663 else 3664 nce_refrele(nce); 3665 } 3666 bail: 3667 if (under != NULL) 3668 ill_refrele(under); 3669 if (upper_nce != NULL) 3670 nce_refrele(upper_nce); 3671 if (need_ill_refrele) 3672 ill_refrele(ill); 3673 3674 return (err); 3675 } 3676 3677 /* 3678 * NDP Cache Entry creation routine for IPv4. 3679 * This routine must always be called with ndp4->ndp_g_lock held. 3680 * Prior to return, ncec_refcnt is incremented. 3681 * 3682 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 3683 * are always added pointing at the ipmp_ill. Thus, when the ill passed 3684 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 3685 * entries will be created, both pointing at the same ncec_t. The nce_t 3686 * entries will have their nce_ill set to the ipmp_ill and the under_ill 3687 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 3688 * Local addresses are always created on the ill passed to nce_add_v4. 
3689 */ 3690 int 3691 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3692 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3693 { 3694 int err; 3695 boolean_t is_multicast = (flags & NCE_F_MCAST); 3696 struct in6_addr addr6; 3697 nce_t *nce; 3698 3699 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 3700 ASSERT(!ill->ill_isv6); 3701 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); 3702 3703 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3704 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, 3705 &nce); 3706 ASSERT(newnce != NULL); 3707 *newnce = nce; 3708 return (err); 3709 } 3710 3711 /* 3712 * Post-processing routine to be executed after nce_add_v4(). This function 3713 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 3714 * and must be called without any locks held. 3715 * 3716 * Always returns 0, but we return an int to keep this symmetric with the 3717 * IPv6 counter-part. 3718 */ 3719 int 3720 nce_add_v4_postprocess(nce_t *nce) 3721 { 3722 ncec_t *ncec = nce->nce_common; 3723 uint16_t flags = ncec->ncec_flags; 3724 boolean_t ndp_need_dad = B_FALSE; 3725 boolean_t dropped; 3726 clock_t delay; 3727 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; 3728 uchar_t *hw_addr = ncec->ncec_lladdr; 3729 boolean_t trigger_fastpath = B_TRUE; 3730 3731 /* 3732 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 3733 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
3734 * We call nce_fastpath from nce_update if the link layer address of 3735 * the peer changes from nce_update 3736 */ 3737 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && 3738 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) 3739 trigger_fastpath = B_FALSE; 3740 3741 if (trigger_fastpath) 3742 nce_fastpath_trigger(nce); 3743 3744 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 3745 /* 3746 * Either the caller (by passing in ND_PROBE) 3747 * or nce_add_common() (by the internally computed state 3748 * based on ncec_addr and ill_net_type) has determined 3749 * that this unicast entry needs DAD. Trigger DAD. 3750 */ 3751 ndp_need_dad = B_TRUE; 3752 } else if (flags & NCE_F_UNSOL_ADV) { 3753 /* 3754 * We account for the transmit below by assigning one 3755 * less than the ndd variable. Subsequent decrements 3756 * are done in nce_timer. 3757 */ 3758 mutex_enter(&ncec->ncec_lock); 3759 ncec->ncec_unsolicit_count = 3760 ipst->ips_ip_arp_publish_count - 1; 3761 mutex_exit(&ncec->ncec_lock); 3762 dropped = arp_announce(ncec); 3763 mutex_enter(&ncec->ncec_lock); 3764 if (dropped) 3765 ncec->ncec_unsolicit_count++; 3766 else 3767 ncec->ncec_last_time_defended = ddi_get_lbolt(); 3768 if (ncec->ncec_unsolicit_count != 0) { 3769 nce_start_timer(ncec, 3770 ipst->ips_ip_arp_publish_interval); 3771 } 3772 mutex_exit(&ncec->ncec_lock); 3773 } 3774 3775 /* 3776 * If ncec_xmit_interval is 0, user has configured us to send the first 3777 * probe right away. Do so, and set up for the subsequent probes. 3778 */ 3779 if (ndp_need_dad) { 3780 mutex_enter(&ncec->ncec_lock); 3781 if (ncec->ncec_pcnt == 0) { 3782 /* 3783 * DAD probes and announce can be 3784 * administratively disabled by setting the 3785 * probe_count to zero. Restart the timer in 3786 * this case to mark the ipif as ready. 
3787 */ 3788 ncec->ncec_unsolicit_count = 0; 3789 mutex_exit(&ncec->ncec_lock); 3790 nce_restart_timer(ncec, 0); 3791 } else { 3792 mutex_exit(&ncec->ncec_lock); 3793 delay = ((ncec->ncec_flags & NCE_F_FAST) ? 3794 ipst->ips_arp_probe_delay : 3795 ipst->ips_arp_fastprobe_delay); 3796 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); 3797 } 3798 } 3799 return (0); 3800 } 3801 3802 /* 3803 * ncec_walk routine to update all entries that have a given destination or 3804 * gateway address and cached link layer (MAC) address. This is used when ARP 3805 * informs us that a network-to-link-layer mapping may have changed. 3806 */ 3807 void 3808 nce_update_hw_changed(ncec_t *ncec, void *arg) 3809 { 3810 nce_hw_map_t *hwm = arg; 3811 ipaddr_t ncec_addr; 3812 3813 if (ncec->ncec_state != ND_REACHABLE) 3814 return; 3815 3816 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 3817 if (ncec_addr != hwm->hwm_addr) 3818 return; 3819 3820 mutex_enter(&ncec->ncec_lock); 3821 if (hwm->hwm_flags != 0) 3822 ncec->ncec_flags = hwm->hwm_flags; 3823 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); 3824 mutex_exit(&ncec->ncec_lock); 3825 } 3826 3827 void 3828 ncec_refhold(ncec_t *ncec) 3829 { 3830 mutex_enter(&(ncec)->ncec_lock); 3831 (ncec)->ncec_refcnt++; 3832 ASSERT((ncec)->ncec_refcnt != 0); 3833 #ifdef DEBUG 3834 ncec_trace_ref(ncec); 3835 #endif 3836 mutex_exit(&(ncec)->ncec_lock); 3837 } 3838 3839 void 3840 ncec_refhold_notr(ncec_t *ncec) 3841 { 3842 mutex_enter(&(ncec)->ncec_lock); 3843 (ncec)->ncec_refcnt++; 3844 ASSERT((ncec)->ncec_refcnt != 0); 3845 mutex_exit(&(ncec)->ncec_lock); 3846 } 3847 3848 static void 3849 ncec_refhold_locked(ncec_t *ncec) 3850 { 3851 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); 3852 (ncec)->ncec_refcnt++; 3853 #ifdef DEBUG 3854 ncec_trace_ref(ncec); 3855 #endif 3856 } 3857 3858 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */ 3859 void 3860 ncec_refrele(ncec_t *ncec) 3861 { 3862 mutex_enter(&(ncec)->ncec_lock); 3863 #ifdef DEBUG 3864 
ncec_untrace_ref(ncec); 3865 #endif 3866 ASSERT((ncec)->ncec_refcnt != 0); 3867 if (--(ncec)->ncec_refcnt == 0) { 3868 ncec_inactive(ncec); 3869 } else { 3870 mutex_exit(&(ncec)->ncec_lock); 3871 } 3872 } 3873 3874 void 3875 ncec_refrele_notr(ncec_t *ncec) 3876 { 3877 mutex_enter(&(ncec)->ncec_lock); 3878 ASSERT((ncec)->ncec_refcnt != 0); 3879 if (--(ncec)->ncec_refcnt == 0) { 3880 ncec_inactive(ncec); 3881 } else { 3882 mutex_exit(&(ncec)->ncec_lock); 3883 } 3884 } 3885 3886 /* 3887 * Common to IPv4 and IPv6. 3888 */ 3889 void 3890 nce_restart_timer(ncec_t *ncec, uint_t ms) 3891 { 3892 timeout_id_t tid; 3893 3894 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock)); 3895 3896 /* First cancel any running timer */ 3897 mutex_enter(&ncec->ncec_lock); 3898 tid = ncec->ncec_timeout_id; 3899 ncec->ncec_timeout_id = 0; 3900 if (tid != 0) { 3901 mutex_exit(&ncec->ncec_lock); 3902 (void) untimeout(tid); 3903 mutex_enter(&ncec->ncec_lock); 3904 } 3905 3906 /* Restart timer */ 3907 nce_start_timer(ncec, ms); 3908 mutex_exit(&ncec->ncec_lock); 3909 } 3910 3911 static void 3912 nce_start_timer(ncec_t *ncec, uint_t ms) 3913 { 3914 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3915 /* 3916 * Don't start the timer if the ncec has been deleted, or if the timer 3917 * is already running 3918 */ 3919 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) { 3920 ncec->ncec_timeout_id = timeout(nce_timer, ncec, 3921 MSEC_TO_TICK(ms) == 0 ? 
1 : MSEC_TO_TICK(ms)); 3922 } 3923 } 3924 3925 int 3926 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 3927 uint16_t flags, nce_t **newnce) 3928 { 3929 uchar_t *hw_addr; 3930 int err = 0; 3931 ip_stack_t *ipst = ill->ill_ipst; 3932 in6_addr_t dst6; 3933 nce_t *nce; 3934 3935 ASSERT(!ill->ill_isv6); 3936 3937 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6); 3938 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3939 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) { 3940 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3941 goto done; 3942 } 3943 if (ill->ill_net_type == IRE_IF_RESOLVER) { 3944 /* 3945 * For IRE_IF_RESOLVER a hardware mapping can be 3946 * generated, for IRE_IF_NORESOLVER, resolution cookie 3947 * in the ill is copied in nce_add_v4(). 3948 */ 3949 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP); 3950 if (hw_addr == NULL) { 3951 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3952 return (ENOMEM); 3953 } 3954 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 3955 } else { 3956 /* 3957 * IRE_IF_NORESOLVER type simply copies the resolution 3958 * cookie passed in. So no hw_addr is needed. 3959 */ 3960 hw_addr = NULL; 3961 } 3962 ASSERT(flags & NCE_F_MCAST); 3963 ASSERT(flags & NCE_F_NONUD); 3964 /* nce_state will be computed by nce_add_common() */ 3965 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 3966 ND_UNCHANGED, &nce); 3967 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3968 if (err == 0) 3969 err = nce_add_v4_postprocess(nce); 3970 if (hw_addr != NULL) 3971 kmem_free(hw_addr, ill->ill_phys_addr_length); 3972 if (err != 0) { 3973 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); 3974 return (err); 3975 } 3976 done: 3977 if (newnce != NULL) 3978 *newnce = nce; 3979 else 3980 nce_refrele(nce); 3981 return (0); 3982 } 3983 3984 /* 3985 * This is used when scanning for "old" (least recently broadcast) NCEs. We 3986 * don't want to have to walk the list for every single one, so we gather up 3987 * batches at a time. 
3988 */ 3989 #define NCE_RESCHED_LIST_LEN 8 3990 3991 typedef struct { 3992 ill_t *ncert_ill; 3993 uint_t ncert_num; 3994 ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN]; 3995 } nce_resched_t; 3996 3997 /* 3998 * Pick the longest waiting NCEs for defense. 3999 */ 4000 /* ARGSUSED */ 4001 static int 4002 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg) 4003 { 4004 nce_resched_t *ncert = arg; 4005 ncec_t **ncecs; 4006 ncec_t **ncec_max; 4007 ncec_t *ncec_temp; 4008 ncec_t *ncec = nce->nce_common; 4009 4010 ASSERT(ncec->ncec_ill == ncert->ncert_ill); 4011 /* 4012 * Only reachable entries that are ready for announcement are eligible. 4013 */ 4014 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE) 4015 return (0); 4016 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) { 4017 ncec_refhold(ncec); 4018 ncert->ncert_nces[ncert->ncert_num++] = ncec; 4019 } else { 4020 ncecs = ncert->ncert_nces; 4021 ncec_max = ncecs + NCE_RESCHED_LIST_LEN; 4022 ncec_refhold(ncec); 4023 for (; ncecs < ncec_max; ncecs++) { 4024 ASSERT(ncec != NULL); 4025 if ((*ncecs)->ncec_last_time_defended > 4026 ncec->ncec_last_time_defended) { 4027 ncec_temp = *ncecs; 4028 *ncecs = ncec; 4029 ncec = ncec_temp; 4030 } 4031 } 4032 ncec_refrele(ncec); 4033 } 4034 return (0); 4035 } 4036 4037 /* 4038 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this 4039 * doesn't happen very often (if at all), and thus it needn't be highly 4040 * optimized. (Note, though, that it's actually O(N) complexity, because the 4041 * outer loop is bounded by a constant rather than by the length of the list.) 
4042 */ 4043 static void 4044 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert) 4045 { 4046 ncec_t *ncec; 4047 ip_stack_t *ipst = ill->ill_ipst; 4048 uint_t i, defend_rate; 4049 4050 i = ill->ill_defend_count; 4051 ill->ill_defend_count = 0; 4052 if (ill->ill_isv6) 4053 defend_rate = ipst->ips_ndp_defend_rate; 4054 else 4055 defend_rate = ipst->ips_arp_defend_rate; 4056 /* If none could be sitting around, then don't reschedule */ 4057 if (i < defend_rate) { 4058 DTRACE_PROBE1(reschedule_none, ill_t *, ill); 4059 return; 4060 } 4061 ncert->ncert_ill = ill; 4062 while (ill->ill_defend_count < defend_rate) { 4063 nce_walk_common(ill, ncec_reschedule, ncert); 4064 for (i = 0; i < ncert->ncert_num; i++) { 4065 4066 ncec = ncert->ncert_nces[i]; 4067 mutex_enter(&ncec->ncec_lock); 4068 ncec->ncec_flags |= NCE_F_DELAYED; 4069 mutex_exit(&ncec->ncec_lock); 4070 /* 4071 * we plan to schedule this ncec, so incr the 4072 * defend_count in anticipation. 4073 */ 4074 if (++ill->ill_defend_count >= defend_rate) 4075 break; 4076 } 4077 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) 4078 break; 4079 } 4080 } 4081 4082 /* 4083 * Check if the current rate-limiting parameters permit the sending 4084 * of another address defense announcement for both IPv4 and IPv6. 4085 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not 4086 * permitted), and B_FALSE otherwise. The `defend_rate' parameter 4087 * determines how many address defense announcements are permitted 4088 * in any `defense_perio' interval. 
4089 */ 4090 static boolean_t 4091 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec) 4092 { 4093 clock_t now = ddi_get_lbolt(); 4094 ip_stack_t *ipst = ill->ill_ipst; 4095 clock_t start = ill->ill_defend_start; 4096 uint32_t elapsed, defend_period, defend_rate; 4097 nce_resched_t ncert; 4098 boolean_t ret; 4099 int i; 4100 4101 if (ill->ill_isv6) { 4102 defend_period = ipst->ips_ndp_defend_period; 4103 defend_rate = ipst->ips_ndp_defend_rate; 4104 } else { 4105 defend_period = ipst->ips_arp_defend_period; 4106 defend_rate = ipst->ips_arp_defend_rate; 4107 } 4108 if (defend_rate == 0) 4109 return (B_TRUE); 4110 bzero(&ncert, sizeof (ncert)); 4111 mutex_enter(&ill->ill_lock); 4112 if (start > 0) { 4113 elapsed = now - start; 4114 if (elapsed > SEC_TO_TICK(defend_period)) { 4115 ill->ill_defend_start = now; 4116 /* 4117 * nce_ill_reschedule will attempt to 4118 * prevent starvation by reschduling the 4119 * oldest entries, which are marked with 4120 * the NCE_F_DELAYED flag. 4121 */ 4122 nce_ill_reschedule(ill, &ncert); 4123 } 4124 } else { 4125 ill->ill_defend_start = now; 4126 } 4127 ASSERT(ill->ill_defend_count <= defend_rate); 4128 mutex_enter(&ncec->ncec_lock); 4129 if (ncec->ncec_flags & NCE_F_DELAYED) { 4130 /* 4131 * This ncec was rescheduled as one of the really old 4132 * entries needing on-going defense. The 4133 * ill_defend_count was already incremented in 4134 * nce_ill_reschedule. Go ahead and send the announce. 4135 */ 4136 ncec->ncec_flags &= ~NCE_F_DELAYED; 4137 mutex_exit(&ncec->ncec_lock); 4138 ret = B_FALSE; 4139 goto done; 4140 } 4141 mutex_exit(&ncec->ncec_lock); 4142 if (ill->ill_defend_count < defend_rate) 4143 ill->ill_defend_count++; 4144 if (ill->ill_defend_count == defend_rate) { 4145 /* 4146 * we are no longer allowed to send unbidden defense 4147 * messages. Wait for rescheduling. 
4148 */ 4149 ret = B_TRUE; 4150 } else { 4151 ret = B_FALSE; 4152 } 4153 done: 4154 mutex_exit(&ill->ill_lock); 4155 /* 4156 * After all the locks have been dropped we can restart nce timer, 4157 * and refrele the delayed ncecs 4158 */ 4159 for (i = 0; i < ncert.ncert_num; i++) { 4160 clock_t xmit_interval; 4161 ncec_t *tmp; 4162 4163 tmp = ncert.ncert_nces[i]; 4164 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval, 4165 B_FALSE); 4166 nce_restart_timer(tmp, xmit_interval); 4167 ncec_refrele(tmp); 4168 } 4169 return (ret); 4170 } 4171 4172 boolean_t 4173 ndp_announce(ncec_t *ncec) 4174 { 4175 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr, 4176 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast, 4177 nce_advert_flags(ncec))); 4178 } 4179 4180 ill_t * 4181 nce_resolve_src(ncec_t *ncec, in6_addr_t *src) 4182 { 4183 mblk_t *mp; 4184 in6_addr_t src6; 4185 ipaddr_t src4; 4186 ill_t *ill = ncec->ncec_ill; 4187 ill_t *src_ill = NULL; 4188 ipif_t *ipif = NULL; 4189 boolean_t is_myaddr = NCE_MYADDR(ncec); 4190 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 4191 4192 ASSERT(src != NULL); 4193 ASSERT(IN6_IS_ADDR_UNSPECIFIED(src)); 4194 src6 = *src; 4195 if (is_myaddr) { 4196 src6 = ncec->ncec_addr; 4197 if (!isv6) 4198 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4); 4199 } else { 4200 /* 4201 * try to find one from the outgoing packet. 4202 */ 4203 mutex_enter(&ncec->ncec_lock); 4204 mp = ncec->ncec_qd_mp; 4205 if (mp != NULL) { 4206 if (isv6) { 4207 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 4208 4209 src6 = ip6h->ip6_src; 4210 } else { 4211 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4212 4213 src4 = ipha->ipha_src; 4214 IN6_IPADDR_TO_V4MAPPED(src4, &src6); 4215 } 4216 } 4217 mutex_exit(&ncec->ncec_lock); 4218 } 4219 4220 /* 4221 * For outgoing packets, if the src of outgoing packet is one 4222 * of the assigned interface addresses use it, otherwise we 4223 * will pick the source address below. 
4224 * For local addresses (is_myaddr) doing DAD, NDP announce 4225 * messages are mcast. So we use the (IPMP) cast_ill or the 4226 * (non-IPMP) ncec_ill for these message types. The only case 4227 * of unicast DAD messages are for IPv6 ND probes, for which 4228 * we find the ipif_bound_ill corresponding to the ncec_addr. 4229 */ 4230 if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) { 4231 if (isv6) { 4232 ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES, 4233 ill->ill_ipst); 4234 } else { 4235 ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES, 4236 ill->ill_ipst); 4237 } 4238 4239 /* 4240 * If no relevant ipif can be found, then it's not one of our 4241 * addresses. Reset to :: and try to find a src for the NS or 4242 * ARP request using ipif_select_source_v[4,6] below. 4243 * If an ipif can be found, but it's not yet done with 4244 * DAD verification, and we are not being invoked for 4245 * DAD (i.e., !is_myaddr), then just postpone this 4246 * transmission until later. 4247 */ 4248 if (ipif == NULL) { 4249 src6 = ipv6_all_zeros; 4250 src4 = INADDR_ANY; 4251 } else if (!ipif->ipif_addr_ready && !is_myaddr) { 4252 DTRACE_PROBE2(nce__resolve__ipif__not__ready, 4253 ncec_t *, ncec, ipif_t *, ipif); 4254 ipif_refrele(ipif); 4255 return (NULL); 4256 } 4257 } 4258 4259 if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) { 4260 /* 4261 * Pick a source address for this solicitation, but 4262 * restrict the selection to addresses assigned to the 4263 * output interface. We do this because the destination will 4264 * create a neighbor cache entry for the source address of 4265 * this packet, so the source address had better be a valid 4266 * neighbor. 
4267 */ 4268 if (isv6) { 4269 ipif = ipif_select_source_v6(ill, &ncec->ncec_addr, 4270 B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4271 B_FALSE, NULL); 4272 } else { 4273 ipaddr_t nce_addr; 4274 4275 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr); 4276 ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES, 4277 B_FALSE, NULL); 4278 } 4279 if (ipif == NULL && IS_IPMP(ill)) { 4280 ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE); 4281 4282 if (send_ill != NULL) { 4283 if (isv6) { 4284 ipif = ipif_select_source_v6(send_ill, 4285 &ncec->ncec_addr, B_TRUE, 4286 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4287 B_FALSE, NULL); 4288 } else { 4289 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 4290 src4); 4291 ipif = ipif_select_source_v4(send_ill, 4292 src4, ALL_ZONES, B_TRUE, NULL); 4293 } 4294 ill_refrele(send_ill); 4295 } 4296 } 4297 4298 if (ipif == NULL) { 4299 char buf[INET6_ADDRSTRLEN]; 4300 4301 ip1dbg(("nce_resolve_src: No source ipif for dst %s\n", 4302 inet_ntop((isv6 ? AF_INET6 : AF_INET), 4303 (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 4304 DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec); 4305 return (NULL); 4306 } 4307 src6 = ipif->ipif_v6lcl_addr; 4308 } 4309 *src = src6; 4310 if (ipif != NULL) { 4311 src_ill = ipif->ipif_ill; 4312 if (IS_IPMP(src_ill)) 4313 src_ill = ipmp_ipif_hold_bound_ill(ipif); 4314 else 4315 ill_refhold(src_ill); 4316 ipif_refrele(ipif); 4317 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec, 4318 ill_t *, src_ill); 4319 } 4320 return (src_ill); 4321 } 4322 4323 void 4324 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst, 4325 uchar_t *hwaddr, int hwaddr_len, int flags) 4326 { 4327 ill_t *ill; 4328 ncec_t *ncec; 4329 nce_t *nce; 4330 uint16_t new_state; 4331 4332 ill = (ipif ? 
ipif->ipif_ill : NULL); 4333 if (ill != NULL) { 4334 /* 4335 * only one ncec is possible 4336 */ 4337 nce = nce_lookup_v4(ill, addr); 4338 if (nce != NULL) { 4339 ncec = nce->nce_common; 4340 mutex_enter(&ncec->ncec_lock); 4341 if (NCE_ISREACHABLE(ncec)) 4342 new_state = ND_UNCHANGED; 4343 else 4344 new_state = ND_STALE; 4345 ncec->ncec_flags = flags; 4346 nce_update(ncec, new_state, hwaddr); 4347 mutex_exit(&ncec->ncec_lock); 4348 nce_refrele(nce); 4349 return; 4350 } 4351 } else { 4352 /* 4353 * ill is wildcard; clean up all ncec's and ire's 4354 * that match on addr. 4355 */ 4356 nce_hw_map_t hwm; 4357 4358 hwm.hwm_addr = *addr; 4359 hwm.hwm_hwlen = hwaddr_len; 4360 hwm.hwm_hwaddr = hwaddr; 4361 hwm.hwm_flags = flags; 4362 4363 ncec_walk_common(ipst->ips_ndp4, NULL, 4364 (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE); 4365 } 4366 } 4367 4368 /* 4369 * Common function to add ncec entries. 4370 * we always add the ncec with ncec_ill == ill, and always create 4371 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the 4372 * ncec is !reachable. 4373 * 4374 * When the caller passes in an nce_state of ND_UNCHANGED, 4375 * nce_add_common() will determine the state of the created nce based 4376 * on the ill_net_type and nce_flags used. Otherwise, the nce will 4377 * be created with state set to the passed in nce_state. 
4378 */ 4379 static int 4380 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 4381 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) 4382 { 4383 static ncec_t nce_nil; 4384 uchar_t *template = NULL; 4385 int err; 4386 ncec_t *ncec; 4387 ncec_t **ncep; 4388 ip_stack_t *ipst = ill->ill_ipst; 4389 uint16_t state; 4390 boolean_t fastprobe = B_FALSE; 4391 struct ndp_g_s *ndp; 4392 nce_t *nce = NULL; 4393 mblk_t *dlur_mp = NULL; 4394 4395 if (ill->ill_isv6) 4396 ndp = ill->ill_ipst->ips_ndp6; 4397 else 4398 ndp = ill->ill_ipst->ips_ndp4; 4399 4400 *retnce = NULL; 4401 4402 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 4403 4404 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 4405 ip0dbg(("nce_add_common: no addr\n")); 4406 return (EINVAL); 4407 } 4408 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 4409 ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); 4410 return (EINVAL); 4411 } 4412 4413 if (ill->ill_isv6) { 4414 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 4415 } else { 4416 ipaddr_t v4addr; 4417 4418 IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 4419 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); 4420 } 4421 4422 /* 4423 * The caller has ensured that there is no nce on ill, but there could 4424 * still be an nce_common_t for the address, so that we find exisiting 4425 * ncec_t strucutures first, and atomically add a new nce_t if 4426 * one is found. The ndp_g_lock ensures that we don't cross threads 4427 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not 4428 * compare for matches across the illgrp because this function is 4429 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, 4430 * with the nce_lookup_then_add_v* passing in the ipmp_ill where 4431 * appropriate. 
4432 */ 4433 ncec = *ncep; 4434 for (; ncec != NULL; ncec = ncec->ncec_next) { 4435 if (ncec->ncec_ill == ill) { 4436 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 4437 /* 4438 * We should never find *retnce to be 4439 * MYADDR, since the caller may then 4440 * incorrectly restart a DAD timer that's 4441 * already running. However, if we are in 4442 * forwarding mode, and the interface is 4443 * moving in/out of groups, the data 4444 * path ire lookup (e.g., ire_revalidate_nce) 4445 * may have determined that some destination 4446 * is offlink while the control path is adding 4447 * that address as a local address. 4448 * Recover from this case by failing the 4449 * lookup 4450 */ 4451 if (NCE_MYADDR(ncec)) 4452 return (ENXIO); 4453 *retnce = nce_ill_lookup_then_add(ill, ncec); 4454 if (*retnce != NULL) 4455 break; 4456 } 4457 } 4458 } 4459 if (*retnce != NULL) /* caller must trigger fastpath on nce */ 4460 return (0); 4461 4462 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); 4463 if (ncec == NULL) 4464 return (ENOMEM); 4465 *ncec = nce_nil; 4466 ncec->ncec_ill = ill; 4467 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 4468 ncec->ncec_flags = flags; 4469 ncec->ncec_ipst = ipst; /* No netstack_hold */ 4470 4471 if (!ill->ill_isv6) { 4472 ipaddr_t addr4; 4473 4474 /* 4475 * DAD probe interval and probe count are set based on 4476 * fast/slow probe settings. If the underlying link doesn't 4477 * have reliably up/down notifications or if we're working 4478 * with IPv4 169.254.0.0/16 Link Local Address space, then 4479 * don't use the fast timers. Otherwise, use them. 
4480 */ 4481 ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 4482 IN6_V4MAPPED_TO_IPADDR(addr, addr4); 4483 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) { 4484 fastprobe = B_TRUE; 4485 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) && 4486 !IS_IPV4_LL_SPACE(&addr4)) { 4487 ill_t *hwaddr_ill; 4488 4489 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr, 4490 hw_addr_len); 4491 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link) 4492 fastprobe = B_TRUE; 4493 } 4494 if (fastprobe) { 4495 ncec->ncec_xmit_interval = 4496 ipst->ips_arp_fastprobe_interval; 4497 ncec->ncec_pcnt = 4498 ipst->ips_arp_fastprobe_count; 4499 ncec->ncec_flags |= NCE_F_FAST; 4500 } else { 4501 ncec->ncec_xmit_interval = 4502 ipst->ips_arp_probe_interval; 4503 ncec->ncec_pcnt = 4504 ipst->ips_arp_probe_count; 4505 } 4506 if (NCE_PUBLISH(ncec)) { 4507 ncec->ncec_unsolicit_count = 4508 ipst->ips_ip_arp_publish_count; 4509 } 4510 } else { 4511 /* 4512 * probe interval is constant: ILL_PROBE_INTERVAL 4513 * probe count is constant: ND_MAX_UNICAST_SOLICIT 4514 */ 4515 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 4516 if (NCE_PUBLISH(ncec)) { 4517 ncec->ncec_unsolicit_count = 4518 ipst->ips_ip_ndp_unsolicit_count; 4519 } 4520 } 4521 ncec->ncec_rcnt = ill->ill_xmit_count; 4522 ncec->ncec_addr = *addr; 4523 ncec->ncec_qd_mp = NULL; 4524 ncec->ncec_refcnt = 1; /* for ncec getting created */ 4525 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); 4526 ncec->ncec_trace_disable = B_FALSE; 4527 4528 /* 4529 * ncec_lladdr holds link layer address 4530 */ 4531 if (hw_addr_len > 0) { 4532 template = kmem_alloc(hw_addr_len, KM_NOSLEEP); 4533 if (template == NULL) { 4534 err = ENOMEM; 4535 goto err_ret; 4536 } 4537 ncec->ncec_lladdr = template; 4538 ncec->ncec_lladdr_length = hw_addr_len; 4539 bzero(ncec->ncec_lladdr, hw_addr_len); 4540 } 4541 if ((flags & NCE_F_BCAST) != 0) { 4542 state = ND_REACHABLE; 4543 ASSERT(hw_addr_len > 0); 4544 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 4545 state = ND_INITIAL; 
4546 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 4547 /* 4548 * NORESOLVER entries are always created in the REACHABLE 4549 * state. 4550 */ 4551 state = ND_REACHABLE; 4552 if (ill->ill_phys_addr_length == IP_ADDR_LEN && 4553 ill->ill_mactype != DL_IPV4 && 4554 ill->ill_mactype != DL_6TO4) { 4555 /* 4556 * We create a nce_res_mp with the IP nexthop address 4557 * as the destination address if the physical length 4558 * is exactly 4 bytes for point-to-multipoint links 4559 * that do their own resolution from IP to link-layer 4560 * address (e.g. IP over X.25). 4561 */ 4562 bcopy((uchar_t *)addr, 4563 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4564 } 4565 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && 4566 ill->ill_mactype != DL_IPV6) { 4567 /* 4568 * We create a nce_res_mp with the IP nexthop address 4569 * as the destination address if the physical legnth 4570 * is exactly 16 bytes for point-to-multipoint links 4571 * that do their own resolution from IP to link-layer 4572 * address. 4573 */ 4574 bcopy((uchar_t *)addr, 4575 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4576 } 4577 /* 4578 * Since NUD is not part of the base IPv4 protocol definition, 4579 * IPv4 neighbor entries on NORESOLVER interfaces will never 4580 * age, and are marked NCE_F_NONUD. 4581 */ 4582 if (!ill->ill_isv6) 4583 ncec->ncec_flags |= NCE_F_NONUD; 4584 } else if (ill->ill_net_type == IRE_LOOPBACK) { 4585 state = ND_REACHABLE; 4586 } 4587 4588 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { 4589 /* 4590 * We are adding an ncec with a deterministic hw_addr, 4591 * so the state can only be one of {REACHABLE, STALE, PROBE}. 4592 * 4593 * if we are adding a unicast ncec for the local address 4594 * it would be REACHABLE; we would be adding a ND_STALE entry 4595 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own 4596 * addresses are added in PROBE to trigger DAD. 
4597 */ 4598 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || 4599 ill->ill_net_type == IRE_IF_NORESOLVER) 4600 state = ND_REACHABLE; 4601 else if (!NCE_PUBLISH(ncec)) 4602 state = ND_STALE; 4603 else 4604 state = ND_PROBE; 4605 if (hw_addr != NULL) 4606 nce_set_ll(ncec, hw_addr); 4607 } 4608 /* caller overrides internally computed state */ 4609 if (nce_state != ND_UNCHANGED) 4610 state = nce_state; 4611 4612 if (state == ND_PROBE) 4613 ncec->ncec_flags |= NCE_F_UNVERIFIED; 4614 4615 ncec->ncec_state = state; 4616 4617 if (state == ND_REACHABLE) { 4618 ncec->ncec_last = ncec->ncec_init_time = 4619 TICK_TO_MSEC(ddi_get_lbolt64()); 4620 } else { 4621 ncec->ncec_last = 0; 4622 if (state == ND_INITIAL) 4623 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64()); 4624 } 4625 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), 4626 offsetof(ncec_cb_t, ncec_cb_node)); 4627 /* 4628 * have all the memory allocations out of the way before taking locks 4629 * and adding the nce. 4630 */ 4631 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4632 if (nce == NULL) { 4633 err = ENOMEM; 4634 goto err_ret; 4635 } 4636 if (ncec->ncec_lladdr != NULL || 4637 ill->ill_net_type == IRE_IF_NORESOLVER) { 4638 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4639 ill->ill_phys_addr_length, ill->ill_sap, 4640 ill->ill_sap_length); 4641 if (dlur_mp == NULL) { 4642 err = ENOMEM; 4643 goto err_ret; 4644 } 4645 } 4646 4647 /* 4648 * Atomically ensure that the ill is not CONDEMNED, before 4649 * adding the NCE. 
4650 */ 4651 mutex_enter(&ill->ill_lock); 4652 if (ill->ill_state_flags & ILL_CONDEMNED) { 4653 mutex_exit(&ill->ill_lock); 4654 err = EINVAL; 4655 goto err_ret; 4656 } 4657 if (!NCE_MYADDR(ncec) && 4658 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { 4659 mutex_exit(&ill->ill_lock); 4660 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); 4661 err = EINVAL; 4662 goto err_ret; 4663 } 4664 /* 4665 * Acquire the ncec_lock even before adding the ncec to the list 4666 * so that it cannot get deleted after the ncec is added, but 4667 * before we add the nce. 4668 */ 4669 mutex_enter(&ncec->ncec_lock); 4670 if ((ncec->ncec_next = *ncep) != NULL) 4671 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; 4672 *ncep = ncec; 4673 ncec->ncec_ptpn = ncep; 4674 4675 /* Bump up the number of ncec's referencing this ill */ 4676 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4677 (char *), "ncec", (void *), ncec); 4678 ill->ill_ncec_cnt++; 4679 /* 4680 * Since we hold the ncec_lock at this time, the ncec cannot be 4681 * condemned, and we can safely add the nce. 4682 */ 4683 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); 4684 mutex_exit(&ncec->ncec_lock); 4685 mutex_exit(&ill->ill_lock); 4686 4687 /* caller must trigger fastpath on *retnce */ 4688 return (0); 4689 4690 err_ret: 4691 if (ncec != NULL) 4692 kmem_cache_free(ncec_cache, ncec); 4693 if (nce != NULL) 4694 kmem_cache_free(nce_cache, nce); 4695 freemsg(dlur_mp); 4696 if (template != NULL) 4697 kmem_free(template, ill->ill_phys_addr_length); 4698 return (err); 4699 } 4700 4701 /* 4702 * take a ref on the nce 4703 */ 4704 void 4705 nce_refhold(nce_t *nce) 4706 { 4707 mutex_enter(&nce->nce_lock); 4708 nce->nce_refcnt++; 4709 ASSERT((nce)->nce_refcnt != 0); 4710 mutex_exit(&nce->nce_lock); 4711 } 4712 4713 /* 4714 * release a ref on the nce; In general, this 4715 * cannot be called with locks held because nce_inactive 4716 * may result in nce_inactive which will take the ill_lock, 4717 * do ipif_ill_refrele_tail etc. 
Thus the one exception 4718 * where this can be called with locks held is when the caller 4719 * is certain that the nce_refcnt is sufficient to prevent 4720 * the invocation of nce_inactive. 4721 */ 4722 void 4723 nce_refrele(nce_t *nce) 4724 { 4725 ASSERT((nce)->nce_refcnt != 0); 4726 mutex_enter(&nce->nce_lock); 4727 if (--nce->nce_refcnt == 0) 4728 nce_inactive(nce); /* destroys the mutex */ 4729 else 4730 mutex_exit(&nce->nce_lock); 4731 } 4732 4733 /* 4734 * free the nce after all refs have gone away. 4735 */ 4736 static void 4737 nce_inactive(nce_t *nce) 4738 { 4739 ill_t *ill = nce->nce_ill; 4740 4741 ASSERT(nce->nce_refcnt == 0); 4742 4743 ncec_refrele_notr(nce->nce_common); 4744 nce->nce_common = NULL; 4745 freemsg(nce->nce_fp_mp); 4746 freemsg(nce->nce_dlur_mp); 4747 4748 mutex_enter(&ill->ill_lock); 4749 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 4750 (char *), "nce", (void *), nce); 4751 ill->ill_nce_cnt--; 4752 nce->nce_ill = NULL; 4753 /* 4754 * If the number of ncec's associated with this ill have dropped 4755 * to zero, check whether we need to restart any operation that 4756 * is waiting for this to happen. 4757 */ 4758 if (ILL_DOWN_OK(ill)) { 4759 /* ipif_ill_refrele_tail drops the ill_lock */ 4760 ipif_ill_refrele_tail(ill); 4761 } else { 4762 mutex_exit(&ill->ill_lock); 4763 } 4764 4765 mutex_destroy(&nce->nce_lock); 4766 kmem_cache_free(nce_cache, nce); 4767 } 4768 4769 /* 4770 * Add an nce to the ill_nce list. 
*
 * `nce' is caller-allocated storage that is zeroed and initialized here;
 * `dlur_mp' (possibly NULL) is stored in nce_dlur_mp.  The nce is returned
 * with two references: one for the calling thread and one for the ill_nce
 * list.  A reference is also added to `ncec'.  ill_nce_cnt, ncec_refcnt and
 * the ill_nce list are modified without taking any lock here, so callers
 * (see nce_add()) hold ill_lock and ncec_lock.
 */
static nce_t *
nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
{
	bzero(nce, sizeof (*nce));
	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
	nce->nce_common = ncec;
	nce->nce_addr = ncec->ncec_addr;
	nce->nce_ill = ill;
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt++;

	nce->nce_refcnt = 1; /* for the thread */
	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
	nce->nce_dlur_mp = dlur_mp;

	/* add nce to the ill's fastpath list.  */
	nce->nce_refcnt++; /* for the list */
	list_insert_head(&ill->ill_nce, nce);
	return (nce);
}

/*
 * Allocate an nce_t for `ncec' on `ill' and link it in via nce_add_impl().
 * The caller must hold both ill_lock and ncec_lock (asserted).  When the
 * ncec has a link-layer address, or the ill is IRE_IF_NORESOLVER, an mblk
 * is generated with ill_dlur_gen() and attached to the nce.
 *
 * Returns the new nce, or NULL if either KM_NOSLEEP allocation fails.
 */
static nce_t *
nce_add(ill_t *ill, ncec_t *ncec)
{
	nce_t	*nce;
	mblk_t	*dlur_mp = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL)
		return (NULL);
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			/* undo the nce allocation before failing */
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
	}
	return (nce_add_impl(ill, ncec, nce, dlur_mp));
}

/*
 * remove the nce from the ill_fastpath list
 *
 * The caller must hold the ill_lock of nce->nce_ill (asserted).  The nce is
 * marked condemned, unlinked from ill_nce, and the list's reference is
 * released.  If the nce is already condemned this is a no-op.
 */
void
nce_delete(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	mutex_enter(&nce->nce_lock);
	if (nce->nce_is_condemned) {
		/*
		 * some other thread has removed this nce from the ill_nce list
		 */
		mutex_exit(&nce->nce_lock);
		return;
	}
	nce->nce_is_condemned = B_TRUE;
	mutex_exit(&nce->nce_lock);

	list_remove(&ill->ill_nce, nce);
	/*
	 * even though we are holding the ill_lock, it is ok to
	 * call nce_refrele here because we know that we should have
	 * at least 2 refs on the nce: one for the thread, and one
	 * for the list. The refrele below will release the one for
	 * the list.
	 */
	nce_refrele(nce);
}

/*
 * Find the nce for `addr' on ill's ill_nce list.  The caller must hold the
 * ill_lock (asserted).  Returns the matching nce with a reference taken by
 * nce_refhold(), or NULL when there is no match.
 */
nce_t *
nce_lookup(ill_t *ill, const in6_addr_t *addr)
{
	nce_t *nce = NULL;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
			break;
	}

	/*
	 * if we found the nce on the ill_nce list while holding
	 * the ill_lock, then it cannot be condemned yet.
	 */
	if (nce != NULL) {
		ASSERT(!nce->nce_is_condemned);
		nce_refhold(nce);
	}
	return (nce);
}

/*
 * Walk the ill_nce list on ill. The callback function func() cannot perform
 * any destructive actions.
 *
 * The caller must hold the ill_lock (asserted).  func is invoked as
 * func(ill, nce, arg) for each entry; a non-zero return value stops the
 * walk.
 */
static void
nce_walk_common(ill_t *ill, pfi_t func, void *arg)
{
	nce_t	*nce = NULL, *nce_next;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		/* fetch the successor before handing nce to the callback */
		nce_next = list_next(&ill->ill_nce, nce);
		if (func(ill, nce, arg) != 0)
			break;
		nce = nce_next;
	}
}

/*
 * Locked wrapper around nce_walk_common(): takes the ill_lock for the
 * duration of the walk.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}

/*
 * Delete the nce entries on ill's ill_nce list.  When `flushall' is
 * B_FALSE, entries whose ncec satisfies NCE_PUBLISH() are left in place.
 *
 * Each victim is held, deleted under the ill_lock and parked on a local
 * list; the extra holds are released only after the ill_lock has been
 * dropped.
 */
void
nce_flush(ill_t *ill, boolean_t flushall)
{
	nce_t	*nce, *nce_next;
	list_t	dead;

	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		nce_next = list_next(&ill->ill_nce, nce);
		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
			nce = nce_next;
			continue;
		}
		/*
		 * nce_delete requires that the caller should either not
		 * be holding locks, or should hold a ref to ensure that
		 * we wont hit ncec_inactive. So take a ref and clean up
		 * after the list is flushed.
		 */
		nce_refhold(nce);
		nce_delete(nce);
		/* nce_delete() unlinked nce_node, so it can be reused here */
		list_insert_tail(&dead, nce);
		nce = nce_next;
	}
	mutex_exit(&ill->ill_lock);
	while ((nce = list_head(&dead)) != NULL) {
		list_remove(&dead, nce);
		nce_refrele(nce);
	}
	ASSERT(list_is_empty(&dead));
	list_destroy(&dead);
}

/*
 * Return a randomized interval derived from `intv'; the result is always
 * at least 1.
 *
 * When `initial_time' is set the result is anywhere in the [1 .. intv]
 * range (1 if intv <= 0).  Otherwise the result is intv fuzzed by up to
 * 20% in either direction, i.e. in [intv - frac .. intv + frac] where
 * frac is intv / 5, or 2 when that quotient is 1 or less.
 */
static clock_t
nce_fuzz_interval(clock_t intv, boolean_t initial_time)
{
	clock_t	rnd, frac;

	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
	/* Note that clock_t is signed; must chop off bits */
	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
	if (initial_time) {
		if (intv <= 0)
			intv = 1;
		else
			intv = (rnd % intv) + 1;
	} else {
		/* Compute 'frac' as 20% of the configured interval */
		if ((frac = intv / 5) <= 1)
			frac = 2;
		/* Set intv randomly in the range [intv-frac .. intv+frac] */
		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
			intv = 1;
	}
	return (intv);
}

/*
 * Transmit the packets queued on an IPMP ncec (ncec_qd_mp).  The ncec's ill
 * must be an IPMP ill (asserted).
 *
 * The queue and the ncec_nprobes count are detached from the ncec under
 * ncec_lock.  For every packet an ill is chosen, an nce on that ill is
 * obtained with nce_fastpath_create(), and the packet is sent with
 * ip_xmit().  Packets for which no such nce can be obtained are dropped
 * and counted as ipIfStatsOutDiscards.  Once the queue is drained,
 * ncec_cb_dispatch() is called.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t		*mp;
	uint_t		pkt_len;
	iaflags_t	ixaflags = IXAF_NO_TRACE;
	nce_t		*under_nce;
	ill_t		*ill = ncec->ncec_ill;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t		*src_ipif = NULL;
	ip_stack_t	*ipst = ill->ill_ipst;
	ill_t		*send_ill;
	uint_t		nprobes;

	ASSERT(IS_IPMP(ill));

	/* take ownership of the queued chain and the probe count */
	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t *nxt_mp;

		/* unlink this packet from the b_next chain */
		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t *ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t *ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We identify the former case by looking for the
			 * ncec_nprobes: the first ncec_nprobes packets are
			 * probes;
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		/* the ill and ipif holds are no longer needed */
		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec); /* complete callbacks */
}