1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #include <sys/stropts.h> 29 #include <sys/strsun.h> 30 #include <sys/sysmacros.h> 31 #include <sys/errno.h> 32 #include <sys/dlpi.h> 33 #include <sys/socket.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/debug.h> 38 #include <sys/vtrace.h> 39 #include <sys/kmem.h> 40 #include <sys/zone.h> 41 #include <sys/ethernet.h> 42 #include <sys/sdt.h> 43 #include <sys/mac.h> 44 45 #include <net/if.h> 46 #include <net/if_types.h> 47 #include <net/if_dl.h> 48 #include <net/route.h> 49 #include <netinet/in.h> 50 #include <netinet/ip6.h> 51 #include <netinet/icmp6.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/nd.h> 57 #include <inet/ip.h> 58 #include <inet/ip_impl.h> 59 #include <inet/ipclassifier.h> 60 #include <inet/ip_if.h> 61 #include <inet/ip_ire.h> 62 #include <inet/ip_rts.h> 63 #include <inet/ip6.h> 64 #include <inet/ip_ndp.h> 65 #include <inet/sctp_ip.h> 66 
#include <inet/ip_arp.h>
#include <inet/ip2mac_impl.h>

/*
 * Select the announcement interval tunable for the address family:
 * ips_ip_ndp_unsolicit_interval when isv6 is true, otherwise
 * ips_ip_arp_publish_interval.  The expansion references a variable named
 * `ipst', which must be in scope wherever the macro is used.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/*
 * Select the defense interval tunable for the address family:
 * ips_ndp_defend_interval when isv6 is true, otherwise
 * ips_arp_defend_interval.  Like ANNOUNCE_INTERVAL, this relies on a
 * variable named `ipst' being in scope.
 */
#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * ptr is treated as a pointer to the first byte of the address; the test
 * matches 169.254.x.x.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
	((uchar_t *)(ptr))[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK \
	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)

/*
 * Lock ordering:
 *
 *	ndp_g_lock -> ill_lock -> ncec_lock
 *
 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 * ncec_refcnt).
 */

static	void	nce_cleanup_list(ncec_t *ncec);
static	void	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
    ncec_t *);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
    uint16_t ncec_flags, nce_t **newnce);
static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t ncec_flags, nce_t **newnce);
static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
    uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static void	ncec_refhold_locked(ncec_t *);
static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
static nce_t *nce_add(ill_t *, ncec_t *);
static void nce_inactive(nce_t *);
extern nce_t	*nce_lookup(ill_t *, const in6_addr_t *);
static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int  nce_add_v6_postprocess(nce_t *);
static int  nce_add_v4_postprocess(nce_t *);
static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
static clock_t nce_fuzz_interval(clock_t, boolean_t);
static void nce_resolv_ipmp_ok(ncec_t *);
static void nce_walk_common(ill_t *, pfi_t, void *);
static void nce_start_timer(ncec_t *, uint_t);
static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
static void nce_fastpath_trigger(nce_t *);
static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);

#ifdef DEBUG
148 static void ncec_trace_cleanup(const ncec_t *); 149 #endif 150 151 #define NCE_HASH_PTR_V4(ipst, addr) \ 152 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])) 153 154 #define NCE_HASH_PTR_V6(ipst, addr) \ 155 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ 156 NCE_TABLE_SIZE)])) 157 158 extern kmem_cache_t *ncec_cache; 159 extern kmem_cache_t *nce_cache; 160 161 /* 162 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe 163 * If src_ill is not null, the ncec_addr is bound to src_ill. The 164 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where 165 * the probe is sent on the ncec_ill (in the non-IPMP case) or the 166 * IPMP cast_ill (in the IPMP case). 167 * 168 * Note that the probe interval is based on the src_ill for IPv6, and 169 * the ncec_xmit_interval for IPv4. 170 */ 171 static void 172 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) 173 { 174 boolean_t dropped; 175 uint32_t probe_interval; 176 177 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST)); 178 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST)); 179 if (ncec->ncec_ipversion == IPV6_VERSION) { 180 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 181 ncec->ncec_lladdr, ncec->ncec_lladdr_length, 182 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); 183 probe_interval = ILL_PROBE_INTERVAL(src_ill); 184 } else { 185 /* IPv4 DAD delay the initial probe. */ 186 if (send_probe) 187 dropped = arp_probe(ncec); 188 else 189 dropped = B_TRUE; 190 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval, 191 !send_probe); 192 } 193 if (!dropped) { 194 mutex_enter(&ncec->ncec_lock); 195 ncec->ncec_pcnt--; 196 mutex_exit(&ncec->ncec_lock); 197 } 198 nce_restart_timer(ncec, probe_interval); 199 } 200 201 /* 202 * Compute default flags to use for an advertisement of this ncec's address. 
203 */ 204 static int 205 nce_advert_flags(const ncec_t *ncec) 206 { 207 int flag = 0; 208 209 if (ncec->ncec_flags & NCE_F_ISROUTER) 210 flag |= NDP_ISROUTER; 211 if (!(ncec->ncec_flags & NCE_F_ANYCAST)) 212 flag |= NDP_ORIDE; 213 214 return (flag); 215 } 216 217 /* 218 * NDP Cache Entry creation routine. 219 * This routine must always be called with ndp6->ndp_g_lock held. 220 */ 221 int 222 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 223 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 224 { 225 int err; 226 nce_t *nce; 227 228 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 229 ASSERT(ill != NULL && ill->ill_isv6); 230 231 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state, 232 &nce); 233 if (err != 0) 234 return (err); 235 ASSERT(newnce != NULL); 236 *newnce = nce; 237 return (err); 238 } 239 240 /* 241 * Post-processing routine to be executed after nce_add_v6(). This function 242 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 243 * and must be called without any locks held. 244 */ 245 int 246 nce_add_v6_postprocess(nce_t *nce) 247 { 248 ncec_t *ncec = nce->nce_common; 249 boolean_t dropped = B_FALSE; 250 uchar_t *hw_addr = ncec->ncec_lladdr; 251 uint_t hw_addr_len = ncec->ncec_lladdr_length; 252 ill_t *ill = ncec->ncec_ill; 253 int err = 0; 254 uint16_t flags = ncec->ncec_flags; 255 ip_stack_t *ipst = ill->ill_ipst; 256 boolean_t trigger_fastpath = B_TRUE; 257 258 /* 259 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 260 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
261 * We call nce_fastpath from nce_update if the link layer address of 262 * the peer changes from nce_update 263 */ 264 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || 265 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER)) 266 trigger_fastpath = B_FALSE; 267 268 if (trigger_fastpath) 269 nce_fastpath_trigger(nce); 270 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 271 ill_t *hwaddr_ill; 272 /* 273 * Unicast entry that needs DAD. 274 */ 275 if (IS_IPMP(ill)) { 276 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 277 hw_addr, hw_addr_len); 278 } else { 279 hwaddr_ill = ill; 280 } 281 nce_dad(ncec, hwaddr_ill, B_TRUE); 282 err = EINPROGRESS; 283 } else if (flags & NCE_F_UNSOL_ADV) { 284 /* 285 * We account for the transmit below by assigning one 286 * less than the ndd variable. Subsequent decrements 287 * are done in nce_timer. 288 */ 289 mutex_enter(&ncec->ncec_lock); 290 ncec->ncec_unsolicit_count = 291 ipst->ips_ip_ndp_unsolicit_count - 1; 292 mutex_exit(&ncec->ncec_lock); 293 dropped = ndp_xmit(ill, 294 ND_NEIGHBOR_ADVERT, 295 hw_addr, 296 hw_addr_len, 297 &ncec->ncec_addr, /* Source and target of the adv */ 298 &ipv6_all_hosts_mcast, /* Destination of the packet */ 299 nce_advert_flags(ncec)); 300 mutex_enter(&ncec->ncec_lock); 301 if (dropped) 302 ncec->ncec_unsolicit_count++; 303 else 304 ncec->ncec_last_time_defended = ddi_get_lbolt(); 305 if (ncec->ncec_unsolicit_count != 0) { 306 nce_start_timer(ncec, 307 ipst->ips_ip_ndp_unsolicit_interval); 308 } 309 mutex_exit(&ncec->ncec_lock); 310 } 311 return (err); 312 } 313 314 /* 315 * Atomically lookup and add (if needed) Neighbor Cache information for 316 * an address. 317 * 318 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 319 * are always added pointing at the ipmp_ill. Thus, when the ill passed 320 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 321 * entries will be created, both pointing at the same ncec_t. 
The nce_t 322 * entries will have their nce_ill set to the ipmp_ill and the under_ill 323 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 324 * Local addresses are always created on the ill passed to nce_add_v6. 325 */ 326 int 327 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 328 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 329 { 330 int err = 0; 331 ip_stack_t *ipst = ill->ill_ipst; 332 nce_t *nce, *upper_nce = NULL; 333 ill_t *in_ill = ill; 334 boolean_t need_ill_refrele = B_FALSE; 335 336 if (flags & NCE_F_MCAST) { 337 /* 338 * hw_addr will be figured out in nce_set_multicast_v6; 339 * caller has to select the cast_ill 340 */ 341 ASSERT(hw_addr == NULL); 342 ASSERT(!IS_IPMP(ill)); 343 err = nce_set_multicast_v6(ill, addr, flags, newnce); 344 return (err); 345 } 346 ASSERT(ill->ill_isv6); 347 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 348 ill = ipmp_ill_hold_ipmp_ill(ill); 349 if (ill == NULL) 350 return (ENXIO); 351 need_ill_refrele = B_TRUE; 352 } 353 354 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 355 nce = nce_lookup_addr(ill, addr); 356 if (nce == NULL) { 357 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, 358 &nce); 359 } else { 360 err = EEXIST; 361 } 362 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 363 if (err == 0) 364 err = nce_add_v6_postprocess(nce); 365 if (in_ill != ill && nce != NULL) { 366 nce_t *under_nce = NULL; 367 368 /* 369 * in_ill was the under_ill. Try to create the under_nce. 370 * Hold the ill_g_lock to prevent changes to group membership 371 * until we are done. 
372 */ 373 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 374 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 375 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 376 ill_t *, ill); 377 rw_exit(&ipst->ips_ill_g_lock); 378 err = ENXIO; 379 nce_refrele(nce); 380 nce = NULL; 381 goto bail; 382 } 383 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 384 if (under_nce == NULL) { 385 rw_exit(&ipst->ips_ill_g_lock); 386 err = EINVAL; 387 nce_refrele(nce); 388 nce = NULL; 389 goto bail; 390 } 391 rw_exit(&ipst->ips_ill_g_lock); 392 upper_nce = nce; 393 nce = under_nce; /* will be returned to caller */ 394 if (NCE_ISREACHABLE(nce->nce_common)) 395 nce_fastpath_trigger(under_nce); 396 } 397 /* nce_refrele is deferred until the lock is dropped */ 398 if (nce != NULL) { 399 if (newnce != NULL) 400 *newnce = nce; 401 else 402 nce_refrele(nce); 403 } 404 bail: 405 if (upper_nce != NULL) 406 nce_refrele(upper_nce); 407 if (need_ill_refrele) 408 ill_refrele(ill); 409 return (err); 410 } 411 412 /* 413 * Remove all the CONDEMNED nces from the appropriate hash table. 414 * We create a private list of NCEs, these may have ires pointing 415 * to them, so the list will be passed through to clean up dependent 416 * ires and only then we can do ncec_refrele() which can make NCE inactive. 417 */ 418 static void 419 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) 420 { 421 ncec_t *ncec1; 422 ncec_t **ptpn; 423 424 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 425 ASSERT(ndp->ndp_g_walker == 0); 426 for (; ncec; ncec = ncec1) { 427 ncec1 = ncec->ncec_next; 428 mutex_enter(&ncec->ncec_lock); 429 if (NCE_ISCONDEMNED(ncec)) { 430 ptpn = ncec->ncec_ptpn; 431 ncec1 = ncec->ncec_next; 432 if (ncec1 != NULL) 433 ncec1->ncec_ptpn = ptpn; 434 *ptpn = ncec1; 435 ncec->ncec_ptpn = NULL; 436 ncec->ncec_next = NULL; 437 ncec->ncec_next = *free_nce_list; 438 *free_nce_list = ncec; 439 } 440 mutex_exit(&ncec->ncec_lock); 441 } 442 } 443 444 /* 445 * 1. Mark the entry CONDEMNED. 
This ensures that no new nce_lookup() 446 * will return this NCE. Also no new timeouts will 447 * be started (See nce_restart_timer). 448 * 2. Cancel any currently running timeouts. 449 * 3. If there is an ndp walker, return. The walker will do the cleanup. 450 * This ensures that walkers see a consistent list of NCEs while walking. 451 * 4. Otherwise remove the NCE from the list of NCEs 452 */ 453 void 454 ncec_delete(ncec_t *ncec) 455 { 456 ncec_t **ptpn; 457 ncec_t *ncec1; 458 int ipversion = ncec->ncec_ipversion; 459 ndp_g_t *ndp; 460 ip_stack_t *ipst = ncec->ncec_ipst; 461 462 if (ipversion == IPV4_VERSION) 463 ndp = ipst->ips_ndp4; 464 else 465 ndp = ipst->ips_ndp6; 466 467 /* Serialize deletes */ 468 mutex_enter(&ncec->ncec_lock); 469 if (NCE_ISCONDEMNED(ncec)) { 470 /* Some other thread is doing the delete */ 471 mutex_exit(&ncec->ncec_lock); 472 return; 473 } 474 /* 475 * Caller has a refhold. Also 1 ref for being in the list. Thus 476 * refcnt has to be >= 2 477 */ 478 ASSERT(ncec->ncec_refcnt >= 2); 479 ncec->ncec_flags |= NCE_F_CONDEMNED; 480 mutex_exit(&ncec->ncec_lock); 481 482 /* Count how many condemned ires for kmem_cache callback */ 483 atomic_add_32(&ipst->ips_num_nce_condemned, 1); 484 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 485 486 /* Complete any waiting callbacks */ 487 ncec_cb_dispatch(ncec); 488 489 /* 490 * Cancel any running timer. Timeout can't be restarted 491 * since CONDEMNED is set. Can't hold ncec_lock across untimeout. 492 * Passing invalid timeout id is fine. 493 */ 494 if (ncec->ncec_timeout_id != 0) { 495 (void) untimeout(ncec->ncec_timeout_id); 496 ncec->ncec_timeout_id = 0; 497 } 498 499 mutex_enter(&ndp->ndp_g_lock); 500 if (ncec->ncec_ptpn == NULL) { 501 /* 502 * The last ndp walker has already removed this ncec from 503 * the list after we marked the ncec CONDEMNED and before 504 * we grabbed the global lock. 
505 */ 506 mutex_exit(&ndp->ndp_g_lock); 507 return; 508 } 509 if (ndp->ndp_g_walker > 0) { 510 /* 511 * Can't unlink. The walker will clean up 512 */ 513 ndp->ndp_g_walker_cleanup = B_TRUE; 514 mutex_exit(&ndp->ndp_g_lock); 515 return; 516 } 517 518 /* 519 * Now remove the ncec from the list. nce_restart_timer won't restart 520 * the timer since it is marked CONDEMNED. 521 */ 522 ptpn = ncec->ncec_ptpn; 523 ncec1 = ncec->ncec_next; 524 if (ncec1 != NULL) 525 ncec1->ncec_ptpn = ptpn; 526 *ptpn = ncec1; 527 ncec->ncec_ptpn = NULL; 528 ncec->ncec_next = NULL; 529 mutex_exit(&ndp->ndp_g_lock); 530 531 /* Removed from ncec_ptpn/ncec_next list */ 532 ncec_refrele_notr(ncec); 533 } 534 535 void 536 ncec_inactive(ncec_t *ncec) 537 { 538 mblk_t **mpp; 539 ill_t *ill = ncec->ncec_ill; 540 ip_stack_t *ipst = ncec->ncec_ipst; 541 542 ASSERT(ncec->ncec_refcnt == 0); 543 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 544 545 /* Count how many condemned nces for kmem_cache callback */ 546 if (NCE_ISCONDEMNED(ncec)) 547 atomic_add_32(&ipst->ips_num_nce_condemned, -1); 548 549 /* Free all allocated messages */ 550 mpp = &ncec->ncec_qd_mp; 551 while (*mpp != NULL) { 552 mblk_t *mp; 553 554 mp = *mpp; 555 *mpp = mp->b_next; 556 557 inet_freemsg(mp); 558 } 559 /* 560 * must have been cleaned up in ncec_delete 561 */ 562 ASSERT(list_is_empty(&ncec->ncec_cb)); 563 list_destroy(&ncec->ncec_cb); 564 /* 565 * free the ncec_lladdr if one was allocated in nce_add_common() 566 */ 567 if (ncec->ncec_lladdr_length > 0) 568 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length); 569 570 #ifdef DEBUG 571 ncec_trace_cleanup(ncec); 572 #endif 573 574 mutex_enter(&ill->ill_lock); 575 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 576 (char *), "ncec", (void *), ncec); 577 ill->ill_ncec_cnt--; 578 ncec->ncec_ill = NULL; 579 /* 580 * If the number of ncec's associated with this ill have dropped 581 * to zero, check whether we need to restart any operation that 582 * is waiting for this to happen. 
583 */ 584 if (ILL_DOWN_OK(ill)) { 585 /* ipif_ill_refrele_tail drops the ill_lock */ 586 ipif_ill_refrele_tail(ill); 587 } else { 588 mutex_exit(&ill->ill_lock); 589 } 590 591 mutex_destroy(&ncec->ncec_lock); 592 kmem_cache_free(ncec_cache, ncec); 593 } 594 595 /* 596 * ncec_walk routine. Delete the ncec if it is associated with the ill 597 * that is going away. Always called as a writer. 598 */ 599 void 600 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg) 601 { 602 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) { 603 ncec_delete(ncec); 604 } 605 } 606 607 /* 608 * Neighbor Cache cleanup logic for a list of ncec_t entries. 609 */ 610 static void 611 nce_cleanup_list(ncec_t *ncec) 612 { 613 ncec_t *ncec_next; 614 615 ASSERT(ncec != NULL); 616 while (ncec != NULL) { 617 ncec_next = ncec->ncec_next; 618 ncec->ncec_next = NULL; 619 620 /* 621 * It is possible for the last ndp walker (this thread) 622 * to come here after ncec_delete has marked the ncec CONDEMNED 623 * and before it has removed the ncec from the fastpath list 624 * or called untimeout. So we need to do it here. It is safe 625 * for both ncec_delete and this thread to do it twice or 626 * even simultaneously since each of the threads has a 627 * reference on the ncec. 628 */ 629 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 630 /* 631 * Cancel any running timer. Timeout can't be restarted 632 * since CONDEMNED is set. The ncec_lock can't be 633 * held across untimeout though passing invalid timeout 634 * id is fine. 635 */ 636 if (ncec->ncec_timeout_id != 0) { 637 (void) untimeout(ncec->ncec_timeout_id); 638 ncec->ncec_timeout_id = 0; 639 } 640 /* Removed from ncec_ptpn/ncec_next list */ 641 ncec_refrele_notr(ncec); 642 ncec = ncec_next; 643 } 644 } 645 646 /* 647 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 
648 */ 649 boolean_t 650 nce_restart_dad(ncec_t *ncec) 651 { 652 boolean_t started; 653 ill_t *ill, *hwaddr_ill; 654 655 if (ncec == NULL) 656 return (B_FALSE); 657 ill = ncec->ncec_ill; 658 mutex_enter(&ncec->ncec_lock); 659 if (ncec->ncec_state == ND_PROBE) { 660 mutex_exit(&ncec->ncec_lock); 661 started = B_TRUE; 662 } else if (ncec->ncec_state == ND_REACHABLE) { 663 ASSERT(ncec->ncec_lladdr != NULL); 664 ncec->ncec_state = ND_PROBE; 665 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 666 /* 667 * Slight cheat here: we don't use the initial probe delay 668 * for IPv4 in this obscure case. 669 */ 670 mutex_exit(&ncec->ncec_lock); 671 if (IS_IPMP(ill)) { 672 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 673 ncec->ncec_lladdr, ncec->ncec_lladdr_length); 674 } else { 675 hwaddr_ill = ill; 676 } 677 nce_dad(ncec, hwaddr_ill, B_TRUE); 678 started = B_TRUE; 679 } else { 680 mutex_exit(&ncec->ncec_lock); 681 started = B_FALSE; 682 } 683 return (started); 684 } 685 686 /* 687 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. 688 * If one is found, the refcnt on the ncec will be incremented. 689 */ 690 ncec_t * 691 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) 692 { 693 ncec_t *ncec; 694 ip_stack_t *ipst = ill->ill_ipst; 695 696 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 697 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 698 699 /* Get head of v6 hash table */ 700 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 701 ncec = ncec_lookup_illgrp(ill, addr, ncec); 702 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 703 rw_exit(&ipst->ips_ill_g_lock); 704 return (ncec); 705 } 706 /* 707 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. 708 * If one is found, the refcnt on the ncec will be incremented. 
709 */ 710 ncec_t * 711 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) 712 { 713 ncec_t *ncec = NULL; 714 in6_addr_t addr6; 715 ip_stack_t *ipst = ill->ill_ipst; 716 717 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 718 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 719 720 /* Get head of v4 hash table */ 721 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); 722 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 723 ncec = ncec_lookup_illgrp(ill, &addr6, ncec); 724 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 725 rw_exit(&ipst->ips_ill_g_lock); 726 return (ncec); 727 } 728 729 /* 730 * Cache entry lookup. Try to find an ncec matching the parameters passed. 731 * If an ncec is found, increment the hold count on that ncec. 732 * The caller passes in the start of the appropriate hash table, and must 733 * be holding the appropriate global lock (ndp_g_lock). In addition, since 734 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock 735 * must be held as reader. 736 * 737 * This function always matches across the ipmp group. 738 */ 739 ncec_t * 740 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) 741 { 742 ndp_g_t *ndp; 743 ip_stack_t *ipst = ill->ill_ipst; 744 745 if (ill->ill_isv6) 746 ndp = ipst->ips_ndp6; 747 else 748 ndp = ipst->ips_ndp4; 749 750 ASSERT(ill != NULL); 751 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 752 if (IN6_IS_ADDR_UNSPECIFIED(addr)) 753 return (NULL); 754 for (; ncec != NULL; ncec = ncec->ncec_next) { 755 if (ncec->ncec_ill == ill || 756 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { 757 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 758 mutex_enter(&ncec->ncec_lock); 759 if (!NCE_ISCONDEMNED(ncec)) { 760 ncec_refhold_locked(ncec); 761 mutex_exit(&ncec->ncec_lock); 762 break; 763 } 764 mutex_exit(&ncec->ncec_lock); 765 } 766 } 767 } 768 return (ncec); 769 } 770 771 /* 772 * Find an nce_t on ill with nce_addr == addr. 
Lookup the nce_t 773 * entries for ill only, i.e., when ill is part of an ipmp group, 774 * nce_lookup_v4 will never try to match across the group. 775 */ 776 nce_t * 777 nce_lookup_v4(ill_t *ill, const in_addr_t *addr) 778 { 779 nce_t *nce; 780 in6_addr_t addr6; 781 ip_stack_t *ipst = ill->ill_ipst; 782 783 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 784 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 785 nce = nce_lookup_addr(ill, &addr6); 786 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 787 return (nce); 788 } 789 790 /* 791 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t 792 * entries for ill only, i.e., when ill is part of an ipmp group, 793 * nce_lookup_v6 will never try to match across the group. 794 */ 795 nce_t * 796 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6) 797 { 798 nce_t *nce; 799 ip_stack_t *ipst = ill->ill_ipst; 800 801 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 802 nce = nce_lookup_addr(ill, addr6); 803 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 804 return (nce); 805 } 806 807 static nce_t * 808 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr) 809 { 810 nce_t *nce; 811 812 ASSERT(ill != NULL); 813 #ifdef DEBUG 814 if (ill->ill_isv6) 815 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 816 else 817 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 818 #endif 819 mutex_enter(&ill->ill_lock); 820 nce = nce_lookup(ill, addr); 821 mutex_exit(&ill->ill_lock); 822 return (nce); 823 } 824 825 826 /* 827 * Router turned to host. We need to make sure that cached copies of the ncec 828 * are not used for forwarding packets if they were derived from the default 829 * route, and that the default route itself is removed, as required by 830 * section 7.2.5 of RFC 2461. 831 * 832 * Note that the ncec itself probably has valid link-layer information for the 833 * nexthop, so that there is no reason to delete the ncec, as long as the 834 * ISROUTER flag is turned off. 
835 */ 836 static void 837 ncec_router_to_host(ncec_t *ncec) 838 { 839 ire_t *ire; 840 ip_stack_t *ipst = ncec->ncec_ipst; 841 842 mutex_enter(&ncec->ncec_lock); 843 ncec->ncec_flags &= ~NCE_F_ISROUTER; 844 mutex_exit(&ncec->ncec_lock); 845 846 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, 847 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL, 848 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL); 849 if (ire != NULL) { 850 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 851 ire_delete(ire); 852 ire_refrele(ire); 853 } 854 } 855 856 /* 857 * Process passed in parameters either from an incoming packet or via 858 * user ioctl. 859 */ 860 void 861 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) 862 { 863 ill_t *ill = ncec->ncec_ill; 864 uint32_t hw_addr_len = ill->ill_phys_addr_length; 865 boolean_t ll_updated = B_FALSE; 866 boolean_t ll_changed; 867 nce_t *nce; 868 869 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 870 /* 871 * No updates of link layer address or the neighbor state is 872 * allowed, when the cache is in NONUD state. This still 873 * allows for responding to reachability solicitation. 874 */ 875 mutex_enter(&ncec->ncec_lock); 876 if (ncec->ncec_state == ND_INCOMPLETE) { 877 if (hw_addr == NULL) { 878 mutex_exit(&ncec->ncec_lock); 879 return; 880 } 881 nce_set_ll(ncec, hw_addr); 882 /* 883 * Update ncec state and send the queued packets 884 * back to ip this time ire will be added. 
885 */ 886 if (flag & ND_NA_FLAG_SOLICITED) { 887 nce_update(ncec, ND_REACHABLE, NULL); 888 } else { 889 nce_update(ncec, ND_STALE, NULL); 890 } 891 mutex_exit(&ncec->ncec_lock); 892 nce = nce_fastpath(ncec, B_TRUE, NULL); 893 nce_resolv_ok(ncec); 894 if (nce != NULL) 895 nce_refrele(nce); 896 return; 897 } 898 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len); 899 if (!is_adv) { 900 /* If this is a SOLICITATION request only */ 901 if (ll_changed) 902 nce_update(ncec, ND_STALE, hw_addr); 903 mutex_exit(&ncec->ncec_lock); 904 ncec_cb_dispatch(ncec); 905 return; 906 } 907 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { 908 /* If in any other state than REACHABLE, ignore */ 909 if (ncec->ncec_state == ND_REACHABLE) { 910 nce_update(ncec, ND_STALE, NULL); 911 } 912 mutex_exit(&ncec->ncec_lock); 913 ncec_cb_dispatch(ncec); 914 return; 915 } else { 916 if (ll_changed) { 917 nce_update(ncec, ND_UNCHANGED, hw_addr); 918 ll_updated = B_TRUE; 919 } 920 if (flag & ND_NA_FLAG_SOLICITED) { 921 nce_update(ncec, ND_REACHABLE, NULL); 922 } else { 923 if (ll_updated) { 924 nce_update(ncec, ND_STALE, NULL); 925 } 926 } 927 mutex_exit(&ncec->ncec_lock); 928 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags & 929 NCE_F_ISROUTER)) { 930 ncec_router_to_host(ncec); 931 } else { 932 ncec_cb_dispatch(ncec); 933 } 934 } 935 } 936 937 /* 938 * Pass arg1 to the pfi supplied, along with each ncec in existence. 939 * ncec_walk() places a REFHOLD on the ncec and drops the lock when 940 * walking the hash list. 
941 */ 942 void 943 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, 944 boolean_t trace) 945 { 946 ncec_t *ncec; 947 ncec_t *ncec1; 948 ncec_t **ncep; 949 ncec_t *free_nce_list = NULL; 950 951 mutex_enter(&ndp->ndp_g_lock); 952 /* Prevent ncec_delete from unlink and free of NCE */ 953 ndp->ndp_g_walker++; 954 mutex_exit(&ndp->ndp_g_lock); 955 for (ncep = ndp->nce_hash_tbl; 956 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 957 for (ncec = *ncep; ncec != NULL; ncec = ncec1) { 958 ncec1 = ncec->ncec_next; 959 if (ill == NULL || ncec->ncec_ill == ill) { 960 if (trace) { 961 ncec_refhold(ncec); 962 (*pfi)(ncec, arg1); 963 ncec_refrele(ncec); 964 } else { 965 ncec_refhold_notr(ncec); 966 (*pfi)(ncec, arg1); 967 ncec_refrele_notr(ncec); 968 } 969 } 970 } 971 } 972 mutex_enter(&ndp->ndp_g_lock); 973 ndp->ndp_g_walker--; 974 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { 975 /* Time to delete condemned entries */ 976 for (ncep = ndp->nce_hash_tbl; 977 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 978 ncec = *ncep; 979 if (ncec != NULL) { 980 nce_remove(ndp, ncec, &free_nce_list); 981 } 982 } 983 ndp->ndp_g_walker_cleanup = B_FALSE; 984 } 985 986 mutex_exit(&ndp->ndp_g_lock); 987 988 if (free_nce_list != NULL) { 989 nce_cleanup_list(free_nce_list); 990 } 991 } 992 993 /* 994 * Walk everything. 995 * Note that ill can be NULL hence can't derive the ipst from it. 996 */ 997 void 998 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) 999 { 1000 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); 1001 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); 1002 } 1003 1004 /* 1005 * For each interface an entry is added for the unspecified multicast group. 1006 * Here that mapping is used to form the multicast cache entry for a particular 1007 * multicast destination. 
1008 */ 1009 static int 1010 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, 1011 uint16_t flags, nce_t **newnce) 1012 { 1013 uchar_t *hw_addr; 1014 int err = 0; 1015 ip_stack_t *ipst = ill->ill_ipst; 1016 nce_t *nce; 1017 1018 ASSERT(ill != NULL); 1019 ASSERT(ill->ill_isv6); 1020 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); 1021 1022 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1023 nce = nce_lookup_addr(ill, dst); 1024 if (nce != NULL) { 1025 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1026 goto done; 1027 } 1028 if (ill->ill_net_type == IRE_IF_RESOLVER) { 1029 /* 1030 * For IRE_IF_RESOLVER a hardware mapping can be 1031 * generated. 1032 */ 1033 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); 1034 if (hw_addr == NULL) { 1035 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1036 return (ENOMEM); 1037 } 1038 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 1039 } else { 1040 /* No hw_addr is needed for IRE_IF_NORESOLVER. */ 1041 hw_addr = NULL; 1042 } 1043 ASSERT((flags & NCE_F_MCAST) != 0); 1044 ASSERT((flags & NCE_F_NONUD) != 0); 1045 /* nce_state will be computed by nce_add_common() */ 1046 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 1047 ND_UNCHANGED, &nce); 1048 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1049 if (err == 0) 1050 err = nce_add_v6_postprocess(nce); 1051 if (hw_addr != NULL) 1052 kmem_free(hw_addr, ill->ill_nd_lla_len); 1053 if (err != 0) { 1054 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); 1055 return (err); 1056 } 1057 done: 1058 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); 1059 if (newnce != NULL) 1060 *newnce = nce; 1061 else 1062 nce_refrele(nce); 1063 return (0); 1064 } 1065 1066 /* 1067 * Return the link layer address, and any flags of a ncec. 
1068 */ 1069 int 1070 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1071 { 1072 ncec_t *ncec; 1073 in6_addr_t *addr; 1074 sin6_t *sin6; 1075 1076 ASSERT(ill != NULL && ill->ill_isv6); 1077 sin6 = (sin6_t *)&lnr->lnr_addr; 1078 addr = &sin6->sin6_addr; 1079 1080 /* 1081 * NOTE: if the ill is an IPMP interface, then match against the whole 1082 * illgrp. This e.g. allows in.ndpd to retrieve the link layer 1083 * addresses for the data addresses on an IPMP interface even though 1084 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. 1085 */ 1086 ncec = ncec_lookup_illgrp_v6(ill, addr); 1087 if (ncec == NULL) 1088 return (ESRCH); 1089 /* If no link layer address is available yet, return ESRCH */ 1090 if (!NCE_ISREACHABLE(ncec)) { 1091 ncec_refrele(ncec); 1092 return (ESRCH); 1093 } 1094 lnr->lnr_hdw_len = ill->ill_phys_addr_length; 1095 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, 1096 lnr->lnr_hdw_len); 1097 if (ncec->ncec_flags & NCE_F_ISROUTER) 1098 lnr->lnr_flags = NDF_ISROUTER_ON; 1099 if (ncec->ncec_flags & NCE_F_ANYCAST) 1100 lnr->lnr_flags |= NDF_ANYCAST_ON; 1101 ncec_refrele(ncec); 1102 return (0); 1103 } 1104 1105 /* 1106 * Finish setting up the Enable/Disable multicast for the driver. 
1107 */ 1108 mblk_t * 1109 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1110 uint32_t hw_addr_offset, mblk_t *mp) 1111 { 1112 uchar_t *hw_addr; 1113 ipaddr_t v4group; 1114 uchar_t *addr; 1115 1116 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1117 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1118 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1119 1120 ASSERT(CLASSD(v4group)); 1121 ASSERT(!(ill->ill_isv6)); 1122 1123 addr = (uchar_t *)&v4group; 1124 } else { 1125 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1126 ASSERT(ill->ill_isv6); 1127 1128 addr = (uchar_t *)v6group; 1129 } 1130 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1131 if (hw_addr == NULL) { 1132 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1133 freemsg(mp); 1134 return (NULL); 1135 } 1136 1137 ip_mcast_mapping(ill, addr, hw_addr); 1138 return (mp); 1139 } 1140 1141 void 1142 ip_ndp_resolve(ncec_t *ncec) 1143 { 1144 in_addr_t sender4 = INADDR_ANY; 1145 in6_addr_t sender6 = ipv6_all_zeros; 1146 ill_t *src_ill; 1147 uint32_t ms; 1148 1149 src_ill = nce_resolve_src(ncec, &sender6); 1150 if (src_ill == NULL) { 1151 /* Make sure we try again later */ 1152 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1153 nce_restart_timer(ncec, (clock_t)ms); 1154 return; 1155 } 1156 if (ncec->ncec_ipversion == IPV4_VERSION) 1157 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1158 mutex_enter(&ncec->ncec_lock); 1159 if (ncec->ncec_ipversion == IPV6_VERSION) 1160 ms = ndp_solicit(ncec, sender6, src_ill); 1161 else 1162 ms = arp_request(ncec, sender4, src_ill); 1163 mutex_exit(&ncec->ncec_lock); 1164 if (ms == 0) { 1165 if (ncec->ncec_state != ND_REACHABLE) { 1166 if (ncec->ncec_ipversion == IPV6_VERSION) 1167 ndp_resolv_failed(ncec); 1168 else 1169 arp_resolv_failed(ncec); 1170 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1171 nce_make_unreachable(ncec); 1172 ncec_delete(ncec); 1173 } 1174 } else { 1175 nce_restart_timer(ncec, (clock_t)ms); 1176 } 1177 done: 1178 ill_refrele(src_ill); 1179 } 1180 
1181 /* 1182 * Send an IPv6 neighbor solicitation. 1183 * Returns number of milliseconds after which we should either rexmit or abort. 1184 * Return of zero means we should abort. 1185 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt. 1186 * The optional source address is used as a hint to ndp_solicit for 1187 * which source to use in the packet. 1188 * 1189 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending 1190 * the packet. 1191 */ 1192 uint32_t 1193 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill) 1194 { 1195 in6_addr_t dst; 1196 boolean_t dropped = B_FALSE; 1197 1198 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 1199 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 1200 1201 if (ncec->ncec_rcnt == 0) 1202 return (0); 1203 1204 dst = ncec->ncec_addr; 1205 ncec->ncec_rcnt--; 1206 mutex_exit(&ncec->ncec_lock); 1207 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr, 1208 ill->ill_phys_addr_length, &src, &dst, 0); 1209 mutex_enter(&ncec->ncec_lock); 1210 if (dropped) 1211 ncec->ncec_rcnt++; 1212 return (ncec->ncec_ill->ill_reachable_retrans_time); 1213 } 1214 1215 /* 1216 * Attempt to recover an address on an interface that's been marked as a 1217 * duplicate. Because NCEs are destroyed when the interface goes down, there's 1218 * no easy way to just probe the address and have the right thing happen if 1219 * it's no longer in use. Instead, we just bring it up normally and allow the 1220 * regular interface start-up logic to probe for a remaining duplicate and take 1221 * us back down if necessary. 1222 * Neither DHCP nor temporary addresses arrive here; they're excluded by 1223 * ip_ndp_excl. 
1224 */ 1225 /* ARGSUSED */ 1226 void 1227 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1228 { 1229 ill_t *ill = rq->q_ptr; 1230 ipif_t *ipif; 1231 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr; 1232 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr; 1233 boolean_t addr_equal; 1234 1235 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1236 /* 1237 * We do not support recovery of proxy ARP'd interfaces, 1238 * because the system lacks a complete proxy ARP mechanism. 1239 */ 1240 if (ill->ill_isv6) { 1241 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 1242 addr6); 1243 } else { 1244 addr_equal = (ipif->ipif_lcl_addr == *addr4); 1245 } 1246 1247 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal) 1248 continue; 1249 1250 /* 1251 * If we have already recovered or if the interface is going 1252 * away, then ignore. 1253 */ 1254 mutex_enter(&ill->ill_lock); 1255 if (!(ipif->ipif_flags & IPIF_DUPLICATE) || 1256 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1257 mutex_exit(&ill->ill_lock); 1258 continue; 1259 } 1260 1261 ipif->ipif_flags &= ~IPIF_DUPLICATE; 1262 ill->ill_ipif_dup_count--; 1263 mutex_exit(&ill->ill_lock); 1264 ipif->ipif_was_dup = B_TRUE; 1265 1266 if (ill->ill_isv6) { 1267 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); 1268 (void) ipif_up_done_v6(ipif); 1269 } else { 1270 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) != 1271 EINPROGRESS); 1272 (void) ipif_up_done(ipif); 1273 } 1274 } 1275 freeb(mp); 1276 } 1277 1278 /* 1279 * Attempt to recover an IPv6 interface that's been shut down as a duplicate. 1280 * As long as someone else holds the address, the interface will stay down. 1281 * When that conflict goes away, the interface is brought back up. This is 1282 * done so that accidental shutdowns of addresses aren't made permanent. Your 1283 * server will recover from a failure. 1284 * 1285 * For DHCP and temporary addresses, recovery is not done in the kernel. 
1286 * Instead, it's handled by user space processes (dhcpagent and in.ndpd). 1287 * 1288 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 1289 */ 1290 void 1291 ipif_dup_recovery(void *arg) 1292 { 1293 ipif_t *ipif = arg; 1294 1295 ipif->ipif_recovery_id = 0; 1296 if (!(ipif->ipif_flags & IPIF_DUPLICATE)) 1297 return; 1298 1299 /* 1300 * No lock, because this is just an optimization. 1301 */ 1302 if (ipif->ipif_state_flags & IPIF_CONDEMNED) 1303 return; 1304 1305 /* If the link is down, we'll retry this later */ 1306 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 1307 return; 1308 1309 ipif_do_recovery(ipif); 1310 } 1311 1312 /* 1313 * Perform interface recovery by forcing the duplicate interfaces up and 1314 * allowing the system to determine which ones should stay up. 1315 * 1316 * Called both by recovery timer expiry and link-up notification. 1317 */ 1318 void 1319 ipif_do_recovery(ipif_t *ipif) 1320 { 1321 ill_t *ill = ipif->ipif_ill; 1322 mblk_t *mp; 1323 ip_stack_t *ipst = ill->ill_ipst; 1324 size_t mp_size; 1325 1326 if (ipif->ipif_isv6) 1327 mp_size = sizeof (ipif->ipif_v6lcl_addr); 1328 else 1329 mp_size = sizeof (ipif->ipif_lcl_addr); 1330 mp = allocb(mp_size, BPRI_MED); 1331 if (mp == NULL) { 1332 mutex_enter(&ill->ill_lock); 1333 if (ipst->ips_ip_dup_recovery > 0 && 1334 ipif->ipif_recovery_id == 0 && 1335 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1336 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1337 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1338 } 1339 mutex_exit(&ill->ill_lock); 1340 } else { 1341 /* 1342 * A recovery timer may still be running if we got here from 1343 * ill_restart_dad(); cancel that timer. 
1344 */ 1345 if (ipif->ipif_recovery_id != 0) 1346 (void) untimeout(ipif->ipif_recovery_id); 1347 ipif->ipif_recovery_id = 0; 1348 1349 if (ipif->ipif_isv6) { 1350 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, 1351 sizeof (ipif->ipif_v6lcl_addr)); 1352 } else { 1353 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr, 1354 sizeof (ipif->ipif_lcl_addr)); 1355 } 1356 ill_refhold(ill); 1357 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP, 1358 B_FALSE); 1359 } 1360 } 1361 1362 /* 1363 * Find the MAC and IP addresses in an NA/NS message. 1364 */ 1365 static void 1366 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill, 1367 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp) 1368 { 1369 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1370 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 1371 uchar_t *addr; 1372 int alen; 1373 1374 /* icmp_inbound_v6 ensures this */ 1375 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1376 1377 addr = ira->ira_l2src; 1378 alen = ill->ill_phys_addr_length; 1379 if (alen > 0) { 1380 *haddr = addr; 1381 *haddrlenp = alen; 1382 } else { 1383 *haddr = NULL; 1384 *haddrlenp = 0; 1385 } 1386 1387 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ 1388 *targp = ns->nd_ns_target; 1389 } 1390 1391 /* 1392 * This is for exclusive changes due to NDP duplicate address detection 1393 * failure. 
1394 */ 1395 /* ARGSUSED */ 1396 static void 1397 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1398 { 1399 ill_t *ill = rq->q_ptr; 1400 ipif_t *ipif; 1401 uchar_t *haddr; 1402 uint_t haddrlen; 1403 ip_stack_t *ipst = ill->ill_ipst; 1404 in6_addr_t targ; 1405 ip_recv_attr_t iras; 1406 mblk_t *attrmp; 1407 1408 attrmp = mp; 1409 mp = mp->b_cont; 1410 attrmp->b_cont = NULL; 1411 if (!ip_recv_attr_from_mblk(attrmp, &iras)) { 1412 /* The ill or ip_stack_t disappeared on us */ 1413 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1414 ip_drop_input("ip_recv_attr_from_mblk", mp, ill); 1415 freemsg(mp); 1416 ira_cleanup(&iras, B_TRUE); 1417 return; 1418 } 1419 1420 ASSERT(ill == iras.ira_rill); 1421 1422 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); 1423 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { 1424 /* 1425 * Ignore conflicts generated by misbehaving switches that 1426 * just reflect our own messages back to us. For IPMP, we may 1427 * see reflections across any ill in the illgrp. 1428 * 1429 * RFC2462 and revisions tried to detect both the case 1430 * when a statically configured IPv6 address is a duplicate, 1431 * and the case when the L2 address itself is a duplicate. The 1432 * later is important because, with stateles address autoconf, 1433 * if the L2 address is a duplicate, the resulting IPv6 1434 * address(es) would also be duplicates. We rely on DAD of the 1435 * IPv6 address itself to detect the latter case. 1436 */ 1437 /* For an under ill_grp can change under lock */ 1438 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1439 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || 1440 IS_UNDER_IPMP(ill) && 1441 ipmp_illgrp_find_ill(ill->ill_grp, haddr, 1442 haddrlen) != NULL) { 1443 rw_exit(&ipst->ips_ill_g_lock); 1444 goto ignore_conflict; 1445 } 1446 rw_exit(&ipst->ips_ill_g_lock); 1447 } 1448 1449 /* 1450 * Look up the appropriate ipif. 
1451 */ 1452 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); 1453 if (ipif == NULL) 1454 goto ignore_conflict; 1455 1456 /* Reload the ill to match the ipif */ 1457 ill = ipif->ipif_ill; 1458 1459 /* If it's already duplicate or ineligible, then don't do anything. */ 1460 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { 1461 ipif_refrele(ipif); 1462 goto ignore_conflict; 1463 } 1464 1465 /* 1466 * If this is a failure during duplicate recovery, then don't 1467 * complain. It may take a long time to recover. 1468 */ 1469 if (!ipif->ipif_was_dup) { 1470 char ibuf[LIFNAMSIZ]; 1471 char hbuf[MAC_STR_LEN]; 1472 char sbuf[INET6_ADDRSTRLEN]; 1473 1474 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 1475 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" 1476 " disabled", ibuf, 1477 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), 1478 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); 1479 } 1480 mutex_enter(&ill->ill_lock); 1481 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1482 ipif->ipif_flags |= IPIF_DUPLICATE; 1483 ill->ill_ipif_dup_count++; 1484 mutex_exit(&ill->ill_lock); 1485 (void) ipif_down(ipif, NULL, NULL); 1486 (void) ipif_down_tail(ipif); 1487 mutex_enter(&ill->ill_lock); 1488 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1489 ill->ill_net_type == IRE_IF_RESOLVER && 1490 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 1491 ipst->ips_ip_dup_recovery > 0) { 1492 ASSERT(ipif->ipif_recovery_id == 0); 1493 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1494 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1495 } 1496 mutex_exit(&ill->ill_lock); 1497 ipif_refrele(ipif); 1498 1499 ignore_conflict: 1500 freemsg(mp); 1501 ira_cleanup(&iras, B_TRUE); 1502 } 1503 1504 /* 1505 * Handle failure by tearing down the ipifs with the specified address. Note 1506 * that tearing down the ipif also means deleting the ncec through ipif_down, so 1507 * it's not possible to do recovery by just restarting the ncec timer. 
Instead, 1508 * we start a timer on the ipif. 1509 * Caller has to free mp; 1510 */ 1511 static void 1512 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) 1513 { 1514 const uchar_t *haddr; 1515 ill_t *ill = ira->ira_rill; 1516 1517 /* 1518 * Ignore conflicts generated by misbehaving switches that just 1519 * reflect our own messages back to us. 1520 */ 1521 1522 /* icmp_inbound_v6 ensures this */ 1523 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1524 haddr = ira->ira_l2src; 1525 if (haddr != NULL && 1526 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { 1527 return; 1528 } 1529 1530 if ((mp = copymsg(mp)) != NULL) { 1531 mblk_t *attrmp; 1532 1533 attrmp = ip_recv_attr_to_mblk(ira); 1534 if (attrmp == NULL) { 1535 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1536 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1537 freemsg(mp); 1538 } else { 1539 ASSERT(attrmp->b_cont == NULL); 1540 attrmp->b_cont = mp; 1541 mp = attrmp; 1542 ill_refhold(ill); 1543 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, 1544 B_FALSE); 1545 } 1546 } 1547 } 1548 1549 /* 1550 * Handle a discovered conflict: some other system is advertising that it owns 1551 * one of our IP addresses. We need to defend ourselves, or just shut down the 1552 * interface. 1553 * 1554 * Handles both IPv4 and IPv6 1555 */ 1556 boolean_t 1557 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) 1558 { 1559 ipif_t *ipif; 1560 clock_t now; 1561 uint_t maxdefense; 1562 uint_t defs; 1563 ill_t *ill = ira->ira_ill; 1564 ip_stack_t *ipst = ill->ill_ipst; 1565 uint32_t elapsed; 1566 boolean_t isv6 = ill->ill_isv6; 1567 ipaddr_t ncec_addr; 1568 1569 if (isv6) { 1570 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, 1571 ipst); 1572 } else { 1573 if (arp_no_defense) { 1574 /* 1575 * Yes, there is a conflict, but no, we do not 1576 * defend ourself. 
1577 */ 1578 return (B_TRUE); 1579 } 1580 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 1581 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, 1582 ipst); 1583 } 1584 if (ipif == NULL) 1585 return (B_FALSE); 1586 1587 /* 1588 * First, figure out if this address is disposable. 1589 */ 1590 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) 1591 maxdefense = ipst->ips_ip_max_temp_defend; 1592 else 1593 maxdefense = ipst->ips_ip_max_defend; 1594 1595 /* 1596 * Now figure out how many times we've defended ourselves. Ignore 1597 * defenses that happened long in the past. 1598 */ 1599 now = ddi_get_lbolt(); 1600 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; 1601 mutex_enter(&ncec->ncec_lock); 1602 if ((defs = ncec->ncec_defense_count) > 0 && 1603 elapsed > ipst->ips_ip_defend_interval) { 1604 /* 1605 * ip_defend_interval has elapsed. 1606 * reset the defense count. 1607 */ 1608 ncec->ncec_defense_count = defs = 0; 1609 } 1610 ncec->ncec_defense_count++; 1611 ncec->ncec_last_time_defended = now; 1612 mutex_exit(&ncec->ncec_lock); 1613 ipif_refrele(ipif); 1614 1615 /* 1616 * If we've defended ourselves too many times already, then give up and 1617 * tear down the interface(s) using this address. 1618 * Otherwise, caller has to defend by sending out an announce. 1619 */ 1620 if (defs >= maxdefense) { 1621 if (isv6) 1622 ndp_failure(mp, ira); 1623 else 1624 arp_failure(mp, ira); 1625 } else { 1626 return (B_TRUE); /* caller must defend this address */ 1627 } 1628 return (B_FALSE); 1629 } 1630 1631 /* 1632 * Handle reception of Neighbor Solicitation messages. 
1633 */ 1634 static void 1635 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) 1636 { 1637 ill_t *ill = ira->ira_ill, *under_ill; 1638 nd_neighbor_solicit_t *ns; 1639 uint32_t hlen = ill->ill_phys_addr_length; 1640 uchar_t *haddr = NULL; 1641 icmp6_t *icmp_nd; 1642 ip6_t *ip6h; 1643 ncec_t *our_ncec = NULL; 1644 in6_addr_t target; 1645 in6_addr_t src; 1646 int len; 1647 int flag = 0; 1648 nd_opt_hdr_t *opt = NULL; 1649 boolean_t bad_solicit = B_FALSE; 1650 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1651 boolean_t need_ill_refrele = B_FALSE; 1652 1653 ip6h = (ip6_t *)mp->b_rptr; 1654 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1655 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1656 src = ip6h->ip6_src; 1657 ns = (nd_neighbor_solicit_t *)icmp_nd; 1658 target = ns->nd_ns_target; 1659 if (IN6_IS_ADDR_MULTICAST(&target)) { 1660 if (ip_debug > 2) { 1661 /* ip1dbg */ 1662 pr_addr_dbg("ndp_input_solicit: Target is" 1663 " multicast! %s\n", AF_INET6, &target); 1664 } 1665 bad_solicit = B_TRUE; 1666 goto done; 1667 } 1668 if (len > sizeof (nd_neighbor_solicit_t)) { 1669 /* Options present */ 1670 opt = (nd_opt_hdr_t *)&ns[1]; 1671 len -= sizeof (nd_neighbor_solicit_t); 1672 if (!ndp_verify_optlen(opt, len)) { 1673 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1674 bad_solicit = B_TRUE; 1675 goto done; 1676 } 1677 } 1678 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1679 /* Check to see if this is a valid DAD solicitation */ 1680 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1681 if (ip_debug > 2) { 1682 /* ip1dbg */ 1683 pr_addr_dbg("ndp_input_solicit: IPv6 " 1684 "Destination is not solicited node " 1685 "multicast %s\n", AF_INET6, 1686 &ip6h->ip6_dst); 1687 } 1688 bad_solicit = B_TRUE; 1689 goto done; 1690 } 1691 } 1692 1693 /* 1694 * NOTE: with IPMP, it's possible the nominated multicast ill (which 1695 * received this packet if it's multicast) is not the ill tied to 1696 * e.g. the IPMP ill's data link-local. 
So we match across the illgrp 1697 * to ensure we find the associated NCE. 1698 */ 1699 our_ncec = ncec_lookup_illgrp_v6(ill, &target); 1700 /* 1701 * If this is a valid Solicitation for an address we are publishing, 1702 * then a PUBLISH entry should exist in the cache 1703 */ 1704 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { 1705 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1706 "ifname=%s ", ill->ill_name)); 1707 if (ip_debug > 2) { 1708 /* ip1dbg */ 1709 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1710 } 1711 if (our_ncec == NULL) 1712 bad_solicit = B_TRUE; 1713 goto done; 1714 } 1715 1716 /* At this point we should have a verified NS per spec */ 1717 if (opt != NULL) { 1718 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1719 if (opt != NULL) { 1720 haddr = (uchar_t *)&opt[1]; 1721 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1722 hlen == 0) { 1723 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1724 bad_solicit = B_TRUE; 1725 goto done; 1726 } 1727 } 1728 } 1729 1730 /* If sending directly to peer, set the unicast flag */ 1731 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1732 flag |= NDP_UNICAST; 1733 1734 /* 1735 * Create/update the entry for the soliciting node on the ipmp_ill. 1736 * or respond to outstanding queries, don't if 1737 * the source is unspecified address. 1738 */ 1739 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1740 int err; 1741 nce_t *nnce; 1742 1743 ASSERT(ill->ill_isv6); 1744 /* 1745 * Regular solicitations *must* include the Source Link-Layer 1746 * Address option. Ignore messages that do not. 1747 */ 1748 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1749 ip1dbg(("ndp_input_solicit: source link-layer address " 1750 "option missing with a specified source.\n")); 1751 bad_solicit = B_TRUE; 1752 goto done; 1753 } 1754 1755 /* 1756 * This is a regular solicitation. If we're still in the 1757 * process of verifying the address, then don't respond at all 1758 * and don't keep track of the sender. 
1759 */ 1760 if (our_ncec->ncec_state == ND_PROBE) 1761 goto done; 1762 1763 /* 1764 * If the solicitation doesn't have sender hardware address 1765 * (legal for unicast solicitation), then process without 1766 * installing the return NCE. Either we already know it, or 1767 * we'll be forced to look it up when (and if) we reply to the 1768 * packet. 1769 */ 1770 if (haddr == NULL) 1771 goto no_source; 1772 1773 under_ill = ill; 1774 if (IS_UNDER_IPMP(under_ill)) { 1775 ill = ipmp_ill_hold_ipmp_ill(under_ill); 1776 if (ill == NULL) 1777 ill = under_ill; 1778 else 1779 need_ill_refrele = B_TRUE; 1780 } 1781 err = nce_lookup_then_add_v6(ill, 1782 haddr, hlen, 1783 &src, /* Soliciting nodes address */ 1784 0, 1785 ND_STALE, 1786 &nnce); 1787 1788 if (need_ill_refrele) { 1789 ill_refrele(ill); 1790 ill = under_ill; 1791 need_ill_refrele = B_FALSE; 1792 } 1793 switch (err) { 1794 case 0: 1795 /* done with this entry */ 1796 nce_refrele(nnce); 1797 break; 1798 case EEXIST: 1799 /* 1800 * B_FALSE indicates this is not an an advertisement. 1801 */ 1802 nce_process(nnce->nce_common, haddr, 0, B_FALSE); 1803 nce_refrele(nnce); 1804 break; 1805 default: 1806 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 1807 err)); 1808 goto done; 1809 } 1810 no_source: 1811 flag |= NDP_SOLICITED; 1812 } else { 1813 /* 1814 * No source link layer address option should be present in a 1815 * valid DAD request. 1816 */ 1817 if (haddr != NULL) { 1818 ip1dbg(("ndp_input_solicit: source link-layer address " 1819 "option present with an unspecified source.\n")); 1820 bad_solicit = B_TRUE; 1821 goto done; 1822 } 1823 if (our_ncec->ncec_state == ND_PROBE) { 1824 /* 1825 * Internally looped-back probes will have 1826 * IRAF_L2SRC_LOOPBACK set so we can ignore our own 1827 * transmissions. 1828 */ 1829 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { 1830 /* 1831 * If someone else is probing our address, then 1832 * we've crossed wires. Declare failure. 
1833 */ 1834 ndp_failure(mp, ira); 1835 } 1836 goto done; 1837 } 1838 /* 1839 * This is a DAD probe. Multicast the advertisement to the 1840 * all-nodes address. 1841 */ 1842 src = ipv6_all_hosts_mcast; 1843 } 1844 flag |= nce_advert_flags(our_ncec); 1845 (void) ndp_xmit(ill, 1846 ND_NEIGHBOR_ADVERT, 1847 our_ncec->ncec_lladdr, 1848 our_ncec->ncec_lladdr_length, 1849 &target, /* Source and target of the advertisement pkt */ 1850 &src, /* IP Destination (source of original pkt) */ 1851 flag); 1852 done: 1853 if (bad_solicit) 1854 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); 1855 if (our_ncec != NULL) 1856 ncec_refrele(our_ncec); 1857 } 1858 1859 /* 1860 * Handle reception of Neighbor Solicitation messages 1861 */ 1862 void 1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) 1864 { 1865 ill_t *ill = ira->ira_ill; 1866 nd_neighbor_advert_t *na; 1867 uint32_t hlen = ill->ill_phys_addr_length; 1868 uchar_t *haddr = NULL; 1869 icmp6_t *icmp_nd; 1870 ip6_t *ip6h; 1871 ncec_t *dst_ncec = NULL; 1872 in6_addr_t target; 1873 nd_opt_hdr_t *opt = NULL; 1874 int len; 1875 ip_stack_t *ipst = ill->ill_ipst; 1876 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1877 1878 ip6h = (ip6_t *)mp->b_rptr; 1879 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1880 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1881 na = (nd_neighbor_advert_t *)icmp_nd; 1882 1883 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && 1884 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { 1885 ip1dbg(("ndp_input_advert: Target is multicast but the " 1886 "solicited flag is not zero\n")); 1887 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1888 return; 1889 } 1890 target = na->nd_na_target; 1891 if (IN6_IS_ADDR_MULTICAST(&target)) { 1892 ip1dbg(("ndp_input_advert: Target is multicast!\n")); 1893 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1894 return; 1895 } 1896 if (len > sizeof (nd_neighbor_advert_t)) { 1897 opt = (nd_opt_hdr_t *)&na[1]; 1898 if (!ndp_verify_optlen(opt, 1899 len - 
sizeof (nd_neighbor_advert_t))) { 1900 ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); 1901 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1902 return; 1903 } 1904 /* At this point we have a verified NA per spec */ 1905 len -= sizeof (nd_neighbor_advert_t); 1906 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); 1907 if (opt != NULL) { 1908 haddr = (uchar_t *)&opt[1]; 1909 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1910 hlen == 0) { 1911 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1912 BUMP_MIB(mib, 1913 ipv6IfIcmpInBadNeighborAdvertisements); 1914 return; 1915 } 1916 } 1917 } 1918 1919 /* 1920 * NOTE: we match across the illgrp since we need to do DAD for all of 1921 * our local addresses, and those are spread across all the active 1922 * ills in the group. 1923 */ 1924 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) 1925 return; 1926 1927 if (NCE_PUBLISH(dst_ncec)) { 1928 /* 1929 * Someone just advertised an addresses that we publish. First, 1930 * check it it was us -- if so, we can safely ignore it. 1931 * We don't get the haddr from the ira_l2src because, in the 1932 * case that the packet originated from us, on an IPMP group, 1933 * the ira_l2src may would be the link-layer address of the 1934 * cast_ill used to send the packet, which may not be the same 1935 * as the dst_ncec->ncec_lladdr of the address. 1936 */ 1937 if (haddr != NULL) { 1938 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) 1939 goto out; 1940 1941 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) 1942 goto out; /* from us -- no conflict */ 1943 1944 /* 1945 * If we're in an IPMP group, check if this is an echo 1946 * from another ill in the group. Use the double- 1947 * checked locking pattern to avoid grabbing 1948 * ill_g_lock in the non-IPMP case. 
1949 */ 1950 if (IS_UNDER_IPMP(ill)) { 1951 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1952 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( 1953 ill->ill_grp, haddr, hlen) != NULL) { 1954 rw_exit(&ipst->ips_ill_g_lock); 1955 goto out; 1956 } 1957 rw_exit(&ipst->ips_ill_g_lock); 1958 } 1959 } 1960 1961 /* 1962 * This appears to be a real conflict. If we're trying to 1963 * configure this NCE (ND_PROBE), then shut it down. 1964 * Otherwise, handle the discovered conflict. 1965 */ 1966 if (dst_ncec->ncec_state == ND_PROBE) { 1967 ndp_failure(mp, ira); 1968 } else { 1969 if (ip_nce_conflict(mp, ira, dst_ncec)) { 1970 char hbuf[MAC_STR_LEN]; 1971 char sbuf[INET6_ADDRSTRLEN]; 1972 1973 cmn_err(CE_WARN, 1974 "node '%s' is using %s on %s", 1975 inet_ntop(AF_INET6, &target, sbuf, 1976 sizeof (sbuf)), 1977 haddr == NULL ? "<none>" : 1978 mac_colon_addr(haddr, hlen, hbuf, 1979 sizeof (hbuf)), ill->ill_name); 1980 /* 1981 * RFC 4862, Section 5.4.4 does not mandate 1982 * any specific behavior when an NA matches 1983 * a non-tentative address assigned to the 1984 * receiver. We make the choice of defending 1985 * our address, based on the assumption that 1986 * the sender has not detected the Duplicate. 1987 * 1988 * ncec_last_time_defended has been adjusted 1989 * in ip_nce_conflict() 1990 */ 1991 (void) ndp_announce(dst_ncec); 1992 } 1993 } 1994 } else { 1995 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) 1996 dst_ncec->ncec_flags |= NCE_F_ISROUTER; 1997 1998 /* B_TRUE indicates this an advertisement */ 1999 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); 2000 } 2001 out: 2002 ncec_refrele(dst_ncec); 2003 } 2004 2005 /* 2006 * Process NDP neighbor solicitation/advertisement messages. 2007 * The checksum has already checked o.k before reaching here. 2008 * Information about the datalink header is contained in ira_l2src, but 2009 * that should be ignored for loopback packets. 
2010 */ 2011 void 2012 ndp_input(mblk_t *mp, ip_recv_attr_t *ira) 2013 { 2014 ill_t *ill = ira->ira_rill; 2015 icmp6_t *icmp_nd; 2016 ip6_t *ip6h; 2017 int len; 2018 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2019 ill_t *orig_ill = NULL; 2020 2021 /* 2022 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill 2023 * and make it be the IPMP upper so avoid being confused by a packet 2024 * addressed to a unicast address on a different ill. 2025 */ 2026 if (IS_UNDER_IPMP(ill)) { 2027 orig_ill = ill; 2028 ill = ipmp_ill_hold_ipmp_ill(orig_ill); 2029 if (ill == NULL) { 2030 ill = orig_ill; 2031 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2032 ip_drop_input("ipIfStatsInDiscards - IPMP ill", 2033 mp, ill); 2034 freemsg(mp); 2035 return; 2036 } 2037 ASSERT(ill != orig_ill); 2038 orig_ill = ira->ira_ill; 2039 ira->ira_ill = ill; 2040 mib = ill->ill_icmp6_mib; 2041 } 2042 if (!pullupmsg(mp, -1)) { 2043 ip1dbg(("ndp_input: pullupmsg failed\n")); 2044 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2045 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill); 2046 goto done; 2047 } 2048 ip6h = (ip6_t *)mp->b_rptr; 2049 if (ip6h->ip6_hops != IPV6_MAX_HOPS) { 2050 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); 2051 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill); 2052 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); 2053 goto done; 2054 } 2055 /* 2056 * NDP does not accept any extension headers between the 2057 * IP header and the ICMP header since e.g. a routing 2058 * header could be dangerous. 2059 * This assumes that any AH or ESP headers are removed 2060 * by ip prior to passing the packet to ndp_input. 
2061 */ 2062 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { 2063 ip1dbg(("ndp_input: Wrong next header 0x%x\n", 2064 ip6h->ip6_nxt)); 2065 ip_drop_input("Wrong next header", mp, ill); 2066 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2067 goto done; 2068 } 2069 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2070 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT || 2071 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); 2072 if (icmp_nd->icmp6_code != 0) { 2073 ip1dbg(("ndp_input: icmp6 code != 0 \n")); 2074 ip_drop_input("code non-zero", mp, ill); 2075 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2076 goto done; 2077 } 2078 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2079 /* 2080 * Make sure packet length is large enough for either 2081 * a NS or a NA icmp packet. 2082 */ 2083 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { 2084 ip1dbg(("ndp_input: packet too short\n")); 2085 ip_drop_input("packet too short", mp, ill); 2086 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2087 goto done; 2088 } 2089 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { 2090 ndp_input_solicit(mp, ira); 2091 } else { 2092 ndp_input_advert(mp, ira); 2093 } 2094 done: 2095 freemsg(mp); 2096 if (orig_ill != NULL) { 2097 ill_refrele(ill); 2098 ira->ira_ill = orig_ill; 2099 } 2100 } 2101 2102 /* 2103 * ndp_xmit is called to form and transmit a ND solicitation or 2104 * advertisement ICMP packet. 2105 * 2106 * If the source address is unspecified and this isn't a probe (used for 2107 * duplicate address detection), an appropriate source address and link layer 2108 * address will be chosen here. The link layer address option is included if 2109 * the source is specified (i.e., all non-probe packets), and omitted (per the 2110 * specification) otherwise. 2111 * 2112 * It returns B_FALSE only if it does a successful put() to the 2113 * corresponding ill's ill_wq otherwise returns B_TRUE. 
2114 */ 2115 static boolean_t 2116 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, 2117 const in6_addr_t *sender, const in6_addr_t *target, int flag) 2118 { 2119 uint32_t len; 2120 icmp6_t *icmp6; 2121 mblk_t *mp; 2122 ip6_t *ip6h; 2123 nd_opt_hdr_t *opt; 2124 uint_t plen; 2125 zoneid_t zoneid = GLOBAL_ZONEID; 2126 ill_t *hwaddr_ill = ill; 2127 ip_xmit_attr_t ixas; 2128 ip_stack_t *ipst = ill->ill_ipst; 2129 boolean_t need_refrele = B_FALSE; 2130 boolean_t probe = B_FALSE; 2131 2132 if (IS_UNDER_IPMP(ill)) { 2133 probe = ipif_lookup_testaddr_v6(ill, sender, NULL); 2134 /* 2135 * We send non-probe packets on the upper IPMP interface. 2136 * ip_output_simple() will use cast_ill for sending any 2137 * multicast packets. Note that we can't follow the same 2138 * logic for probe packets because all interfaces in the ipmp 2139 * group may have failed, so that we really want to only try 2140 * to send the ND packet on the ill corresponding to the src 2141 * address. 2142 */ 2143 if (!probe) { 2144 ill = ipmp_ill_hold_ipmp_ill(ill); 2145 if (ill != NULL) 2146 need_refrele = B_TRUE; 2147 else 2148 ill = hwaddr_ill; 2149 } 2150 } 2151 2152 /* 2153 * If we have a unspecified source(sender) address, select a 2154 * proper source address for the solicitation here itself so 2155 * that we can initialize the h/w address correctly. 2156 * 2157 * If the sender is specified then we use this address in order 2158 * to lookup the zoneid before calling ip_output_v6(). This is to 2159 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly 2160 * by IP (we cannot guarantee that the global zone has an interface 2161 * route to the destination). 2162 * 2163 * Note that the NA never comes here with the unspecified source 2164 * address. 2165 */ 2166 2167 /* 2168 * Probes will have unspec src at this point. 
2169 */ 2170 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { 2171 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); 2172 /* 2173 * It's possible for ipif_lookup_addr_zoneid_v6() to return 2174 * ALL_ZONES if it cannot find a matching ipif for the address 2175 * we are trying to use. In this case we err on the side of 2176 * trying to send the packet by defaulting to the GLOBAL_ZONEID. 2177 */ 2178 if (zoneid == ALL_ZONES) 2179 zoneid = GLOBAL_ZONEID; 2180 } 2181 2182 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; 2183 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; 2184 mp = allocb(len, BPRI_LO); 2185 if (mp == NULL) { 2186 if (need_refrele) 2187 ill_refrele(ill); 2188 return (B_TRUE); 2189 } 2190 2191 bzero((char *)mp->b_rptr, len); 2192 mp->b_wptr = mp->b_rptr + len; 2193 2194 bzero(&ixas, sizeof (ixas)); 2195 ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM; 2196 2197 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 2198 ixas.ixa_ipst = ipst; 2199 ixas.ixa_cred = kcred; 2200 ixas.ixa_cpid = NOPID; 2201 ixas.ixa_tsl = NULL; 2202 ixas.ixa_zoneid = zoneid; 2203 2204 ip6h = (ip6_t *)mp->b_rptr; 2205 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2206 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2207 ip6h->ip6_nxt = IPPROTO_ICMPV6; 2208 ip6h->ip6_hops = IPV6_MAX_HOPS; 2209 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 2210 ip6h->ip6_dst = *target; 2211 icmp6 = (icmp6_t *)&ip6h[1]; 2212 2213 if (hw_addr_len != 0) { 2214 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + 2215 sizeof (nd_neighbor_advert_t)); 2216 } else { 2217 opt = NULL; 2218 } 2219 if (operation == ND_NEIGHBOR_SOLICIT) { 2220 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 2221 2222 if (opt != NULL && !(flag & NDP_PROBE)) { 2223 /* 2224 * Note that we don't send out SLLA for ND probes 2225 * per RFC 4862, even though we do send out the src 2226 * haddr for IPv4 DAD probes, even though both IPv4 2227 * and IPv6 go out with the unspecified/INADDR_ANY 2228 * src IP 
addr. 2229 */ 2230 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; 2231 } 2232 ip6h->ip6_src = *sender; 2233 ns->nd_ns_target = *target; 2234 if (!(flag & NDP_UNICAST)) { 2235 /* Form multicast address of the target */ 2236 ip6h->ip6_dst = ipv6_solicited_node_mcast; 2237 ip6h->ip6_dst.s6_addr32[3] |= 2238 ns->nd_ns_target.s6_addr32[3]; 2239 } 2240 } else { 2241 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; 2242 2243 ASSERT(!(flag & NDP_PROBE)); 2244 if (opt != NULL) 2245 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; 2246 ip6h->ip6_src = *sender; 2247 na->nd_na_target = *sender; 2248 if (flag & NDP_ISROUTER) 2249 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; 2250 if (flag & NDP_SOLICITED) 2251 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; 2252 if (flag & NDP_ORIDE) 2253 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; 2254 } 2255 2256 if (!(flag & NDP_PROBE)) { 2257 if (hw_addr != NULL && opt != NULL) { 2258 /* Fill in link layer address and option len */ 2259 opt->nd_opt_len = (uint8_t)plen; 2260 bcopy(hw_addr, &opt[1], hw_addr_len); 2261 } 2262 } 2263 if (opt != NULL && opt->nd_opt_type == 0) { 2264 /* If there's no link layer address option, then strip it. */ 2265 len -= plen * 8; 2266 mp->b_wptr = mp->b_rptr + len; 2267 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2268 } 2269 2270 icmp6->icmp6_type = (uint8_t)operation; 2271 icmp6->icmp6_code = 0; 2272 /* 2273 * Prepare for checksum by putting icmp length in the icmp 2274 * checksum field. The checksum is calculated in ip_output.c. 2275 */ 2276 icmp6->icmp6_cksum = ip6h->ip6_plen; 2277 2278 (void) ip_output_simple(mp, &ixas); 2279 ixa_cleanup(&ixas); 2280 if (need_refrele) 2281 ill_refrele(ill); 2282 return (B_FALSE); 2283 } 2284 2285 /* 2286 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 2287 * The datapath uses this as an indication that there 2288 * is a problem (as opposed to a NCE that was just 2289 * reclaimed due to lack of memory. 
2290 * Note that static ARP entries never become unreachable. 2291 */ 2292 void 2293 nce_make_unreachable(ncec_t *ncec) 2294 { 2295 mutex_enter(&ncec->ncec_lock); 2296 ncec->ncec_state = ND_UNREACHABLE; 2297 mutex_exit(&ncec->ncec_lock); 2298 } 2299 2300 /* 2301 * NCE retransmit timer. Common to IPv4 and IPv6. 2302 * This timer goes off when: 2303 * a. It is time to retransmit a resolution for resolver. 2304 * b. It is time to send reachability probes. 2305 */ 2306 void 2307 nce_timer(void *arg) 2308 { 2309 ncec_t *ncec = arg; 2310 ill_t *ill = ncec->ncec_ill, *src_ill; 2311 char addrbuf[INET6_ADDRSTRLEN]; 2312 boolean_t dropped = B_FALSE; 2313 ip_stack_t *ipst = ncec->ncec_ipst; 2314 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2315 in_addr_t sender4 = INADDR_ANY; 2316 in6_addr_t sender6 = ipv6_all_zeros; 2317 2318 /* 2319 * The timer has to be cancelled by ncec_delete before doing the final 2320 * refrele. So the NCE is guaranteed to exist when the timer runs 2321 * until it clears the timeout_id. Before clearing the timeout_id 2322 * bump up the refcnt so that we can continue to use the ncec 2323 */ 2324 ASSERT(ncec != NULL); 2325 mutex_enter(&ncec->ncec_lock); 2326 ncec_refhold_locked(ncec); 2327 ncec->ncec_timeout_id = 0; 2328 mutex_exit(&ncec->ncec_lock); 2329 2330 src_ill = nce_resolve_src(ncec, &sender6); 2331 /* if we could not find a sender address, return */ 2332 if (src_ill == NULL) { 2333 if (!isv6) { 2334 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); 2335 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, 2336 &sender4, addrbuf, sizeof (addrbuf)))); 2337 } else { 2338 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, 2339 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2340 } 2341 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2342 ncec_refrele(ncec); 2343 return; 2344 } 2345 if (!isv6) 2346 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 2347 2348 mutex_enter(&ncec->ncec_lock); 2349 /* 2350 * Check the reachability state. 
2351 */ 2352 switch (ncec->ncec_state) { 2353 case ND_DELAY: 2354 ASSERT(ncec->ncec_lladdr != NULL); 2355 ncec->ncec_state = ND_PROBE; 2356 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2357 if (isv6) { 2358 mutex_exit(&ncec->ncec_lock); 2359 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 2360 src_ill->ill_phys_addr, 2361 src_ill->ill_phys_addr_length, 2362 &sender6, &ncec->ncec_addr, 2363 NDP_UNICAST); 2364 } else { 2365 dropped = arp_request(ncec, sender4, src_ill); 2366 mutex_exit(&ncec->ncec_lock); 2367 } 2368 if (!dropped) { 2369 mutex_enter(&ncec->ncec_lock); 2370 ncec->ncec_pcnt--; 2371 mutex_exit(&ncec->ncec_lock); 2372 } 2373 if (ip_debug > 3) { 2374 /* ip2dbg */ 2375 pr_addr_dbg("nce_timer: state for %s changed " 2376 "to PROBE\n", AF_INET6, &ncec->ncec_addr); 2377 } 2378 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2379 break; 2380 case ND_PROBE: 2381 /* must be retransmit timer */ 2382 ASSERT(ncec->ncec_pcnt >= -1); 2383 if (ncec->ncec_pcnt > 0) { 2384 /* 2385 * As per RFC2461, the ncec gets deleted after 2386 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. 2387 * Note that the first unicast solicitation is sent 2388 * during the DELAY state. 2389 */ 2390 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2391 ncec->ncec_pcnt, 2392 inet_ntop((isv6? 
AF_INET6 : AF_INET), 2393 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2394 if (NCE_PUBLISH(ncec)) { 2395 mutex_exit(&ncec->ncec_lock); 2396 /* 2397 * send out a probe; note that src_ill 2398 * is ignored by nce_dad() for all 2399 * DAD message types other than IPv6 2400 * unicast probes 2401 */ 2402 nce_dad(ncec, src_ill, B_TRUE); 2403 } else { 2404 ASSERT(src_ill != NULL); 2405 if (isv6) { 2406 mutex_exit(&ncec->ncec_lock); 2407 dropped = ndp_xmit(src_ill, 2408 ND_NEIGHBOR_SOLICIT, 2409 src_ill->ill_phys_addr, 2410 src_ill->ill_phys_addr_length, 2411 &sender6, &ncec->ncec_addr, 2412 NDP_UNICAST); 2413 } else { 2414 /* 2415 * since the nce is REACHABLE, 2416 * the ARP request will be sent out 2417 * as a link-layer unicast. 2418 */ 2419 dropped = arp_request(ncec, sender4, 2420 src_ill); 2421 mutex_exit(&ncec->ncec_lock); 2422 } 2423 if (!dropped) { 2424 mutex_enter(&ncec->ncec_lock); 2425 ncec->ncec_pcnt--; 2426 mutex_exit(&ncec->ncec_lock); 2427 } 2428 nce_restart_timer(ncec, 2429 ill->ill_reachable_retrans_time); 2430 } 2431 } else if (ncec->ncec_pcnt < 0) { 2432 /* No hope, delete the ncec */ 2433 /* Tell datapath it went bad */ 2434 ncec->ncec_state = ND_UNREACHABLE; 2435 mutex_exit(&ncec->ncec_lock); 2436 if (ip_debug > 2) { 2437 /* ip1dbg */ 2438 pr_addr_dbg("nce_timer: Delete NCE for" 2439 " dst %s\n", (isv6? AF_INET6: AF_INET), 2440 &ncec->ncec_addr); 2441 } 2442 /* if static ARP can't delete. */ 2443 if ((ncec->ncec_flags & NCE_F_STATIC) == 0) 2444 ncec_delete(ncec); 2445 2446 } else if (!NCE_PUBLISH(ncec)) { 2447 /* 2448 * Probe count is 0 for a dynamic entry (one that we 2449 * ourselves are not publishing). We should never get 2450 * here if NONUD was requested, hence the ASSERT below. 
2451 */ 2452 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); 2453 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2454 ncec->ncec_pcnt, inet_ntop(AF_INET6, 2455 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2456 ncec->ncec_pcnt--; 2457 mutex_exit(&ncec->ncec_lock); 2458 /* Wait one interval before killing */ 2459 nce_restart_timer(ncec, 2460 ill->ill_reachable_retrans_time); 2461 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { 2462 ipif_t *ipif; 2463 ipaddr_t ncec_addr; 2464 2465 /* 2466 * We're done probing, and we can now declare this 2467 * address to be usable. Let IP know that it's ok to 2468 * use. 2469 */ 2470 ncec->ncec_state = ND_REACHABLE; 2471 ncec->ncec_flags &= ~NCE_F_UNVERIFIED; 2472 mutex_exit(&ncec->ncec_lock); 2473 if (isv6) { 2474 ipif = ipif_lookup_addr_exact_v6( 2475 &ncec->ncec_addr, ill, ipst); 2476 } else { 2477 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 2478 ncec_addr); 2479 ipif = ipif_lookup_addr_exact(ncec_addr, ill, 2480 ipst); 2481 } 2482 if (ipif != NULL) { 2483 if (ipif->ipif_was_dup) { 2484 char ibuf[LIFNAMSIZ]; 2485 char sbuf[INET6_ADDRSTRLEN]; 2486 2487 ipif->ipif_was_dup = B_FALSE; 2488 (void) inet_ntop(AF_INET6, 2489 &ipif->ipif_v6lcl_addr, 2490 sbuf, sizeof (sbuf)); 2491 ipif_get_name(ipif, ibuf, 2492 sizeof (ibuf)); 2493 cmn_err(CE_NOTE, "recovered address " 2494 "%s on %s", sbuf, ibuf); 2495 } 2496 if ((ipif->ipif_flags & IPIF_UP) && 2497 !ipif->ipif_addr_ready) 2498 ipif_up_notify(ipif); 2499 ipif->ipif_addr_ready = 1; 2500 ipif_refrele(ipif); 2501 } 2502 if (!isv6 && arp_no_defense) 2503 break; 2504 /* Begin defending our new address */ 2505 if (ncec->ncec_unsolicit_count > 0) { 2506 ncec->ncec_unsolicit_count--; 2507 if (isv6) { 2508 dropped = ndp_announce(ncec); 2509 } else { 2510 dropped = arp_announce(ncec); 2511 } 2512 2513 if (dropped) 2514 ncec->ncec_unsolicit_count++; 2515 else 2516 ncec->ncec_last_time_defended = 2517 ddi_get_lbolt(); 2518 } 2519 if (ncec->ncec_unsolicit_count > 0) { 2520 nce_restart_timer(ncec, 
2521 ANNOUNCE_INTERVAL(isv6)); 2522 } else if (DEFENSE_INTERVAL(isv6) != 0) { 2523 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2524 } 2525 } else { 2526 /* 2527 * This is an address we're probing to be our own, but 2528 * the ill is down. Wait until it comes back before 2529 * doing anything, but switch to reachable state so 2530 * that the restart will work. 2531 */ 2532 ncec->ncec_state = ND_REACHABLE; 2533 mutex_exit(&ncec->ncec_lock); 2534 } 2535 break; 2536 case ND_INCOMPLETE: { 2537 mblk_t *mp, *nextmp; 2538 mblk_t **prevmpp; 2539 2540 /* 2541 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp 2542 * for any IPMP probe packets, and toss them. IPMP probe 2543 * packets will always be at the head of ncec_qd_mp, so that 2544 * we can stop at the first queued ND packet that is 2545 * not a probe packet. 2546 */ 2547 prevmpp = &ncec->ncec_qd_mp; 2548 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { 2549 nextmp = mp->b_next; 2550 2551 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { 2552 inet_freemsg(mp); 2553 ncec->ncec_nprobes--; 2554 *prevmpp = nextmp; 2555 } else { 2556 prevmpp = &mp->b_next; 2557 } 2558 } 2559 2560 /* 2561 * Must be resolver's retransmit timer. 
2562 */ 2563 mutex_exit(&ncec->ncec_lock); 2564 ip_ndp_resolve(ncec); 2565 break; 2566 } 2567 case ND_REACHABLE: 2568 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && 2569 ncec->ncec_unsolicit_count != 0) || 2570 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { 2571 if (ncec->ncec_unsolicit_count > 0) { 2572 ncec->ncec_unsolicit_count--; 2573 mutex_exit(&ncec->ncec_lock); 2574 /* 2575 * When we get to zero announcements left, 2576 * switch to address defense 2577 */ 2578 } else { 2579 boolean_t rate_limit; 2580 2581 mutex_exit(&ncec->ncec_lock); 2582 rate_limit = ill_defend_rate_limit(ill, ncec); 2583 if (rate_limit) { 2584 nce_restart_timer(ncec, 2585 DEFENSE_INTERVAL(isv6)); 2586 break; 2587 } 2588 } 2589 if (isv6) { 2590 dropped = ndp_announce(ncec); 2591 } else { 2592 dropped = arp_announce(ncec); 2593 } 2594 mutex_enter(&ncec->ncec_lock); 2595 if (dropped) { 2596 ncec->ncec_unsolicit_count++; 2597 } else { 2598 ncec->ncec_last_time_defended = 2599 ddi_get_lbolt(); 2600 } 2601 mutex_exit(&ncec->ncec_lock); 2602 if (ncec->ncec_unsolicit_count != 0) { 2603 nce_restart_timer(ncec, 2604 ANNOUNCE_INTERVAL(isv6)); 2605 } else { 2606 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2607 } 2608 } else { 2609 mutex_exit(&ncec->ncec_lock); 2610 } 2611 break; 2612 default: 2613 mutex_exit(&ncec->ncec_lock); 2614 break; 2615 } 2616 done: 2617 ncec_refrele(ncec); 2618 ill_refrele(src_ill); 2619 } 2620 2621 /* 2622 * Set a link layer address from the ll_addr passed in. 2623 * Copy SAP from ill. 2624 */ 2625 static void 2626 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) 2627 { 2628 ill_t *ill = ncec->ncec_ill; 2629 2630 ASSERT(ll_addr != NULL); 2631 if (ill->ill_phys_addr_length > 0) { 2632 /* 2633 * The bcopy() below used to be called for the physical address 2634 * length rather than the link layer address length. For 2635 * ethernet and many other media, the phys_addr and lla are 2636 * identical. 
2637 * 2638 * The phys_addr and lla may not be the same for devices that 2639 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently 2640 * no known instances of these. 2641 * 2642 * For PPP or other interfaces with a zero length 2643 * physical address, don't do anything here. 2644 * The bcopy() with a zero phys_addr length was previously 2645 * a no-op for interfaces with a zero-length physical address. 2646 * Using the lla for them would change the way they operate. 2647 * Doing nothing in such cases preserves expected behavior. 2648 */ 2649 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); 2650 } 2651 } 2652 2653 boolean_t 2654 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, 2655 uint32_t ll_addr_len) 2656 { 2657 ASSERT(ncec->ncec_lladdr != NULL); 2658 if (ll_addr == NULL) 2659 return (B_FALSE); 2660 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) 2661 return (B_TRUE); 2662 return (B_FALSE); 2663 } 2664 2665 /* 2666 * Updates the link layer address or the reachability state of 2667 * a cache entry. Reset probe counter if needed. 2668 */ 2669 void 2670 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) 2671 { 2672 ill_t *ill = ncec->ncec_ill; 2673 boolean_t need_stop_timer = B_FALSE; 2674 boolean_t need_fastpath_update = B_FALSE; 2675 nce_t *nce = NULL; 2676 timeout_id_t tid; 2677 2678 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2679 /* 2680 * If this interface does not do NUD, there is no point 2681 * in allowing an update to the cache entry. Although 2682 * we will respond to NS. 2683 * The only time we accept an update for a resolver when 2684 * NUD is turned off is when it has just been created. 2685 * Non-Resolvers will always be created as REACHABLE. 
2686 */ 2687 if (new_state != ND_UNCHANGED) { 2688 if ((ncec->ncec_flags & NCE_F_NONUD) && 2689 (ncec->ncec_state != ND_INCOMPLETE)) 2690 return; 2691 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); 2692 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); 2693 need_stop_timer = B_TRUE; 2694 if (new_state == ND_REACHABLE) 2695 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64()); 2696 else { 2697 /* We force NUD in this case */ 2698 ncec->ncec_last = 0; 2699 } 2700 ncec->ncec_state = new_state; 2701 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2702 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || 2703 new_state == ND_INCOMPLETE); 2704 } 2705 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2706 tid = ncec->ncec_timeout_id; 2707 ncec->ncec_timeout_id = 0; 2708 } 2709 /* 2710 * Re-trigger fastpath probe and 2711 * overwrite the DL_UNITDATA_REQ data, noting we'll lose 2712 * whatever packets that happens to be transmitting at the time. 2713 */ 2714 if (new_ll_addr != NULL) { 2715 bcopy(new_ll_addr, ncec->ncec_lladdr, 2716 ill->ill_phys_addr_length); 2717 need_fastpath_update = B_TRUE; 2718 } 2719 mutex_exit(&ncec->ncec_lock); 2720 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2721 if (tid != 0) 2722 (void) untimeout(tid); 2723 } 2724 if (need_fastpath_update) { 2725 /* 2726 * Delete any existing existing dlur_mp and fp_mp information. 2727 * For IPMP interfaces, all underlying ill's must be checked 2728 * and purged. 
2729 */ 2730 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 2731 /* 2732 * add the new dlur_mp and fp_mp 2733 */ 2734 nce = nce_fastpath(ncec, B_TRUE, NULL); 2735 if (nce != NULL) 2736 nce_refrele(nce); 2737 } 2738 mutex_enter(&ncec->ncec_lock); 2739 } 2740 2741 static void 2742 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2743 { 2744 uint_t count = 0; 2745 mblk_t **mpp, *tmp; 2746 2747 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2748 2749 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { 2750 if (++count > ncec->ncec_ill->ill_max_buf) { 2751 tmp = ncec->ncec_qd_mp->b_next; 2752 ncec->ncec_qd_mp->b_next = NULL; 2753 /* 2754 * if we never create data addrs on the under_ill 2755 * does this matter? 2756 */ 2757 BUMP_MIB(ncec->ncec_ill->ill_ip_mib, 2758 ipIfStatsOutDiscards); 2759 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, 2760 ncec->ncec_ill); 2761 freemsg(ncec->ncec_qd_mp); 2762 ncec->ncec_qd_mp = tmp; 2763 } 2764 } 2765 2766 if (head_insert) { 2767 ncec->ncec_nprobes++; 2768 mp->b_next = ncec->ncec_qd_mp; 2769 ncec->ncec_qd_mp = mp; 2770 } else { 2771 *mpp = mp; 2772 } 2773 } 2774 2775 /* 2776 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be 2777 * queued at the head or tail of the queue based on the input argument 2778 * 'head_insert'. The caller should specify this argument as B_TRUE if this 2779 * packet is an IPMP probe packet, in which case the following happens: 2780 * 2781 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal 2782 * (non-ipmp_probe) load-speading case where the source address of the ND 2783 * packet is not tied to ncec_ill. If the ill bound to the source address 2784 * cannot receive, the response to the ND packet will not be received. 
2785 * However, if ND packets for ncec_ill's probes are queued behind that ND 2786 * packet, those probes will also fail to be sent, and thus in.mpathd will 2787 * erroneously conclude that ncec_ill has also failed. 2788 * 2789 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on 2790 * the first attempt. This ensures that ND problems do not manifest as 2791 * probe RTT spikes. 2792 * 2793 * We achieve this by inserting ipmp_probe() packets at the head of the 2794 * nce_queue. 2795 * 2796 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, 2797 * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 2798 */ 2799 void 2800 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2801 { 2802 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2803 nce_queue_mp_common(ncec, mp, head_insert); 2804 } 2805 2806 /* 2807 * Called when address resolution failed due to a timeout. 2808 * Send an ICMP unreachable in response to all queued packets. 2809 */ 2810 void 2811 ndp_resolv_failed(ncec_t *ncec) 2812 { 2813 mblk_t *mp, *nxt_mp; 2814 char buf[INET6_ADDRSTRLEN]; 2815 ill_t *ill = ncec->ncec_ill; 2816 ip_recv_attr_t iras; 2817 2818 bzero(&iras, sizeof (iras)); 2819 iras.ira_flags = 0; 2820 /* 2821 * we are setting the ira_rill to the ipmp_ill (instead of 2822 * the actual ill on which the packet was received), but this 2823 * is ok because we don't actually need the real ira_rill. 2824 * to send the icmp unreachable to the sender. 
2825 */ 2826 iras.ira_ill = iras.ira_rill = ill; 2827 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2828 iras.ira_rifindex = iras.ira_ruifindex; 2829 2830 ip1dbg(("ndp_resolv_failed: dst %s\n", 2831 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 2832 mutex_enter(&ncec->ncec_lock); 2833 mp = ncec->ncec_qd_mp; 2834 ncec->ncec_qd_mp = NULL; 2835 ncec->ncec_nprobes = 0; 2836 mutex_exit(&ncec->ncec_lock); 2837 while (mp != NULL) { 2838 nxt_mp = mp->b_next; 2839 mp->b_next = NULL; 2840 2841 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2842 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 2843 mp, ill); 2844 icmp_unreachable_v6(mp, 2845 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); 2846 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2847 mp = nxt_mp; 2848 } 2849 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 2850 } 2851 2852 /* 2853 * Handle the completion of NDP and ARP resolution. 2854 */ 2855 void 2856 nce_resolv_ok(ncec_t *ncec) 2857 { 2858 mblk_t *mp; 2859 uint_t pkt_len; 2860 iaflags_t ixaflags = IXAF_NO_TRACE; 2861 nce_t *nce; 2862 ill_t *ill = ncec->ncec_ill; 2863 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2864 ip_stack_t *ipst = ill->ill_ipst; 2865 2866 if (IS_IPMP(ncec->ncec_ill)) { 2867 nce_resolv_ipmp_ok(ncec); 2868 return; 2869 } 2870 /* non IPMP case */ 2871 2872 mutex_enter(&ncec->ncec_lock); 2873 ASSERT(ncec->ncec_nprobes == 0); 2874 mp = ncec->ncec_qd_mp; 2875 ncec->ncec_qd_mp = NULL; 2876 mutex_exit(&ncec->ncec_lock); 2877 2878 while (mp != NULL) { 2879 mblk_t *nxt_mp; 2880 2881 if (ill->ill_isv6) { 2882 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2883 2884 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 2885 } else { 2886 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2887 2888 ixaflags |= IXAF_IS_IPV4; 2889 pkt_len = ntohs(ipha->ipha_length); 2890 } 2891 nxt_mp = mp->b_next; 2892 mp->b_next = NULL; 2893 /* 2894 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no 2895 * longer available, but it's ok to 
drop this flag because TCP 2896 * has its own flow-control in effect, so TCP packets 2897 * are not likely to get here when flow-control is in effect. 2898 */ 2899 mutex_enter(&ill->ill_lock); 2900 nce = nce_lookup(ill, &ncec->ncec_addr); 2901 mutex_exit(&ill->ill_lock); 2902 2903 if (nce == NULL) { 2904 if (isv6) { 2905 BUMP_MIB(&ipst->ips_ip6_mib, 2906 ipIfStatsOutDiscards); 2907 } else { 2908 BUMP_MIB(&ipst->ips_ip_mib, 2909 ipIfStatsOutDiscards); 2910 } 2911 ip_drop_output("ipIfStatsOutDiscards - no nce", 2912 mp, NULL); 2913 freemsg(mp); 2914 } else { 2915 /* 2916 * We don't know the zoneid, but 2917 * ip_xmit does not care since IXAF_NO_TRACE 2918 * is set. (We traced the packet the first 2919 * time through ip_xmit.) 2920 */ 2921 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, 2922 ALL_ZONES, 0, NULL); 2923 nce_refrele(nce); 2924 } 2925 mp = nxt_mp; 2926 } 2927 2928 ncec_cb_dispatch(ncec); /* complete callbacks */ 2929 } 2930 2931 /* 2932 * Called by SIOCSNDP* ioctl to add/change an ncec entry 2933 * and the corresponding attributes. 2934 * Disallow states other than ND_REACHABLE or ND_STALE. 
2935 */ 2936 int 2937 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) 2938 { 2939 sin6_t *sin6; 2940 in6_addr_t *addr; 2941 ncec_t *ncec; 2942 nce_t *nce; 2943 int err = 0; 2944 uint16_t new_flags = 0; 2945 uint16_t old_flags = 0; 2946 int inflags = lnr->lnr_flags; 2947 ip_stack_t *ipst = ill->ill_ipst; 2948 boolean_t do_postprocess = B_FALSE; 2949 2950 ASSERT(ill->ill_isv6); 2951 if ((lnr->lnr_state_create != ND_REACHABLE) && 2952 (lnr->lnr_state_create != ND_STALE)) 2953 return (EINVAL); 2954 2955 sin6 = (sin6_t *)&lnr->lnr_addr; 2956 addr = &sin6->sin6_addr; 2957 2958 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 2959 ASSERT(!IS_UNDER_IPMP(ill)); 2960 nce = nce_lookup_addr(ill, addr); 2961 if (nce != NULL) 2962 new_flags = nce->nce_common->ncec_flags; 2963 2964 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { 2965 case NDF_ISROUTER_ON: 2966 new_flags |= NCE_F_ISROUTER; 2967 break; 2968 case NDF_ISROUTER_OFF: 2969 new_flags &= ~NCE_F_ISROUTER; 2970 break; 2971 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): 2972 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2973 if (nce != NULL) 2974 nce_refrele(nce); 2975 return (EINVAL); 2976 } 2977 2978 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { 2979 case NDF_ANYCAST_ON: 2980 new_flags |= NCE_F_ANYCAST; 2981 break; 2982 case NDF_ANYCAST_OFF: 2983 new_flags &= ~NCE_F_ANYCAST; 2984 break; 2985 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): 2986 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2987 if (nce != NULL) 2988 nce_refrele(nce); 2989 return (EINVAL); 2990 } 2991 2992 if (nce == NULL) { 2993 err = nce_add_v6(ill, 2994 (uchar_t *)lnr->lnr_hdw_addr, 2995 ill->ill_phys_addr_length, 2996 addr, 2997 new_flags, 2998 lnr->lnr_state_create, 2999 &nce); 3000 if (err != 0) { 3001 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3002 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); 3003 return (err); 3004 } else { 3005 do_postprocess = B_TRUE; 3006 } 3007 } 3008 ncec = nce->nce_common; 3009 old_flags = ncec->ncec_flags; 3010 if (old_flags & 
NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { 3011 ncec_router_to_host(ncec); 3012 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3013 if (do_postprocess) 3014 err = nce_add_v6_postprocess(nce); 3015 nce_refrele(nce); 3016 return (0); 3017 } 3018 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3019 3020 if (do_postprocess) 3021 err = nce_add_v6_postprocess(nce); 3022 /* 3023 * err cannot be anything other than 0 because we don't support 3024 * proxy arp of static addresses. 3025 */ 3026 ASSERT(err == 0); 3027 3028 mutex_enter(&ncec->ncec_lock); 3029 ncec->ncec_flags = new_flags; 3030 mutex_exit(&ncec->ncec_lock); 3031 /* 3032 * Note that we ignore the state at this point, which 3033 * should be either STALE or REACHABLE. Instead we let 3034 * the link layer address passed in to determine the state 3035 * much like incoming packets. 3036 */ 3037 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); 3038 nce_refrele(nce); 3039 return (0); 3040 } 3041 3042 /* 3043 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up 3044 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must 3045 * be held to ensure that they are in the same group. 3046 */ 3047 static nce_t * 3048 nce_fastpath_create(ill_t *ill, ncec_t *ncec) 3049 { 3050 3051 nce_t *nce; 3052 3053 nce = nce_ill_lookup_then_add(ill, ncec); 3054 3055 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3056 return (nce); 3057 3058 /* 3059 * hold the ncec_lock to synchronize with nce_update() so that, 3060 * at the end of this function, the contents of nce_dlur_mp are 3061 * consistent with ncec->ncec_lladdr, even though some intermediate 3062 * packet may have been sent out with a mangled address, which would 3063 * only be a transient condition. 
*/
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_lladdr != NULL) {
		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
	} else {
		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
		    ill->ill_sap_length);
	}
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}

/*
 * we make nce_fp_mp to have an M_DATA prepend.
 * The caller ensures there is hold on ncec for this function.
 * Note that since ill_fastpath_probe() copies the mblk there is
 * no need to hold the nce or ncec beyond this function.
 *
 * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
 * and will be returned back by this function, so that no extra nce_refrele
 * is required for the caller. The calls from nce_add_common() use this
 * method. All other callers (that pass in NULL ncec_nce) will have to do a
 * nce_refrele of the returned nce (when it is non-null).
 *
 * `trigger_fp_req' asks for a fastpath probe to be sent; the return value is
 * the nce that was used (possibly NULL).
 */
nce_t *
nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
	nce_t *nce;
	ill_t *ill = ncec->ncec_ill;

	ASSERT(ill != NULL);

	/*
	 * On an IPMP ill the probe is delegated to ipmp_ncec_fastpath(), so
	 * the local trigger further down is suppressed.
	 */
	if (IS_IPMP(ill) && trigger_fp_req) {
		trigger_fp_req = B_FALSE;
		ipmp_ncec_fastpath(ncec, ill);

	}
	/*
	 * If the caller already has the nce corresponding to the ill, use
	 * that one. Otherwise we have to lookup/add the nce. Calls from
	 * nce_add_common() fall in the former category, and have just done
	 * the nce lookup/add that can be reused.
	 */
	if (ncec_nce == NULL)
		nce = nce_fastpath_create(ill, ncec);
	else
		nce = ncec_nce;

	/* No nce, or an nce on a loopback/VNI ill: nothing to probe. */
	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
		return (nce);

	if (trigger_fp_req)
		nce_fastpath_trigger(nce);
	return (nce);
}

/*
 * Trigger fastpath on nce. No locks may be held.
 *
 * Sends nce_dlur_mp to ill_fastpath_probe() and, on a hard failure, removes
 * the nce's ncec from the ill's fastpath list.
 */
static void
nce_fastpath_trigger(nce_t *nce)
{
	int res;
	ill_t *ill = nce->nce_ill;
	ncec_t *ncec = nce->nce_common;

	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
	/*
	 * EAGAIN is an indication of a transient error
	 * i.e. allocation failure etc. leave the ncec in the list it
	 * will be updated when another probe happens for another ire
	 * if not it will be taken out of the list when the ire is
	 * deleted.  ENOTSUP likewise leaves the entry in place.
	 */
	if (res != 0 && res != EAGAIN && res != ENOTSUP)
		nce_fastpath_list_delete(ill, ncec, NULL);
}

/*
 * Add ncec to the nce fastpath list on ill.
 *
 * The caller must hold ill_lock (asserted).  Returns NULL when the ill is
 * ILL_CONDEMNED or the ncec is condemned.  Otherwise returns the nce found
 * by nce_lookup(), or failing that the result of nce_add().
 */
static nce_t *
nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
{
	nce_t *nce = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	/*
	 * Atomically ensure that the ill is not CONDEMNED and is not going
	 * down, before adding the NCE.
	 */
	if (ill->ill_state_flags & ILL_CONDEMNED)
		return (NULL);
	mutex_enter(&ncec->ncec_lock);
	/*
	 * if ncec has not been deleted and
	 * is not already in the list add it.
	 */
	if (!NCE_ISCONDEMNED(ncec)) {
		nce = nce_lookup(ill, &ncec->ncec_addr);
		if (nce != NULL)
			goto done;
		nce = nce_add(ill, ncec);
	}
done:
	mutex_exit(&ncec->ncec_lock);
	return (nce);
}

/*
 * Wrapper around nce_ill_lookup_then_add_locked() that takes and drops
 * ill_lock around the call.
 */
nce_t *
nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
{
	nce_t *nce;

	mutex_enter(&ill->ill_lock);
	nce = nce_ill_lookup_then_add_locked(ill, ncec);
	mutex_exit(&ill->ill_lock);
	return (nce);
}


/*
 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
 * nce is added to the 'dead' list, and the caller must nce_refrele() the
 * entry after all locks have been dropped.
 */
void
nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
{
	nce_t *nce;

	ASSERT(ill != NULL);

	/* first clean out any nce pointers in the under_ills */
	if (IS_IPMP(ill))
		ipmp_ncec_flush_nce(ncec);

	/* now the ill itself */
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (nce->nce_common == ncec) {
			/* hold it so it can be used after nce_delete() */
			nce_refhold(nce);
			nce_delete(nce);
			break;
		}
	}
	mutex_exit(&ill->ill_lock);
	/* nce is non-NULL only if the loop found (and held) a match */
	if (nce != NULL) {
		if (dead == NULL)
			nce_refrele(nce);
		else
			list_insert_tail(dead, nce);
	}
}

/*
 * when the fastpath response does not fit in the datab
 * associated with the existing nce_fp_mp, we delete and
 * add the nce to retrigger fastpath based on the information
 * in the ncec_t.
3227 */ 3228 static nce_t * 3229 nce_delete_then_add(nce_t *nce) 3230 { 3231 ill_t *ill = nce->nce_ill; 3232 nce_t *newnce = NULL; 3233 3234 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3235 (void *)nce, ill->ill_name)); 3236 mutex_enter(&ill->ill_lock); 3237 mutex_enter(&nce->nce_common->ncec_lock); 3238 nce_delete(nce); 3239 /* 3240 * Make sure that ncec is not condemned before adding. We hold the 3241 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3242 * ipmp_ncec_flush_nce() 3243 */ 3244 if (!NCE_ISCONDEMNED(nce->nce_common)) 3245 newnce = nce_add(ill, nce->nce_common); 3246 mutex_exit(&nce->nce_common->ncec_lock); 3247 mutex_exit(&ill->ill_lock); 3248 nce_refrele(nce); 3249 return (newnce); /* could be null if nomem */ 3250 } 3251 3252 typedef struct nce_fp_match_s { 3253 nce_t *nce_fp_match_res; 3254 mblk_t *nce_fp_match_ack_mp; 3255 } nce_fp_match_t; 3256 3257 /* ARGSUSED */ 3258 static int 3259 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3260 { 3261 nce_fp_match_t *nce_fp_marg = arg; 3262 ncec_t *ncec = nce->nce_common; 3263 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3264 uchar_t *mp_rptr, *ud_mp_rptr; 3265 mblk_t *ud_mp = nce->nce_dlur_mp; 3266 ptrdiff_t cmplen; 3267 3268 /* 3269 * mp is the mp associated with the fastpath ack. 3270 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3271 * under consideration. If the contents match, then the 3272 * fastpath ack is used to update the nce. 3273 */ 3274 if (ud_mp == NULL) 3275 return (0); 3276 mp_rptr = mp->b_rptr; 3277 cmplen = mp->b_wptr - mp_rptr; 3278 ASSERT(cmplen >= 0); 3279 3280 ud_mp_rptr = ud_mp->b_rptr; 3281 /* 3282 * The ncec is locked here to prevent any other threads from accessing 3283 * and changing nce_dlur_mp when the address becomes resolved to an 3284 * lla while we're in the middle of looking at and comparing the 3285 * hardware address (lla). 
It is also locked to prevent multiple 3286 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3287 * time. 3288 */ 3289 mutex_enter(&ncec->ncec_lock); 3290 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3291 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3292 nce_fp_marg->nce_fp_match_res = nce; 3293 mutex_exit(&ncec->ncec_lock); 3294 nce_refhold(nce); 3295 return (1); 3296 } 3297 mutex_exit(&ncec->ncec_lock); 3298 return (0); 3299 } 3300 3301 /* 3302 * Update all NCE's that are not in fastpath mode and 3303 * have an nce_fp_mp that matches mp. mp->b_cont contains 3304 * the fastpath header. 3305 * 3306 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3307 */ 3308 void 3309 nce_fastpath_update(ill_t *ill, mblk_t *mp) 3310 { 3311 nce_fp_match_t nce_fp_marg; 3312 nce_t *nce; 3313 mblk_t *nce_fp_mp, *fp_mp; 3314 3315 nce_fp_marg.nce_fp_match_res = NULL; 3316 nce_fp_marg.nce_fp_match_ack_mp = mp; 3317 3318 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); 3319 3320 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) 3321 return; 3322 3323 mutex_enter(&nce->nce_lock); 3324 nce_fp_mp = nce->nce_fp_mp; 3325 3326 if (nce_fp_mp != NULL) { 3327 fp_mp = mp->b_cont; 3328 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > 3329 nce_fp_mp->b_datap->db_lim) { 3330 mutex_exit(&nce->nce_lock); 3331 nce = nce_delete_then_add(nce); 3332 if (nce == NULL) { 3333 return; 3334 } 3335 mutex_enter(&nce->nce_lock); 3336 nce_fp_mp = nce->nce_fp_mp; 3337 } 3338 } 3339 3340 /* Matched - install mp as the fastpath mp */ 3341 if (nce_fp_mp == NULL) { 3342 fp_mp = dupb(mp->b_cont); 3343 nce->nce_fp_mp = fp_mp; 3344 } else { 3345 fp_mp = mp->b_cont; 3346 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); 3347 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr 3348 + MBLKL(fp_mp); 3349 } 3350 mutex_exit(&nce->nce_lock); 3351 nce_refrele(nce); 3352 } 3353 3354 /* 3355 * Return a pointer to a given option in the packet. 
3356 * Assumes that option part of the packet have already been validated. 3357 */ 3358 nd_opt_hdr_t * 3359 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) 3360 { 3361 while (optlen > 0) { 3362 if (opt->nd_opt_type == opt_type) 3363 return (opt); 3364 optlen -= 8 * opt->nd_opt_len; 3365 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3366 } 3367 return (NULL); 3368 } 3369 3370 /* 3371 * Verify all option lengths present are > 0, also check to see 3372 * if the option lengths and packet length are consistent. 3373 */ 3374 boolean_t 3375 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) 3376 { 3377 ASSERT(opt != NULL); 3378 while (optlen > 0) { 3379 if (opt->nd_opt_len == 0) 3380 return (B_FALSE); 3381 optlen -= 8 * opt->nd_opt_len; 3382 if (optlen < 0) 3383 return (B_FALSE); 3384 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3385 } 3386 return (B_TRUE); 3387 } 3388 3389 /* 3390 * ncec_walk function. 3391 * Free a fraction of the NCE cache entries. 3392 * 3393 * A possible optimization here would be to use ncec_last where possible, and 3394 * delete the least-frequently used entry, which would require more complex 3395 * computation as we walk through the ncec's (e.g., track ncec entries by 3396 * order of ncec_last and/or maintain state) 3397 */ 3398 static void 3399 ncec_cache_reclaim(ncec_t *ncec, char *arg) 3400 { 3401 ip_stack_t *ipst = ncec->ncec_ipst; 3402 uint_t fraction = *(uint_t *)arg; 3403 uint_t rand; 3404 3405 if ((ncec->ncec_flags & 3406 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { 3407 return; 3408 } 3409 3410 rand = (uint_t)ddi_get_lbolt() + 3411 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); 3412 if ((rand/fraction)*fraction == rand) { 3413 IP_STAT(ipst, ip_nce_reclaim_deleted); 3414 ncec_delete(ncec); 3415 } 3416 } 3417 3418 /* 3419 * kmem_cache callback to free up memory. 3420 * 3421 * For now we just delete a fixed fraction. 
3422 */ 3423 static void 3424 ip_nce_reclaim_stack(ip_stack_t *ipst) 3425 { 3426 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction; 3427 3428 IP_STAT(ipst, ip_nce_reclaim_calls); 3429 3430 ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst); 3431 3432 /* 3433 * Walk all CONNs that can have a reference on an ire, ncec or dce. 3434 * Get them to update any stale references to drop any refholds they 3435 * have. 3436 */ 3437 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 3438 } 3439 3440 /* 3441 * Called by the memory allocator subsystem directly, when the system 3442 * is running low on memory. 3443 */ 3444 /* ARGSUSED */ 3445 void 3446 ip_nce_reclaim(void *args) 3447 { 3448 netstack_handle_t nh; 3449 netstack_t *ns; 3450 ip_stack_t *ipst; 3451 3452 netstack_next_init(&nh); 3453 while ((ns = netstack_next(&nh)) != NULL) { 3454 /* 3455 * netstack_next() can return a netstack_t with a NULL 3456 * netstack_ip at boot time. 3457 */ 3458 if ((ipst = ns->netstack_ip) == NULL) { 3459 netstack_rele(ns); 3460 continue; 3461 } 3462 ip_nce_reclaim_stack(ipst); 3463 netstack_rele(ns); 3464 } 3465 netstack_next_fini(&nh); 3466 } 3467 3468 #ifdef DEBUG 3469 void 3470 ncec_trace_ref(ncec_t *ncec) 3471 { 3472 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3473 3474 if (ncec->ncec_trace_disable) 3475 return; 3476 3477 if (!th_trace_ref(ncec, ncec->ncec_ipst)) { 3478 ncec->ncec_trace_disable = B_TRUE; 3479 ncec_trace_cleanup(ncec); 3480 } 3481 } 3482 3483 void 3484 ncec_untrace_ref(ncec_t *ncec) 3485 { 3486 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3487 3488 if (!ncec->ncec_trace_disable) 3489 th_trace_unref(ncec); 3490 } 3491 3492 static void 3493 ncec_trace_cleanup(const ncec_t *ncec) 3494 { 3495 th_trace_cleanup(ncec, ncec->ncec_trace_disable); 3496 } 3497 #endif 3498 3499 /* 3500 * Called when address resolution fails due to a timeout. 3501 * Send an ICMP unreachable in response to all queued packets. 
3502 */ 3503 void 3504 arp_resolv_failed(ncec_t *ncec) 3505 { 3506 mblk_t *mp, *nxt_mp; 3507 char buf[INET6_ADDRSTRLEN]; 3508 struct in_addr ipv4addr; 3509 ill_t *ill = ncec->ncec_ill; 3510 ip_stack_t *ipst = ncec->ncec_ipst; 3511 ip_recv_attr_t iras; 3512 3513 bzero(&iras, sizeof (iras)); 3514 iras.ira_flags = IRAF_IS_IPV4; 3515 /* 3516 * we are setting the ira_rill to the ipmp_ill (instead of 3517 * the actual ill on which the packet was received), but this 3518 * is ok because we don't actually need the real ira_rill. 3519 * to send the icmp unreachable to the sender. 3520 */ 3521 iras.ira_ill = iras.ira_rill = ill; 3522 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3523 iras.ira_rifindex = iras.ira_ruifindex; 3524 3525 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr); 3526 ip3dbg(("arp_resolv_failed: dst %s\n", 3527 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); 3528 mutex_enter(&ncec->ncec_lock); 3529 mp = ncec->ncec_qd_mp; 3530 ncec->ncec_qd_mp = NULL; 3531 ncec->ncec_nprobes = 0; 3532 mutex_exit(&ncec->ncec_lock); 3533 while (mp != NULL) { 3534 nxt_mp = mp->b_next; 3535 mp->b_next = NULL; 3536 3537 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3538 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 3539 mp, ill); 3540 if (ipst->ips_ip_arp_icmp_error) { 3541 ip3dbg(("arp_resolv_failed: " 3542 "Calling icmp_unreachable\n")); 3543 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 3544 } else { 3545 freemsg(mp); 3546 } 3547 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3548 mp = nxt_mp; 3549 } 3550 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 3551 } 3552 3553 /* 3554 * if ill is an under_ill, translate it to the ipmp_ill and add the 3555 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and 3556 * one on the underlying in_ill) will be created for the 3557 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill. 
3558 */ 3559 int 3560 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3561 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3562 { 3563 int err; 3564 in6_addr_t addr6; 3565 ip_stack_t *ipst = ill->ill_ipst; 3566 nce_t *nce, *upper_nce = NULL; 3567 ill_t *in_ill = ill, *under = NULL; 3568 boolean_t need_ill_refrele = B_FALSE; 3569 3570 if (flags & NCE_F_MCAST) { 3571 /* 3572 * hw_addr will be figured out in nce_set_multicast_v4; 3573 * caller needs to pass in the cast_ill for ipmp 3574 */ 3575 ASSERT(hw_addr == NULL); 3576 ASSERT(!IS_IPMP(ill)); 3577 err = nce_set_multicast_v4(ill, addr, flags, newnce); 3578 return (err); 3579 } 3580 3581 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 3582 ill = ipmp_ill_hold_ipmp_ill(ill); 3583 if (ill == NULL) 3584 return (ENXIO); 3585 need_ill_refrele = B_TRUE; 3586 } 3587 if ((flags & NCE_F_BCAST) != 0) { 3588 /* 3589 * IPv4 broadcast ncec: compute the hwaddr. 3590 */ 3591 if (IS_IPMP(ill)) { 3592 under = ipmp_ill_get_xmit_ill(ill, B_FALSE); 3593 if (under == NULL) { 3594 if (need_ill_refrele) 3595 ill_refrele(ill); 3596 return (ENETDOWN); 3597 } 3598 hw_addr = under->ill_bcast_mp->b_rptr + 3599 NCE_LL_ADDR_OFFSET(under); 3600 hw_addr_len = under->ill_phys_addr_length; 3601 } else { 3602 hw_addr = ill->ill_bcast_mp->b_rptr + 3603 NCE_LL_ADDR_OFFSET(ill), 3604 hw_addr_len = ill->ill_phys_addr_length; 3605 } 3606 } 3607 3608 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3609 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3610 nce = nce_lookup_addr(ill, &addr6); 3611 if (nce == NULL) { 3612 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, 3613 state, &nce); 3614 } else { 3615 err = EEXIST; 3616 } 3617 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3618 if (err == 0) 3619 err = nce_add_v4_postprocess(nce); 3620 3621 if (in_ill != ill && nce != NULL) { 3622 nce_t *under_nce = NULL; 3623 3624 /* 3625 * in_ill was the under_ill. Try to create the under_nce. 
3626 * Hold the ill_g_lock to prevent changes to group membership 3627 * until we are done. 3628 */ 3629 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3630 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 3631 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 3632 ill_t *, ill); 3633 rw_exit(&ipst->ips_ill_g_lock); 3634 err = ENXIO; 3635 nce_refrele(nce); 3636 nce = NULL; 3637 goto bail; 3638 } 3639 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 3640 if (under_nce == NULL) { 3641 rw_exit(&ipst->ips_ill_g_lock); 3642 err = EINVAL; 3643 nce_refrele(nce); 3644 nce = NULL; 3645 goto bail; 3646 } 3647 rw_exit(&ipst->ips_ill_g_lock); 3648 upper_nce = nce; 3649 nce = under_nce; /* will be returned to caller */ 3650 if (NCE_ISREACHABLE(nce->nce_common)) 3651 nce_fastpath_trigger(under_nce); 3652 } 3653 if (nce != NULL) { 3654 if (newnce != NULL) 3655 *newnce = nce; 3656 else 3657 nce_refrele(nce); 3658 } 3659 bail: 3660 if (under != NULL) 3661 ill_refrele(under); 3662 if (upper_nce != NULL) 3663 nce_refrele(upper_nce); 3664 if (need_ill_refrele) 3665 ill_refrele(ill); 3666 3667 return (err); 3668 } 3669 3670 /* 3671 * NDP Cache Entry creation routine for IPv4. 3672 * This routine must always be called with ndp4->ndp_g_lock held. 3673 * Prior to return, ncec_refcnt is incremented. 3674 * 3675 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 3676 * are always added pointing at the ipmp_ill. Thus, when the ill passed 3677 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 3678 * entries will be created, both pointing at the same ncec_t. The nce_t 3679 * entries will have their nce_ill set to the ipmp_ill and the under_ill 3680 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 3681 * Local addresses are always created on the ill passed to nce_add_v4. 
*/
int
nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err;
	boolean_t	is_multicast = (flags & NCE_F_MCAST);
	struct in6_addr	addr6;
	nce_t		*nce;

	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
	ASSERT(!ill->ill_isv6);
	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);

	/* the common code works on the v4-mapped form of the address */
	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
	    &nce);
	ASSERT(newnce != NULL);
	*newnce = nce;
	return (err);
}

/*
 * Post-processing routine to be executed after nce_add_v4(). This function
 * triggers fastpath (if appropriate) and DAD on the newly added nce entry
 * and must be called without any locks held.
 *
 * Always returns 0, but we return an int to keep this symmetric with the
 * IPv6 counter-part.
 */
int
nce_add_v4_postprocess(nce_t *nce)
{
	ncec_t		*ncec = nce->nce_common;
	uint16_t	flags = ncec->ncec_flags;
	boolean_t	ndp_need_dad = B_FALSE;
	boolean_t	dropped;
	clock_t		delay;
	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
	uchar_t		*hw_addr = ncec->ncec_lladdr;
	boolean_t	trigger_fastpath = B_TRUE;

	/*
	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
	 * We call nce_fastpath from nce_update if the link layer address of
	 * the peer changes from nce_update
	 */
	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
		trigger_fastpath = B_FALSE;

	if (trigger_fastpath)
		nce_fastpath_trigger(nce);

	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
		/*
		 * Either the caller (by passing in ND_PROBE)
		 * or nce_add_common() (by the internally computed state
		 * based on ncec_addr and ill_net_type) has determined
		 * that this unicast entry needs DAD. Trigger DAD.
		 */
		ndp_need_dad = B_TRUE;
	} else if (flags & NCE_F_UNSOL_ADV) {
		/*
		 * We account for the transmit below by assigning one
		 * less than the ndd variable. Subsequent decrements
		 * are done in nce_timer.
		 */
		mutex_enter(&ncec->ncec_lock);
		ncec->ncec_unsolicit_count =
		    ipst->ips_ip_arp_publish_count - 1;
		mutex_exit(&ncec->ncec_lock);
		/* the announcement is sent with ncec_lock dropped */
		dropped = arp_announce(ncec);
		mutex_enter(&ncec->ncec_lock);
		/* a dropped announcement does not count against the total */
		if (dropped)
			ncec->ncec_unsolicit_count++;
		else
			ncec->ncec_last_time_defended = ddi_get_lbolt();
		if (ncec->ncec_unsolicit_count != 0) {
			nce_start_timer(ncec,
			    ipst->ips_ip_arp_publish_interval);
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * If ncec_xmit_interval is 0, user has configured us to send the first
	 * probe right away.  Do so, and set up for the subsequent probes.
	 */
	if (ndp_need_dad) {
		mutex_enter(&ncec->ncec_lock);
		if (ncec->ncec_pcnt == 0) {
			/*
			 * DAD probes and announce can be
			 * administratively disabled by setting the
			 * probe_count to zero. Restart the timer in
			 * this case to mark the ipif as ready.
			 */
			ncec->ncec_unsolicit_count = 0;
			mutex_exit(&ncec->ncec_lock);
			nce_restart_timer(ncec, 0);
		} else {
			mutex_exit(&ncec->ncec_lock);
			/*
			 * NOTE(review): NCE_F_FAST selects arp_probe_delay
			 * and !NCE_F_FAST selects arp_fastprobe_delay, which
			 * reads as inverted -- confirm against the tunables'
			 * intended meaning before relying on it.
			 */
			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
			    ipst->ips_arp_probe_delay :
			    ipst->ips_arp_fastprobe_delay);
			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
		}
	}
	return (0);
}

/*
 * ncec_walk routine to update all entries that have a given destination or
 * gateway address and cached link layer (MAC) address.  This is used when ARP
 * informs us that a network-to-link-layer mapping may have changed.
 *
 * Only ND_REACHABLE entries whose IPv4 address equals hwm_addr are touched;
 * they are moved to ND_STALE with the new hardware address.
 */
void
nce_update_hw_changed(ncec_t *ncec, void *arg)
{
	nce_hw_map_t *hwm = arg;
	ipaddr_t ncec_addr;

	if (ncec->ncec_state != ND_REACHABLE)
		return;

	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
	if (ncec_addr != hwm->hwm_addr)
		return;

	mutex_enter(&ncec->ncec_lock);
	/* a zero hwm_flags leaves the existing flags alone */
	if (hwm->hwm_flags != 0)
		ncec->ncec_flags = hwm->hwm_flags;
	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
	mutex_exit(&ncec->ncec_lock);
}

/*
 * Take a reference on ncec (and, on DEBUG kernels, record a trace of it).
 */
void
ncec_refhold(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
	(ncec)->ncec_refcnt++;
	/* catches wrap-around of the counter */
	ASSERT((ncec)->ncec_refcnt != 0);
#ifdef DEBUG
	ncec_trace_ref(ncec);
#endif
	mutex_exit(&(ncec)->ncec_lock);
}

/*
 * Take a reference on ncec without recording a reference trace.
 */
void
ncec_refhold_notr(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
	(ncec)->ncec_refcnt++;
	ASSERT((ncec)->ncec_refcnt != 0);
	mutex_exit(&(ncec)->ncec_lock);
}

/*
 * Take a reference on ncec; the caller already holds ncec_lock.
 */
static void
ncec_refhold_locked(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
	(ncec)->ncec_refcnt++;
#ifdef DEBUG
	ncec_trace_ref(ncec);
#endif
}

/*
 * Drop a reference on ncec; the last release calls ncec_inactive().
 */
/* ncec_inactive destroys the mutex thus no mutex_exit is needed */
void
ncec_refrele(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
#ifdef DEBUG
	ncec_untrace_ref(ncec);
#endif
	ASSERT((ncec)->ncec_refcnt != 0);
	if (--(ncec)->ncec_refcnt == 0) {
		ncec_inactive(ncec);
	} else {
		mutex_exit(&(ncec)->ncec_lock);
	}
}

/*
 * Drop a reference on ncec without touching the reference trace; the
 * counterpart of ncec_refhold_notr().
 */
void
ncec_refrele_notr(ncec_t *ncec)
{
	mutex_enter(&(ncec)->ncec_lock);
	ASSERT((ncec)->ncec_refcnt != 0);
	if (--(ncec)->ncec_refcnt == 0) {
		ncec_inactive(ncec);
	} else {
		mutex_exit(&(ncec)->ncec_lock);
	}
}

/*
 * Common to IPv4 and IPv6.
 *
 * Cancel any pending timer on ncec and start a new one for `ms'
 * milliseconds.  Must be called without ncec_lock held.
 */
void
nce_restart_timer(ncec_t *ncec, uint_t ms)
{
	timeout_id_t tid;

	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));

	/* First cancel any running timer */
	mutex_enter(&ncec->ncec_lock);
	tid = ncec->ncec_timeout_id;
	ncec->ncec_timeout_id = 0;
	if (tid != 0) {
		/* untimeout() is called with ncec_lock dropped */
		mutex_exit(&ncec->ncec_lock);
		(void) untimeout(tid);
		mutex_enter(&ncec->ncec_lock);
	}

	/* Restart timer */
	nce_start_timer(ncec, ms);
	mutex_exit(&ncec->ncec_lock);
}

/*
 * Arm nce_timer for ncec after `ms' milliseconds (at least one tick).
 * The caller must hold ncec_lock.
 */
static void
nce_start_timer(ncec_t *ncec, uint_t ms)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	/*
	 * Don't start the timer if the ncec has been deleted, or if the timer
	 * is already running
	 */
	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
	}
}

/*
 * Look up, or create, the IPv4 multicast nce for `dst' on `ill'.
 *
 * Returns 0 on success, ENOMEM if the hardware-address buffer cannot be
 * allocated, or the error from nce_add_v4()/nce_add_v4_postprocess().
 * On success the nce is stored in *newnce, or released if newnce is NULL.
 */
int
nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t flags, nce_t **newnce)
{
	uchar_t		*hw_addr;
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	in6_addr_t	dst6;
	nce_t		*nce;

	ASSERT(!ill->ill_isv6);

	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
	/* an existing entry is returned as-is */
	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
		goto done;
	}
	if (ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * For IRE_IF_RESOLVER a hardware mapping can be
		 * generated, for IRE_IF_NORESOLVER, resolution cookie
		 * in the ill is copied in nce_add_v4().
		 */
		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
		if (hw_addr == NULL) {
			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
			return (ENOMEM);
		}
		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
	} else {
		/*
		 * IRE_IF_NORESOLVER type simply copies the resolution
		 * cookie passed in.  So no hw_addr is needed.
		 */
		hw_addr = NULL;
	}
	ASSERT(flags & NCE_F_MCAST);
	ASSERT(flags & NCE_F_NONUD);
	/* nce_state will be computed by nce_add_common() */
	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
	    ND_UNCHANGED, &nce);
	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
	if (err == 0)
		err = nce_add_v4_postprocess(nce);
	/* the temporary mapping buffer is no longer needed */
	if (hw_addr != NULL)
		kmem_free(hw_addr, ill->ill_phys_addr_length);
	if (err != 0) {
		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
		return (err);
	}
done:
	if (newnce != NULL)
		*newnce = nce;
	else
		nce_refrele(nce);
	return (0);
}

/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
3981 */ 3982 #define NCE_RESCHED_LIST_LEN 8 3983 3984 typedef struct { 3985 ill_t *ncert_ill; 3986 uint_t ncert_num; 3987 ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN]; 3988 } nce_resched_t; 3989 3990 /* 3991 * Pick the longest waiting NCEs for defense. 3992 */ 3993 /* ARGSUSED */ 3994 static int 3995 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg) 3996 { 3997 nce_resched_t *ncert = arg; 3998 ncec_t **ncecs; 3999 ncec_t **ncec_max; 4000 ncec_t *ncec_temp; 4001 ncec_t *ncec = nce->nce_common; 4002 4003 ASSERT(ncec->ncec_ill == ncert->ncert_ill); 4004 /* 4005 * Only reachable entries that are ready for announcement are eligible. 4006 */ 4007 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE) 4008 return (0); 4009 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) { 4010 ncec_refhold(ncec); 4011 ncert->ncert_nces[ncert->ncert_num++] = ncec; 4012 } else { 4013 ncecs = ncert->ncert_nces; 4014 ncec_max = ncecs + NCE_RESCHED_LIST_LEN; 4015 ncec_refhold(ncec); 4016 for (; ncecs < ncec_max; ncecs++) { 4017 ASSERT(ncec != NULL); 4018 if ((*ncecs)->ncec_last_time_defended > 4019 ncec->ncec_last_time_defended) { 4020 ncec_temp = *ncecs; 4021 *ncecs = ncec; 4022 ncec = ncec_temp; 4023 } 4024 } 4025 ncec_refrele(ncec); 4026 } 4027 return (0); 4028 } 4029 4030 /* 4031 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this 4032 * doesn't happen very often (if at all), and thus it needn't be highly 4033 * optimized. (Note, though, that it's actually O(N) complexity, because the 4034 * outer loop is bounded by a constant rather than by the length of the list.) 
4035 */ 4036 static void 4037 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert) 4038 { 4039 ncec_t *ncec; 4040 ip_stack_t *ipst = ill->ill_ipst; 4041 uint_t i, defend_rate; 4042 4043 i = ill->ill_defend_count; 4044 ill->ill_defend_count = 0; 4045 if (ill->ill_isv6) 4046 defend_rate = ipst->ips_ndp_defend_rate; 4047 else 4048 defend_rate = ipst->ips_arp_defend_rate; 4049 /* If none could be sitting around, then don't reschedule */ 4050 if (i < defend_rate) { 4051 DTRACE_PROBE1(reschedule_none, ill_t *, ill); 4052 return; 4053 } 4054 ncert->ncert_ill = ill; 4055 while (ill->ill_defend_count < defend_rate) { 4056 nce_walk_common(ill, ncec_reschedule, ncert); 4057 for (i = 0; i < ncert->ncert_num; i++) { 4058 4059 ncec = ncert->ncert_nces[i]; 4060 mutex_enter(&ncec->ncec_lock); 4061 ncec->ncec_flags |= NCE_F_DELAYED; 4062 mutex_exit(&ncec->ncec_lock); 4063 /* 4064 * we plan to schedule this ncec, so incr the 4065 * defend_count in anticipation. 4066 */ 4067 if (++ill->ill_defend_count >= defend_rate) 4068 break; 4069 } 4070 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) 4071 break; 4072 } 4073 } 4074 4075 /* 4076 * Check if the current rate-limiting parameters permit the sending 4077 * of another address defense announcement for both IPv4 and IPv6. 4078 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not 4079 * permitted), and B_FALSE otherwise. The `defend_rate' parameter 4080 * determines how many address defense announcements are permitted 4081 * in any `defense_perio' interval. 
*/
static boolean_t
ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
{
	clock_t		now = ddi_get_lbolt();
	ip_stack_t	*ipst = ill->ill_ipst;
	clock_t		start = ill->ill_defend_start;
	uint32_t	elapsed, defend_period, defend_rate;
	nce_resched_t	ncert;
	boolean_t	ret;
	int		i;

	if (ill->ill_isv6) {
		defend_period = ipst->ips_ndp_defend_period;
		defend_rate = ipst->ips_ndp_defend_rate;
	} else {
		defend_period = ipst->ips_arp_defend_period;
		defend_rate = ipst->ips_arp_defend_rate;
	}
	/* a rate of zero means defense is always rate-limited */
	if (defend_rate == 0)
		return (B_TRUE);
	bzero(&ncert, sizeof (ncert));
	mutex_enter(&ill->ill_lock);
	if (start > 0) {
		elapsed = now - start;
		if (elapsed > SEC_TO_TICK(defend_period)) {
			ill->ill_defend_start = now;
			/*
			 * nce_ill_reschedule will attempt to
			 * prevent starvation by rescheduling the
			 * oldest entries, which are marked with
			 * the NCE_F_DELAYED flag.
			 */
			nce_ill_reschedule(ill, &ncert);
		}
	} else {
		/* first use: start the measurement period now */
		ill->ill_defend_start = now;
	}
	ASSERT(ill->ill_defend_count <= defend_rate);
	mutex_enter(&ncec->ncec_lock);
	if (ncec->ncec_flags & NCE_F_DELAYED) {
		/*
		 * This ncec was rescheduled as one of the really old
		 * entries needing on-going defense. The
		 * ill_defend_count was already incremented in
		 * nce_ill_reschedule. Go ahead and send the announce.
		 */
		ncec->ncec_flags &= ~NCE_F_DELAYED;
		mutex_exit(&ncec->ncec_lock);
		ret = B_FALSE;
		goto done;
	}
	mutex_exit(&ncec->ncec_lock);
	if (ill->ill_defend_count < defend_rate)
		ill->ill_defend_count++;
	if (ill->ill_defend_count == defend_rate) {
		/*
		 * we are no longer allowed to send unbidden defense
		 * messages. Wait for rescheduling.
		 */
		ret = B_TRUE;
	} else {
		ret = B_FALSE;
	}
done:
	mutex_exit(&ill->ill_lock);
	/*
	 * After all the locks have been dropped we can restart nce timer,
	 * and refrele the delayed ncecs
	 */
	for (i = 0; i < ncert.ncert_num; i++) {
		clock_t	xmit_interval;
		ncec_t	*tmp;

		tmp = ncert.ncert_nces[i];
		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
		    B_FALSE);
		nce_restart_timer(tmp, xmit_interval);
		ncec_refrele(tmp);
	}
	return (ret);
}

/*
 * Send a Neighbor Advertisement for ncec's address to the all-hosts
 * multicast address; returns whatever ndp_xmit() returns.
 */
boolean_t
ndp_announce(ncec_t *ncec)
{
	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
	    nce_advert_flags(ncec)));
}

/*
 * Choose the source address, and the ill to transmit on, for a
 * resolution/DAD message concerning ncec.
 *
 * `src' must point at an unspecified address on entry (asserted) and is
 * overwritten with the chosen source.  Returns a held ill, or NULL when no
 * usable source ipif exists or the matching ipif is not yet ready.
 */
ill_t *
nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
{
	mblk_t		*mp;
	in6_addr_t	src6;
	ipaddr_t	src4;
	ill_t		*ill = ncec->ncec_ill;
	ill_t		*src_ill = NULL;
	ipif_t		*ipif = NULL;
	boolean_t	is_myaddr = NCE_MYADDR(ncec);
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);

	ASSERT(src != NULL);
	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
	src6 = *src;
	if (is_myaddr) {
		/* for our own address the source is the address itself */
		src6 = ncec->ncec_addr;
		if (!isv6)
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
	} else {
		/*
		 * try to find one from the outgoing packet.
		 */
		mutex_enter(&ncec->ncec_lock);
		mp = ncec->ncec_qd_mp;
		if (mp != NULL) {
			if (isv6) {
				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

				src6 = ip6h->ip6_src;
			} else {
				ipha_t  *ipha = (ipha_t *)mp->b_rptr;

				src4 = ipha->ipha_src;
				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
			}
		}
		mutex_exit(&ncec->ncec_lock);
	}

	/*
	 * For outgoing packets, if the src of outgoing packet is one
	 * of the assigned interface addresses use it, otherwise we
	 * will pick the source address below.
	 * For local addresses (is_myaddr) doing DAD, NDP announce
	 * messages are mcast. So we use the (IPMP) cast_ill or the
	 * (non-IPMP) ncec_ill for these message types. The only case
	 * of unicast DAD messages are for IPv6 ND probes, for which
	 * we find the ipif_bound_ill corresponding to the ncec_addr.
	 */
	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
		if (isv6) {
			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
			    ill->ill_ipst);
		} else {
			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
			    ill->ill_ipst);
		}

		/*
		 * If no relevant ipif can be found, then it's not one of our
		 * addresses.  Reset to :: and try to find a src for the NS or
		 * ARP request using ipif_select_source_v[4,6]  below.
		 * If an ipif can be found, but it's not yet done with
		 * DAD verification, and we are not being invoked for
		 * DAD (i.e., !is_myaddr), then just postpone this
		 * transmission until later.
		 */
		if (ipif == NULL) {
			src6 = ipv6_all_zeros;
			src4 = INADDR_ANY;
		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
			    ncec_t *, ncec, ipif_t *, ipif);
			ipif_refrele(ipif);
			return (NULL);
		}
	}

	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
		/*
		 * Pick a source address for this solicitation, but
		 * restrict the selection to addresses assigned to the
		 * output interface.  We do this because the destination will
		 * create a neighbor cache entry for the source address of
		 * this packet, so the source address had better be a valid
		 * neighbor.
		 */
		if (isv6) {
			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
			    B_FALSE, NULL);
		} else {
			ipaddr_t nce_addr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
			    B_FALSE, NULL);
		}
		/* on IPMP, retry the selection on the transmit ill */
		if (ipif == NULL && IS_IPMP(ill)) {
			ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);

			if (send_ill != NULL) {
				if (isv6) {
					ipif = ipif_select_source_v6(send_ill,
					    &ncec->ncec_addr, B_TRUE,
					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
					    B_FALSE, NULL);
				} else {
					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
					    src4);
					ipif = ipif_select_source_v4(send_ill,
					    src4, ALL_ZONES, B_TRUE, NULL);
				}
				ill_refrele(send_ill);
			}
		}

		if (ipif == NULL) {
			char buf[INET6_ADDRSTRLEN];

			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
			return (NULL);
		}
		src6 = ipif->ipif_v6lcl_addr;
	}
	*src = src6;
	if (ipif != NULL) {
		/* trade the ipif hold for a hold on the ill to send from */
		src_ill = ipif->ipif_ill;
		if (IS_IPMP(src_ill))
			src_ill = ipmp_ipif_hold_bound_ill(ipif);
		else
			ill_refhold(src_ill);
		ipif_refrele(ipif);
		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
		    ill_t *, src_ill);
	}
	return (src_ill);
}

/*
 * Update the IPv4 neighbor entry (or entries) for `addr' with a new
 * hardware address and flags.
 *
 * With a non-NULL ipif only the entry on that ipif's ill is updated; it
 * stays in its current state if reachable and otherwise becomes ND_STALE.
 * With a NULL ipif every IPv4 ncec is walked via nce_update_hw_changed().
 */
void
ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
    uchar_t *hwaddr, int hwaddr_len, int flags)
{
	ill_t	*ill;
	ncec_t	*ncec;
	nce_t	*nce;
	uint16_t new_state;

	ill = (ipif ? ipif->ipif_ill : NULL);
	if (ill != NULL) {
		/*
		 * only one ncec is possible
		 */
		nce = nce_lookup_v4(ill, addr);
		if (nce != NULL) {
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			if (NCE_ISREACHABLE(ncec))
				new_state = ND_UNCHANGED;
			else
				new_state = ND_STALE;
			/* unlike the wildcard path, flags are always replaced */
			ncec->ncec_flags = flags;
			nce_update(ncec, new_state, hwaddr);
			mutex_exit(&ncec->ncec_lock);
			nce_refrele(nce);
			return;
		}
	} else {
		/*
		 * ill is wildcard; clean up all ncec's and ire's
		 * that match on addr.
		 */
		nce_hw_map_t hwm;

		hwm.hwm_addr = *addr;
		hwm.hwm_hwlen = hwaddr_len;
		hwm.hwm_hwaddr = hwaddr;
		hwm.hwm_flags = flags;

		ncec_walk_common(ipst->ips_ndp4, NULL,
		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
	}
}

/*
 * Common function to add ncec entries.
 * we always add the ncec with ncec_ill == ill, and always create
 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
 * ncec is !reachable.
 *
 * When the caller passes in an nce_state of ND_UNCHANGED,
 * nce_add_common() will determine the state of the created nce based
 * on the ill_net_type and nce_flags used. Otherwise, the nce will
 * be created with state set to the passed in nce_state.
4371 */ 4372 static int 4373 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 4374 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) 4375 { 4376 static ncec_t nce_nil; 4377 uchar_t *template = NULL; 4378 int err; 4379 ncec_t *ncec; 4380 ncec_t **ncep; 4381 ip_stack_t *ipst = ill->ill_ipst; 4382 uint16_t state; 4383 boolean_t fastprobe = B_FALSE; 4384 struct ndp_g_s *ndp; 4385 nce_t *nce = NULL; 4386 mblk_t *dlur_mp = NULL; 4387 4388 if (ill->ill_isv6) 4389 ndp = ill->ill_ipst->ips_ndp6; 4390 else 4391 ndp = ill->ill_ipst->ips_ndp4; 4392 4393 *retnce = NULL; 4394 4395 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 4396 4397 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 4398 ip0dbg(("nce_add_common: no addr\n")); 4399 return (EINVAL); 4400 } 4401 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 4402 ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); 4403 return (EINVAL); 4404 } 4405 4406 if (ill->ill_isv6) { 4407 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 4408 } else { 4409 ipaddr_t v4addr; 4410 4411 IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 4412 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); 4413 } 4414 4415 /* 4416 * The caller has ensured that there is no nce on ill, but there could 4417 * still be an nce_common_t for the address, so that we find exisiting 4418 * ncec_t strucutures first, and atomically add a new nce_t if 4419 * one is found. The ndp_g_lock ensures that we don't cross threads 4420 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not 4421 * compare for matches across the illgrp because this function is 4422 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, 4423 * with the nce_lookup_then_add_v* passing in the ipmp_ill where 4424 * appropriate. 
4425 */ 4426 ncec = *ncep; 4427 for (; ncec != NULL; ncec = ncec->ncec_next) { 4428 if (ncec->ncec_ill == ill) { 4429 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 4430 /* 4431 * We should never find *retnce to be 4432 * MYADDR, since the caller may then 4433 * incorrectly restart a DAD timer that's 4434 * already running. However, if we are in 4435 * forwarding mode, and the interface is 4436 * moving in/out of groups, the data 4437 * path ire lookup (e.g., ire_revalidate_nce) 4438 * may have determined that some destination 4439 * is offlink while the control path is adding 4440 * that address as a local address. 4441 * Recover from this case by failing the 4442 * lookup 4443 */ 4444 if (NCE_MYADDR(ncec)) 4445 return (ENXIO); 4446 *retnce = nce_ill_lookup_then_add(ill, ncec); 4447 if (*retnce != NULL) 4448 break; 4449 } 4450 } 4451 } 4452 if (*retnce != NULL) /* caller must trigger fastpath on nce */ 4453 return (0); 4454 4455 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); 4456 if (ncec == NULL) 4457 return (ENOMEM); 4458 *ncec = nce_nil; 4459 ncec->ncec_ill = ill; 4460 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 4461 ncec->ncec_flags = flags; 4462 ncec->ncec_ipst = ipst; /* No netstack_hold */ 4463 4464 if (!ill->ill_isv6) { 4465 ipaddr_t addr4; 4466 4467 /* 4468 * DAD probe interval and probe count are set based on 4469 * fast/slow probe settings. If the underlying link doesn't 4470 * have reliably up/down notifications or if we're working 4471 * with IPv4 169.254.0.0/16 Link Local Address space, then 4472 * don't use the fast timers. Otherwise, use them. 
4473 */ 4474 ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 4475 IN6_V4MAPPED_TO_IPADDR(addr, addr4); 4476 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) { 4477 fastprobe = B_TRUE; 4478 } else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) && 4479 !IS_IPV4_LL_SPACE(&addr4)) { 4480 ill_t *hwaddr_ill; 4481 4482 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr, 4483 hw_addr_len); 4484 if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link) 4485 fastprobe = B_TRUE; 4486 } 4487 if (fastprobe) { 4488 ncec->ncec_xmit_interval = 4489 ipst->ips_arp_fastprobe_interval; 4490 ncec->ncec_pcnt = 4491 ipst->ips_arp_fastprobe_count; 4492 ncec->ncec_flags |= NCE_F_FAST; 4493 } else { 4494 ncec->ncec_xmit_interval = 4495 ipst->ips_arp_probe_interval; 4496 ncec->ncec_pcnt = 4497 ipst->ips_arp_probe_count; 4498 } 4499 if (NCE_PUBLISH(ncec)) { 4500 ncec->ncec_unsolicit_count = 4501 ipst->ips_ip_arp_publish_count; 4502 } 4503 } else { 4504 /* 4505 * probe interval is constant: ILL_PROBE_INTERVAL 4506 * probe count is constant: ND_MAX_UNICAST_SOLICIT 4507 */ 4508 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 4509 if (NCE_PUBLISH(ncec)) { 4510 ncec->ncec_unsolicit_count = 4511 ipst->ips_ip_ndp_unsolicit_count; 4512 } 4513 } 4514 ncec->ncec_rcnt = ill->ill_xmit_count; 4515 ncec->ncec_addr = *addr; 4516 ncec->ncec_qd_mp = NULL; 4517 ncec->ncec_refcnt = 1; /* for ncec getting created */ 4518 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); 4519 ncec->ncec_trace_disable = B_FALSE; 4520 4521 /* 4522 * ncec_lladdr holds link layer address 4523 */ 4524 if (hw_addr_len > 0) { 4525 template = kmem_alloc(hw_addr_len, KM_NOSLEEP); 4526 if (template == NULL) { 4527 err = ENOMEM; 4528 goto err_ret; 4529 } 4530 ncec->ncec_lladdr = template; 4531 ncec->ncec_lladdr_length = hw_addr_len; 4532 bzero(ncec->ncec_lladdr, hw_addr_len); 4533 } 4534 if ((flags & NCE_F_BCAST) != 0) { 4535 state = ND_REACHABLE; 4536 ASSERT(hw_addr_len > 0); 4537 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 4538 state = ND_INITIAL; 
4539 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 4540 /* 4541 * NORESOLVER entries are always created in the REACHABLE 4542 * state. 4543 */ 4544 state = ND_REACHABLE; 4545 if (ill->ill_phys_addr_length == IP_ADDR_LEN && 4546 ill->ill_mactype != DL_IPV4 && 4547 ill->ill_mactype != DL_6TO4) { 4548 /* 4549 * We create a nce_res_mp with the IP nexthop address 4550 * as the destination address if the physical length 4551 * is exactly 4 bytes for point-to-multipoint links 4552 * that do their own resolution from IP to link-layer 4553 * address (e.g. IP over X.25). 4554 */ 4555 bcopy((uchar_t *)addr, 4556 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4557 } 4558 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && 4559 ill->ill_mactype != DL_IPV6) { 4560 /* 4561 * We create a nce_res_mp with the IP nexthop address 4562 * as the destination address if the physical legnth 4563 * is exactly 16 bytes for point-to-multipoint links 4564 * that do their own resolution from IP to link-layer 4565 * address. 4566 */ 4567 bcopy((uchar_t *)addr, 4568 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4569 } 4570 /* 4571 * Since NUD is not part of the base IPv4 protocol definition, 4572 * IPv4 neighbor entries on NORESOLVER interfaces will never 4573 * age, and are marked NCE_F_NONUD. 4574 */ 4575 if (!ill->ill_isv6) 4576 ncec->ncec_flags |= NCE_F_NONUD; 4577 } else if (ill->ill_net_type == IRE_LOOPBACK) { 4578 state = ND_REACHABLE; 4579 } 4580 4581 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { 4582 /* 4583 * We are adding an ncec with a deterministic hw_addr, 4584 * so the state can only be one of {REACHABLE, STALE, PROBE}. 4585 * 4586 * if we are adding a unicast ncec for the local address 4587 * it would be REACHABLE; we would be adding a ND_STALE entry 4588 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own 4589 * addresses are added in PROBE to trigger DAD. 
4590 */ 4591 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || 4592 ill->ill_net_type == IRE_IF_NORESOLVER) 4593 state = ND_REACHABLE; 4594 else if (!NCE_PUBLISH(ncec)) 4595 state = ND_STALE; 4596 else 4597 state = ND_PROBE; 4598 if (hw_addr != NULL) 4599 nce_set_ll(ncec, hw_addr); 4600 } 4601 /* caller overrides internally computed state */ 4602 if (nce_state != ND_UNCHANGED) 4603 state = nce_state; 4604 4605 if (state == ND_PROBE) 4606 ncec->ncec_flags |= NCE_F_UNVERIFIED; 4607 4608 ncec->ncec_state = state; 4609 4610 if (state == ND_REACHABLE) { 4611 ncec->ncec_last = ncec->ncec_init_time = 4612 TICK_TO_MSEC(ddi_get_lbolt64()); 4613 } else { 4614 ncec->ncec_last = 0; 4615 if (state == ND_INITIAL) 4616 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64()); 4617 } 4618 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), 4619 offsetof(ncec_cb_t, ncec_cb_node)); 4620 /* 4621 * have all the memory allocations out of the way before taking locks 4622 * and adding the nce. 4623 */ 4624 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4625 if (nce == NULL) { 4626 err = ENOMEM; 4627 goto err_ret; 4628 } 4629 if (ncec->ncec_lladdr != NULL || 4630 ill->ill_net_type == IRE_IF_NORESOLVER) { 4631 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4632 ill->ill_phys_addr_length, ill->ill_sap, 4633 ill->ill_sap_length); 4634 if (dlur_mp == NULL) { 4635 err = ENOMEM; 4636 goto err_ret; 4637 } 4638 } 4639 4640 /* 4641 * Atomically ensure that the ill is not CONDEMNED, before 4642 * adding the NCE. 
4643 */ 4644 mutex_enter(&ill->ill_lock); 4645 if (ill->ill_state_flags & ILL_CONDEMNED) { 4646 mutex_exit(&ill->ill_lock); 4647 err = EINVAL; 4648 goto err_ret; 4649 } 4650 if (!NCE_MYADDR(ncec) && 4651 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { 4652 mutex_exit(&ill->ill_lock); 4653 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); 4654 err = EINVAL; 4655 goto err_ret; 4656 } 4657 /* 4658 * Acquire the ncec_lock even before adding the ncec to the list 4659 * so that it cannot get deleted after the ncec is added, but 4660 * before we add the nce. 4661 */ 4662 mutex_enter(&ncec->ncec_lock); 4663 if ((ncec->ncec_next = *ncep) != NULL) 4664 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; 4665 *ncep = ncec; 4666 ncec->ncec_ptpn = ncep; 4667 4668 /* Bump up the number of ncec's referencing this ill */ 4669 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4670 (char *), "ncec", (void *), ncec); 4671 ill->ill_ncec_cnt++; 4672 /* 4673 * Since we hold the ncec_lock at this time, the ncec cannot be 4674 * condemned, and we can safely add the nce. 4675 */ 4676 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); 4677 mutex_exit(&ncec->ncec_lock); 4678 mutex_exit(&ill->ill_lock); 4679 4680 /* caller must trigger fastpath on *retnce */ 4681 return (0); 4682 4683 err_ret: 4684 if (ncec != NULL) 4685 kmem_cache_free(ncec_cache, ncec); 4686 if (nce != NULL) 4687 kmem_cache_free(nce_cache, nce); 4688 freemsg(dlur_mp); 4689 if (template != NULL) 4690 kmem_free(template, ill->ill_phys_addr_length); 4691 return (err); 4692 } 4693 4694 /* 4695 * take a ref on the nce 4696 */ 4697 void 4698 nce_refhold(nce_t *nce) 4699 { 4700 mutex_enter(&nce->nce_lock); 4701 nce->nce_refcnt++; 4702 ASSERT((nce)->nce_refcnt != 0); 4703 mutex_exit(&nce->nce_lock); 4704 } 4705 4706 /* 4707 * release a ref on the nce; In general, this 4708 * cannot be called with locks held because nce_inactive 4709 * may result in nce_inactive which will take the ill_lock, 4710 * do ipif_ill_refrele_tail etc. 
Thus the one exception 4711 * where this can be called with locks held is when the caller 4712 * is certain that the nce_refcnt is sufficient to prevent 4713 * the invocation of nce_inactive. 4714 */ 4715 void 4716 nce_refrele(nce_t *nce) 4717 { 4718 ASSERT((nce)->nce_refcnt != 0); 4719 mutex_enter(&nce->nce_lock); 4720 if (--nce->nce_refcnt == 0) 4721 nce_inactive(nce); /* destroys the mutex */ 4722 else 4723 mutex_exit(&nce->nce_lock); 4724 } 4725 4726 /* 4727 * free the nce after all refs have gone away. 4728 */ 4729 static void 4730 nce_inactive(nce_t *nce) 4731 { 4732 ill_t *ill = nce->nce_ill; 4733 4734 ASSERT(nce->nce_refcnt == 0); 4735 4736 ncec_refrele_notr(nce->nce_common); 4737 nce->nce_common = NULL; 4738 freemsg(nce->nce_fp_mp); 4739 freemsg(nce->nce_dlur_mp); 4740 4741 mutex_enter(&ill->ill_lock); 4742 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 4743 (char *), "nce", (void *), nce); 4744 ill->ill_nce_cnt--; 4745 nce->nce_ill = NULL; 4746 /* 4747 * If the number of ncec's associated with this ill have dropped 4748 * to zero, check whether we need to restart any operation that 4749 * is waiting for this to happen. 4750 */ 4751 if (ILL_DOWN_OK(ill)) { 4752 /* ipif_ill_refrele_tail drops the ill_lock */ 4753 ipif_ill_refrele_tail(ill); 4754 } else { 4755 mutex_exit(&ill->ill_lock); 4756 } 4757 4758 mutex_destroy(&nce->nce_lock); 4759 kmem_cache_free(nce_cache, nce); 4760 } 4761 4762 /* 4763 * Add an nce to the ill_nce list. 
*/
/*
 * nce_add_impl() initializes the caller-allocated `nce' from scratch (it is
 * zeroed first), ties it to `ncec' and `ill', takes ownership of `dlur_mp',
 * and inserts it at the head of ill->ill_nce.  The nce is returned with two
 * references: one for the calling thread and one for the list.  ncec_refcnt
 * and ill_nce_cnt are incremented directly, so the callers visible in this
 * file invoke it with both ill_lock and ncec_lock held.
 */
static nce_t *
nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
{
	bzero(nce, sizeof (*nce));
	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
	nce->nce_common = ncec;
	nce->nce_addr = ncec->ncec_addr;
	nce->nce_ill = ill;
	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
	    (char *), "nce", (void *), nce);
	ill->ill_nce_cnt++;

	nce->nce_refcnt = 1; /* for the thread */
	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
	nce->nce_dlur_mp = dlur_mp;

	/* add nce to the ill's fastpath list.  */
	nce->nce_refcnt++; /* for the list */
	list_insert_head(&ill->ill_nce, nce);
	return (nce);
}

/*
 * Allocate a new nce for `ncec' on `ill' and add it to the ill_nce list.
 *
 * Both ill_lock and ncec_lock must be held by the caller (ASSERTed), which
 * is why the allocations are KM_NOSLEEP.  A DL_UNITDATA request template is
 * generated when the ncec already has a link-layer address or the ill is
 * IRE_IF_NORESOLVER.  Returns the new nce (see nce_add_impl() for the
 * references it carries), or NULL if either allocation fails.
 */
static nce_t *
nce_add(ill_t *ill, ncec_t *ncec)
{
	nce_t	*nce;
	mblk_t	*dlur_mp = NULL;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
	if (nce == NULL)
		return (NULL);
	if (ncec->ncec_lladdr != NULL ||
	    ill->ill_net_type == IRE_IF_NORESOLVER) {
		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
		    ill->ill_phys_addr_length, ill->ill_sap,
		    ill->ill_sap_length);
		if (dlur_mp == NULL) {
			kmem_cache_free(nce_cache, nce);
			return (NULL);
		}
	}
	return (nce_add_impl(ill, ncec, nce, dlur_mp));
}

/*
 * remove the nce from the ill's fastpath (ill_nce) list
 *
 * The caller must hold ill_lock.  The nce is marked condemned under its own
 * nce_lock; if it was already condemned, another thread has done the removal
 * and there is nothing left to do.
 */
void
nce_delete(nce_t *nce)
{
	ill_t	*ill = nce->nce_ill;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	mutex_enter(&nce->nce_lock);
	if (nce->nce_is_condemned) {
		/*
		 * some other thread has removed this nce from the ill_nce list
		 */
		mutex_exit(&nce->nce_lock);
		return;
	}
	nce->nce_is_condemned = B_TRUE;
	mutex_exit(&nce->nce_lock);

	list_remove(&ill->ill_nce, nce);
	/*
	 * even though we are holding the ill_lock, it is ok to
	 * call nce_refrele here because we know that we should have
	 * at least 2 refs on the nce: one for the thread, and one
	 * for the list. The refrele below will release the one for
	 * the list.
	 */
	nce_refrele(nce);
}

/*
 * Find the nce for `addr' on ill's ill_nce list.
 *
 * The caller must hold ill_lock.  Returns the matching nce with an extra
 * reference taken via nce_refhold(), or NULL if there is no match.
 */
nce_t *
nce_lookup(ill_t *ill, const in6_addr_t *addr)
{
	nce_t	*nce = NULL;

	ASSERT(ill != NULL);
	ASSERT(MUTEX_HELD(&ill->ill_lock));

	for (nce = list_head(&ill->ill_nce); nce != NULL;
	    nce = list_next(&ill->ill_nce, nce)) {
		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
			break;
	}

	/*
	 * if we found the nce on the ill_nce list while holding
	 * the ill_lock, then it cannot be condemned yet.
	 */
	if (nce != NULL) {
		ASSERT(!nce->nce_is_condemned);
		nce_refhold(nce);
	}
	return (nce);
}

/*
 * Walk the ill_nce list on ill. The callback function func() cannot perform
 * any destructive actions.
 *
 * The caller must hold ill_lock.  func is invoked as func(ill, nce, arg) for
 * each nce; the walk stops early as soon as func returns non-zero.  The next
 * element is fetched before func is called.
 */
static void
nce_walk_common(ill_t *ill, pfi_t func, void *arg)
{
	nce_t	*nce = NULL, *nce_next;

	ASSERT(MUTEX_HELD(&ill->ill_lock));
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		nce_next = list_next(&ill->ill_nce, nce);
		if (func(ill, nce, arg) != 0)
			break;
		nce = nce_next;
	}
}

/*
 * Locked wrapper around nce_walk_common(): takes ill_lock for the duration
 * of the walk.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}

/*
 * Remove nce's from ill's ill_nce list.  With flushall set every nce is
 * removed; otherwise nce's whose ncec is NCE_PUBLISH are left in place.
 *
 * Each removed nce is parked on a local list while ill_lock is held, and the
 * extra reference taken here is only dropped after ill_lock has been
 * released, since that release may be the last one.
 */
void
nce_flush(ill_t *ill, boolean_t flushall)
{
	nce_t	*nce, *nce_next;
	list_t	dead;

	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
	mutex_enter(&ill->ill_lock);
	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
		nce_next = list_next(&ill->ill_nce, nce);
		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
			nce = nce_next;
			continue;
		}
		/*
		 * nce_delete requires that the caller should either not
		 * be holding locks, or should hold a ref to ensure that
		 * we won't hit ncec_inactive. So take a ref and clean up
		 * after the list is flushed.
		 */
		nce_refhold(nce);
		nce_delete(nce);
		/* nce_delete() unlinked nce_node, so it can be reused here */
		list_insert_tail(&dead, nce);
		nce = nce_next;
	}
	mutex_exit(&ill->ill_lock);
	while ((nce = list_head(&dead)) != NULL) {
		list_remove(&dead, nce);
		nce_refrele(nce);
	}
	ASSERT(list_is_empty(&dead));
	list_destroy(&dead);
}

/*
 * Randomize a timer interval.
 *
 * If initial_time is set, return a value anywhere in the [1 .. intv] range
 * (or 1 when intv <= 0).  Otherwise return intv perturbed by up to +/- frac,
 * where frac is intv / 5, replaced by 2 when that quotient is <= 1; a
 * non-positive result is replaced by 1.
 */
static clock_t
nce_fuzz_interval(clock_t intv, boolean_t initial_time)
{
	clock_t	rnd, frac;

	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
	/* Note that clock_t is signed; must chop off bits */
	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
	if (initial_time) {
		if (intv <= 0)
			intv = 1;
		else
			intv = (rnd % intv) + 1;
	} else {
		/* Compute 'frac' as 20% of the configured interval */
		if ((frac = intv / 5) <= 1)
			frac = 2;
		/* Set intv randomly in the range [intv-frac .. intv+frac] */
		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
			intv = 1;
	}
	return (intv);
}

/*
 * Transmit the packets that were queued on an IPMP ncec (ncec_qd_mp).
 *
 * The queued chain and the probe count are detached from the ncec under
 * ncec_lock.  Each packet is then sent through an nce created on an
 * underlying ill of the group (nce_fastpath_create()): the ill that owns the
 * packet's source address when that is a non-IPMP ill, otherwise the ill
 * returned by ipmp_ill_get_xmit_ill().  Packets for which no such nce can be
 * set up are dropped and counted in ipIfStatsOutDiscards.  Once the chain is
 * drained, ncec_cb_dispatch() is called.
 */
void
nce_resolv_ipmp_ok(ncec_t *ncec)
{
	mblk_t		*mp;
	uint_t		pkt_len;
	iaflags_t	ixaflags = IXAF_NO_TRACE;
	nce_t		*under_nce;
	ill_t		*ill = ncec->ncec_ill;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	ipif_t		*src_ipif = NULL;
	ip_stack_t	*ipst = ill->ill_ipst;
	ill_t		*send_ill;
	uint_t		nprobes;

	ASSERT(IS_IPMP(ill));

	mutex_enter(&ncec->ncec_lock);
	nprobes = ncec->ncec_nprobes;
	mp = ncec->ncec_qd_mp;
	ncec->ncec_qd_mp = NULL;
	ncec->ncec_nprobes = 0;
	mutex_exit(&ncec->ncec_lock);

	while (mp != NULL) {
		mblk_t	*nxt_mp;

		/* unlink this packet from the chain before handing it off */
		nxt_mp = mp->b_next;
		mp->b_next = NULL;
		if (isv6) {
			ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
			    ill, ALL_ZONES, ipst);
		} else {
			ipha_t	*ipha = (ipha_t *)mp->b_rptr;

			ixaflags |= IXAF_IS_IPV4;
			pkt_len = ntohs(ipha->ipha_length);
			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
			    ill, ALL_ZONES, ipst);
		}

		/*
		 * find a new nce based on an under_ill. The first IPMP probe
		 * packet gets queued, so we could still find a src_ipif that
		 * matches an IPMP test address.
		 */
		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
			/*
			 * if src_ipif is null, this could be either a
			 * forwarded packet or a probe whose src got deleted.
			 * We tell the two apart using the saved ncec_nprobes
			 * count: the first ncec_nprobes packets are probes,
			 * so a packet without a src_ipif while nprobes is
			 * still positive is a probe whose src got deleted,
			 * and it is dropped.
			 */
			if (src_ipif == NULL && nprobes > 0)
				goto drop_pkt;

			/*
			 * For forwarded packets, we use the ipmp rotor
			 * to find send_ill.
			 */
			send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
			    B_TRUE);
		} else {
			send_ill = src_ipif->ipif_ill;
			ill_refhold(send_ill);
		}

		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
		    (ncec_t *), ncec, (ipif_t *),
		    src_ipif, (ill_t *), send_ill);

		if (send_ill == NULL) {
			if (src_ipif != NULL)
				ipif_refrele(src_ipif);
			goto drop_pkt;
		}
		/* create an under_nce on send_ill */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
			under_nce = nce_fastpath_create(send_ill, ncec);
		else
			under_nce = NULL;
		rw_exit(&ipst->ips_ill_g_lock);
		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
			nce_fastpath_trigger(under_nce);

		ill_refrele(send_ill);
		if (src_ipif != NULL)
			ipif_refrele(src_ipif);

		if (under_nce != NULL) {
			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
			    ALL_ZONES, 0, NULL);
			nce_refrele(under_nce);
			if (nprobes > 0)
				nprobes--;
			mp = nxt_mp;
			continue;
		}
drop_pkt:
		if (isv6) {
			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
		} else {
			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
		}
		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
		freemsg(mp);
		if (nprobes > 0)
			nprobes--;
		mp = nxt_mp;
	}
	ncec_cb_dispatch(ncec); /* complete callbacks */
}