1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #include <sys/stropts.h> 29 #include <sys/strsun.h> 30 #include <sys/sysmacros.h> 31 #include <sys/errno.h> 32 #include <sys/dlpi.h> 33 #include <sys/socket.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/debug.h> 38 #include <sys/vtrace.h> 39 #include <sys/kmem.h> 40 #include <sys/zone.h> 41 #include <sys/ethernet.h> 42 #include <sys/sdt.h> 43 #include <sys/mac.h> 44 45 #include <net/if.h> 46 #include <net/if_types.h> 47 #include <net/if_dl.h> 48 #include <net/route.h> 49 #include <netinet/in.h> 50 #include <netinet/ip6.h> 51 #include <netinet/icmp6.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/nd.h> 57 #include <inet/ip.h> 58 #include <inet/ip_impl.h> 59 #include <inet/ipclassifier.h> 60 #include <inet/ip_if.h> 61 #include <inet/ip_ire.h> 62 #include <inet/ip_rts.h> 63 #include <inet/ip6.h> 64 #include <inet/ip_ndp.h> 65 #include <inet/sctp_ip.h> 66 
#include <inet/ip_arp.h>
#include <inet/ip2mac_impl.h>

/*
 * Interval selectors for unsolicited announcements and address defense.
 * Both macros expand to references through a variable named `ipst', so they
 * can only be used where an ip_stack_t pointer of that name is in scope.
 * The argument is parenthesized so that an expression may be passed safely.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 * The macro tests whether the first two bytes at `ptr' are 169 and 254.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
				((uchar_t *)(ptr))[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK \
	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)

/*
 * Function names with nce_ prefix are static while function
 * names with ndp_ prefix are used by rest of the IP.
 *
 * Lock ordering:
 *
 *	ndp_g_lock -> ill_lock -> ncec_lock
 *
 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 * ncec_refcnt).
111 */ 112 113 static void nce_cleanup_list(ncec_t *ncec); 114 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr); 115 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *, 116 ncec_t *); 117 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *); 118 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr, 119 uint16_t ncec_flags, nce_t **newnce); 120 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 121 uint16_t ncec_flags, nce_t **newnce); 122 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation, 123 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender, 124 const in6_addr_t *target, int flag); 125 static void ncec_refhold_locked(ncec_t *); 126 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); 127 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); 128 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 129 uint16_t, uint16_t, nce_t **); 130 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *); 131 static nce_t *nce_add(ill_t *, ncec_t *); 132 static void nce_inactive(nce_t *); 133 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); 134 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); 135 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 136 uint16_t, uint16_t, nce_t **); 137 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *, 138 uint16_t, uint16_t, nce_t **); 139 static int nce_add_v6_postprocess(nce_t *); 140 static int nce_add_v4_postprocess(nce_t *); 141 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *); 142 static clock_t nce_fuzz_interval(clock_t, boolean_t); 143 static void nce_resolv_ipmp_ok(ncec_t *); 144 static void nce_walk_common(ill_t *, pfi_t, void *); 145 static void nce_start_timer(ncec_t *, uint_t); 146 static nce_t *nce_fastpath_create(ill_t *, ncec_t *); 147 static void nce_fastpath_trigger(nce_t *); 148 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *); 149 150 #ifdef DEBUG 
151 static void ncec_trace_cleanup(const ncec_t *); 152 #endif 153 154 #define NCE_HASH_PTR_V4(ipst, addr) \ 155 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])) 156 157 #define NCE_HASH_PTR_V6(ipst, addr) \ 158 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ 159 NCE_TABLE_SIZE)])) 160 161 extern kmem_cache_t *ncec_cache; 162 extern kmem_cache_t *nce_cache; 163 164 /* 165 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe 166 * If src_ill is not null, the ncec_addr is bound to src_ill. The 167 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where 168 * the probe is sent on the ncec_ill (in the non-IPMP case) or the 169 * IPMP cast_ill (in the IPMP case). 170 * 171 * Note that the probe interval is based on ncec->ncec_ill which 172 * may be the ipmp_ill. 173 */ 174 static void 175 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) 176 { 177 boolean_t dropped; 178 uint32_t probe_interval; 179 180 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST)); 181 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST)); 182 if (ncec->ncec_ipversion == IPV6_VERSION) { 183 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 184 ncec->ncec_lladdr, ncec->ncec_lladdr_length, 185 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); 186 probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill); 187 } else { 188 /* IPv4 DAD delay the initial probe. */ 189 if (send_probe) 190 dropped = arp_probe(ncec); 191 else 192 dropped = B_TRUE; 193 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval, 194 !send_probe); 195 } 196 if (!dropped) { 197 mutex_enter(&ncec->ncec_lock); 198 ncec->ncec_pcnt--; 199 mutex_exit(&ncec->ncec_lock); 200 } 201 nce_restart_timer(ncec, probe_interval); 202 } 203 204 /* 205 * Compute default flags to use for an advertisement of this ncec's address. 
206 */ 207 static int 208 nce_advert_flags(const ncec_t *ncec) 209 { 210 int flag = 0; 211 212 if (ncec->ncec_flags & NCE_F_ISROUTER) 213 flag |= NDP_ISROUTER; 214 if (!(ncec->ncec_flags & NCE_F_ANYCAST)) 215 flag |= NDP_ORIDE; 216 217 return (flag); 218 } 219 220 /* 221 * NDP Cache Entry creation routine. 222 * Mapped entries will never do NUD . 223 * This routine must always be called with ndp6->ndp_g_lock held. 224 */ 225 int 226 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 227 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 228 { 229 int err; 230 nce_t *nce; 231 232 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 233 ASSERT(ill != NULL && ill->ill_isv6); 234 235 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state, 236 &nce); 237 if (err != 0) 238 return (err); 239 ASSERT(newnce != NULL); 240 *newnce = nce; 241 return (err); 242 } 243 244 /* 245 * Post-processing routine to be executed after nce_add_v6(). This function 246 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 247 * and must be called without any locks held. 248 */ 249 int 250 nce_add_v6_postprocess(nce_t *nce) 251 { 252 ncec_t *ncec = nce->nce_common; 253 boolean_t dropped = B_FALSE; 254 uchar_t *hw_addr = ncec->ncec_lladdr; 255 uint_t hw_addr_len = ncec->ncec_lladdr_length; 256 ill_t *ill = ncec->ncec_ill; 257 int err = 0; 258 uint16_t flags = ncec->ncec_flags; 259 ip_stack_t *ipst = ill->ill_ipst; 260 boolean_t trigger_fastpath = B_TRUE; 261 262 /* 263 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 264 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
265 * We call nce_fastpath from nce_update if the link layer address of 266 * the peer changes from nce_update 267 */ 268 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || 269 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER)) 270 trigger_fastpath = B_FALSE; 271 272 if (trigger_fastpath) 273 nce_fastpath_trigger(nce); 274 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 275 ill_t *hwaddr_ill; 276 /* 277 * Unicast entry that needs DAD. 278 */ 279 if (IS_IPMP(ill)) { 280 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 281 hw_addr, hw_addr_len); 282 } else { 283 hwaddr_ill = ill; 284 } 285 nce_dad(ncec, hwaddr_ill, B_TRUE); 286 err = EINPROGRESS; 287 } else if (flags & NCE_F_UNSOL_ADV) { 288 /* 289 * We account for the transmit below by assigning one 290 * less than the ndd variable. Subsequent decrements 291 * are done in nce_timer. 292 */ 293 mutex_enter(&ncec->ncec_lock); 294 ncec->ncec_unsolicit_count = 295 ipst->ips_ip_ndp_unsolicit_count - 1; 296 mutex_exit(&ncec->ncec_lock); 297 dropped = ndp_xmit(ill, 298 ND_NEIGHBOR_ADVERT, 299 hw_addr, 300 hw_addr_len, 301 &ncec->ncec_addr, /* Source and target of the adv */ 302 &ipv6_all_hosts_mcast, /* Destination of the packet */ 303 nce_advert_flags(ncec)); 304 mutex_enter(&ncec->ncec_lock); 305 if (dropped) 306 ncec->ncec_unsolicit_count++; 307 else 308 ncec->ncec_last_time_defended = ddi_get_lbolt(); 309 if (ncec->ncec_unsolicit_count != 0) { 310 nce_start_timer(ncec, 311 ipst->ips_ip_ndp_unsolicit_interval); 312 } 313 mutex_exit(&ncec->ncec_lock); 314 } 315 return (err); 316 } 317 318 /* 319 * Atomically lookup and add (if needed) Neighbor Cache information for 320 * an address. 321 * 322 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 323 * are always added pointing at the ipmp_ill. Thus, when the ill passed 324 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 325 * entries will be created, both pointing at the same ncec_t. 
The nce_t 326 * entries will have their nce_ill set to the ipmp_ill and the under_ill 327 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 328 * Local addresses are always created on the ill passed to nce_add_v6. 329 */ 330 int 331 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 332 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 333 { 334 int err = 0; 335 ip_stack_t *ipst = ill->ill_ipst; 336 nce_t *nce, *upper_nce = NULL; 337 ill_t *in_ill = ill; 338 boolean_t need_ill_refrele = B_FALSE; 339 340 if (flags & NCE_F_MCAST) { 341 /* 342 * hw_addr will be figured out in nce_set_multicast_v6; 343 * caller has to select the cast_ill 344 */ 345 ASSERT(hw_addr == NULL); 346 ASSERT(!IS_IPMP(ill)); 347 err = nce_set_multicast_v6(ill, addr, flags, newnce); 348 return (err); 349 } 350 ASSERT(ill->ill_isv6); 351 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 352 ill = ipmp_ill_hold_ipmp_ill(ill); 353 if (ill == NULL) 354 return (ENXIO); 355 need_ill_refrele = B_TRUE; 356 } 357 358 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 359 nce = nce_lookup_addr(ill, addr); 360 if (nce == NULL) { 361 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, 362 &nce); 363 } else { 364 err = EEXIST; 365 } 366 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 367 if (err == 0) 368 err = nce_add_v6_postprocess(nce); 369 if (in_ill != ill && nce != NULL) { 370 nce_t *under_nce; 371 372 /* 373 * in_ill was the under_ill. Try to create the under_nce. 374 * Hold the ill_g_lock to prevent changes to group membership 375 * until we are done. 
376 */ 377 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 378 if (IS_IN_SAME_ILLGRP(in_ill, ill)) { 379 under_nce = nce_fastpath_create(in_ill, 380 nce->nce_common); 381 upper_nce = nce; 382 if ((nce = under_nce) == NULL) 383 err = EINVAL; 384 } 385 rw_exit(&ipst->ips_ill_g_lock); 386 if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common)) 387 nce_fastpath_trigger(under_nce); 388 } 389 if (nce != NULL) { 390 if (newnce != NULL) 391 *newnce = nce; 392 else 393 nce_refrele(nce); 394 } 395 /* nce_refrele is deferred until the lock is dropped */ 396 if (upper_nce != NULL) 397 nce_refrele(upper_nce); 398 if (need_ill_refrele) 399 ill_refrele(ill); 400 return (err); 401 } 402 403 /* 404 * Remove all the CONDEMNED nces from the appropriate hash table. 405 * We create a private list of NCEs, these may have ires pointing 406 * to them, so the list will be passed through to clean up dependent 407 * ires and only then we can do ncec_refrele() which can make NCE inactive. 408 */ 409 static void 410 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) 411 { 412 ncec_t *ncec1; 413 ncec_t **ptpn; 414 415 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 416 ASSERT(ndp->ndp_g_walker == 0); 417 for (; ncec; ncec = ncec1) { 418 ncec1 = ncec->ncec_next; 419 mutex_enter(&ncec->ncec_lock); 420 if (NCE_ISCONDEMNED(ncec)) { 421 ptpn = ncec->ncec_ptpn; 422 ncec1 = ncec->ncec_next; 423 if (ncec1 != NULL) 424 ncec1->ncec_ptpn = ptpn; 425 *ptpn = ncec1; 426 ncec->ncec_ptpn = NULL; 427 ncec->ncec_next = NULL; 428 ncec->ncec_next = *free_nce_list; 429 *free_nce_list = ncec; 430 } 431 mutex_exit(&ncec->ncec_lock); 432 } 433 } 434 435 /* 436 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup() 437 * will return this NCE. Also no new timeouts will 438 * be started (See nce_restart_timer). 439 * 2. Cancel any currently running timeouts. 440 * 3. If there is an ndp walker, return. The walker will do the cleanup. 
441 * This ensures that walkers see a consistent list of NCEs while walking. 442 * 4. Otherwise remove the NCE from the list of NCEs 443 */ 444 void 445 ncec_delete(ncec_t *ncec) 446 { 447 ncec_t **ptpn; 448 ncec_t *ncec1; 449 int ipversion = ncec->ncec_ipversion; 450 ndp_g_t *ndp; 451 ip_stack_t *ipst = ncec->ncec_ipst; 452 453 if (ipversion == IPV4_VERSION) 454 ndp = ipst->ips_ndp4; 455 else 456 ndp = ipst->ips_ndp6; 457 458 /* Serialize deletes */ 459 mutex_enter(&ncec->ncec_lock); 460 if (NCE_ISCONDEMNED(ncec)) { 461 /* Some other thread is doing the delete */ 462 mutex_exit(&ncec->ncec_lock); 463 return; 464 } 465 /* 466 * Caller has a refhold. Also 1 ref for being in the list. Thus 467 * refcnt has to be >= 2 468 */ 469 ASSERT(ncec->ncec_refcnt >= 2); 470 ncec->ncec_flags |= NCE_F_CONDEMNED; 471 mutex_exit(&ncec->ncec_lock); 472 473 /* Count how many condemned ires for kmem_cache callback */ 474 atomic_add_32(&ipst->ips_num_nce_condemned, 1); 475 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 476 477 /* Complete any waiting callbacks */ 478 ncec_cb_dispatch(ncec); 479 480 /* 481 * Cancel any running timer. Timeout can't be restarted 482 * since CONDEMNED is set. Can't hold ncec_lock across untimeout. 483 * Passing invalid timeout id is fine. 484 */ 485 if (ncec->ncec_timeout_id != 0) { 486 (void) untimeout(ncec->ncec_timeout_id); 487 ncec->ncec_timeout_id = 0; 488 } 489 490 mutex_enter(&ndp->ndp_g_lock); 491 if (ncec->ncec_ptpn == NULL) { 492 /* 493 * The last ndp walker has already removed this ncec from 494 * the list after we marked the ncec CONDEMNED and before 495 * we grabbed the global lock. 496 */ 497 mutex_exit(&ndp->ndp_g_lock); 498 return; 499 } 500 if (ndp->ndp_g_walker > 0) { 501 /* 502 * Can't unlink. The walker will clean up 503 */ 504 ndp->ndp_g_walker_cleanup = B_TRUE; 505 mutex_exit(&ndp->ndp_g_lock); 506 return; 507 } 508 509 /* 510 * Now remove the ncec from the list. 
nce_restart_timer won't restart 511 * the timer since it is marked CONDEMNED. 512 */ 513 ptpn = ncec->ncec_ptpn; 514 ncec1 = ncec->ncec_next; 515 if (ncec1 != NULL) 516 ncec1->ncec_ptpn = ptpn; 517 *ptpn = ncec1; 518 ncec->ncec_ptpn = NULL; 519 ncec->ncec_next = NULL; 520 mutex_exit(&ndp->ndp_g_lock); 521 522 /* Removed from ncec_ptpn/ncec_next list */ 523 ncec_refrele_notr(ncec); 524 } 525 526 void 527 ncec_inactive(ncec_t *ncec) 528 { 529 mblk_t **mpp; 530 ill_t *ill = ncec->ncec_ill; 531 ip_stack_t *ipst = ncec->ncec_ipst; 532 533 ASSERT(ncec->ncec_refcnt == 0); 534 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 535 536 /* Count how many condemned nces for kmem_cache callback */ 537 if (NCE_ISCONDEMNED(ncec)) 538 atomic_add_32(&ipst->ips_num_nce_condemned, -1); 539 540 /* Free all allocated messages */ 541 mpp = &ncec->ncec_qd_mp; 542 while (*mpp != NULL) { 543 mblk_t *mp; 544 545 mp = *mpp; 546 *mpp = mp->b_next; 547 548 inet_freemsg(mp); 549 } 550 /* 551 * must have been cleaned up in ncec_delete 552 */ 553 ASSERT(list_is_empty(&ncec->ncec_cb)); 554 list_destroy(&ncec->ncec_cb); 555 /* 556 * free the ncec_lladdr if one was allocated in nce_add_common() 557 */ 558 if (ncec->ncec_lladdr_length > 0) 559 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length); 560 561 #ifdef DEBUG 562 ncec_trace_cleanup(ncec); 563 #endif 564 565 mutex_enter(&ill->ill_lock); 566 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 567 (char *), "ncec", (void *), ncec); 568 ill->ill_ncec_cnt--; 569 ncec->ncec_ill = NULL; 570 /* 571 * If the number of ncec's associated with this ill have dropped 572 * to zero, check whether we need to restart any operation that 573 * is waiting for this to happen. 574 */ 575 if (ILL_DOWN_OK(ill)) { 576 /* ipif_ill_refrele_tail drops the ill_lock */ 577 ipif_ill_refrele_tail(ill); 578 } else { 579 mutex_exit(&ill->ill_lock); 580 } 581 582 mutex_destroy(&ncec->ncec_lock); 583 kmem_cache_free(ncec_cache, ncec); 584 } 585 586 /* 587 * ncec_walk routine. 
Delete the ncec if it is associated with the ill 588 * that is going away. Always called as a writer. 589 */ 590 void 591 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg) 592 { 593 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) { 594 ncec_delete(ncec); 595 } 596 } 597 598 /* 599 * Neighbor Cache cleanup logic for a list of ncec_t entries. 600 */ 601 static void 602 nce_cleanup_list(ncec_t *ncec) 603 { 604 ncec_t *ncec_next; 605 606 ASSERT(ncec != NULL); 607 while (ncec != NULL) { 608 ncec_next = ncec->ncec_next; 609 ncec->ncec_next = NULL; 610 611 /* 612 * It is possible for the last ndp walker (this thread) 613 * to come here after ncec_delete has marked the ncec CONDEMNED 614 * and before it has removed the ncec from the fastpath list 615 * or called untimeout. So we need to do it here. It is safe 616 * for both ncec_delete and this thread to do it twice or 617 * even simultaneously since each of the threads has a 618 * reference on the ncec. 619 */ 620 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 621 /* 622 * Cancel any running timer. Timeout can't be restarted 623 * since CONDEMNED is set. The ncec_lock can't be 624 * held across untimeout though passing invalid timeout 625 * id is fine. 626 */ 627 if (ncec->ncec_timeout_id != 0) { 628 (void) untimeout(ncec->ncec_timeout_id); 629 ncec->ncec_timeout_id = 0; 630 } 631 /* Removed from ncec_ptpn/ncec_next list */ 632 ncec_refrele_notr(ncec); 633 ncec = ncec_next; 634 } 635 } 636 637 /* 638 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 
639 */ 640 boolean_t 641 nce_restart_dad(ncec_t *ncec) 642 { 643 boolean_t started; 644 ill_t *ill, *hwaddr_ill; 645 646 if (ncec == NULL) 647 return (B_FALSE); 648 ill = ncec->ncec_ill; 649 mutex_enter(&ncec->ncec_lock); 650 if (ncec->ncec_state == ND_PROBE) { 651 mutex_exit(&ncec->ncec_lock); 652 started = B_TRUE; 653 } else if (ncec->ncec_state == ND_REACHABLE) { 654 ASSERT(ncec->ncec_lladdr != NULL); 655 ncec->ncec_state = ND_PROBE; 656 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 657 /* 658 * Slight cheat here: we don't use the initial probe delay 659 * for IPv4 in this obscure case. 660 */ 661 mutex_exit(&ncec->ncec_lock); 662 if (IS_IPMP(ill)) { 663 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 664 ncec->ncec_lladdr, ncec->ncec_lladdr_length); 665 } else { 666 hwaddr_ill = ill; 667 } 668 nce_dad(ncec, hwaddr_ill, B_TRUE); 669 started = B_TRUE; 670 } else { 671 mutex_exit(&ncec->ncec_lock); 672 started = B_FALSE; 673 } 674 return (started); 675 } 676 677 /* 678 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. 679 * If one is found, the refcnt on the ncec will be incremented. 680 */ 681 ncec_t * 682 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) 683 { 684 ncec_t *ncec; 685 ip_stack_t *ipst = ill->ill_ipst; 686 687 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 688 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 689 690 /* Get head of v6 hash table */ 691 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 692 ncec = ncec_lookup_illgrp(ill, addr, ncec); 693 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 694 rw_exit(&ipst->ips_ill_g_lock); 695 return (ncec); 696 } 697 /* 698 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. 699 * If one is found, the refcnt on the ncec will be incremented. 
700 */ 701 ncec_t * 702 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) 703 { 704 ncec_t *ncec = NULL; 705 in6_addr_t addr6; 706 ip_stack_t *ipst = ill->ill_ipst; 707 708 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 709 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 710 711 /* Get head of v4 hash table */ 712 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); 713 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 714 ncec = ncec_lookup_illgrp(ill, &addr6, ncec); 715 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 716 rw_exit(&ipst->ips_ill_g_lock); 717 return (ncec); 718 } 719 720 /* 721 * Cache entry lookup. Try to find an ncec matching the parameters passed. 722 * If an ncec is found, increment the hold count on that ncec. 723 * The caller passes in the start of the appropriate hash table, and must 724 * be holding the appropriate global lock (ndp_g_lock). In addition, since 725 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock 726 * must be held as reader. 727 * 728 * This function always matches across the ipmp group. 729 */ 730 ncec_t * 731 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) 732 { 733 ndp_g_t *ndp; 734 ip_stack_t *ipst = ill->ill_ipst; 735 736 if (ill->ill_isv6) 737 ndp = ipst->ips_ndp6; 738 else 739 ndp = ipst->ips_ndp4; 740 741 ASSERT(ill != NULL); 742 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 743 if (IN6_IS_ADDR_UNSPECIFIED(addr)) 744 return (NULL); 745 for (; ncec != NULL; ncec = ncec->ncec_next) { 746 if (ncec->ncec_ill == ill || 747 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { 748 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 749 mutex_enter(&ncec->ncec_lock); 750 if (!NCE_ISCONDEMNED(ncec)) { 751 ncec_refhold_locked(ncec); 752 mutex_exit(&ncec->ncec_lock); 753 break; 754 } 755 mutex_exit(&ncec->ncec_lock); 756 } 757 } 758 } 759 return (ncec); 760 } 761 762 /* 763 * Find an nce_t on ill with nce_addr == addr. 
Lookup the nce_t 764 * entries for ill only, i.e., when ill is part of an ipmp group, 765 * nce_lookup_v4 will never try to match across the group. 766 */ 767 nce_t * 768 nce_lookup_v4(ill_t *ill, const in_addr_t *addr) 769 { 770 nce_t *nce; 771 in6_addr_t addr6; 772 ip_stack_t *ipst = ill->ill_ipst; 773 774 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 775 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 776 nce = nce_lookup_addr(ill, &addr6); 777 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 778 return (nce); 779 } 780 781 /* 782 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t 783 * entries for ill only, i.e., when ill is part of an ipmp group, 784 * nce_lookup_v6 will never try to match across the group. 785 */ 786 nce_t * 787 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6) 788 { 789 nce_t *nce; 790 ip_stack_t *ipst = ill->ill_ipst; 791 792 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 793 nce = nce_lookup_addr(ill, addr6); 794 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 795 return (nce); 796 } 797 798 static nce_t * 799 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr) 800 { 801 nce_t *nce; 802 803 ASSERT(ill != NULL); 804 #ifdef DEBUG 805 if (ill->ill_isv6) 806 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 807 else 808 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 809 #endif 810 mutex_enter(&ill->ill_lock); 811 nce = nce_lookup(ill, addr); 812 mutex_exit(&ill->ill_lock); 813 return (nce); 814 } 815 816 817 /* 818 * Router turned to host. We need to make sure that cached copies of the ncec 819 * are not used for forwarding packets if they were derived from the default 820 * route, and that the default route itself is removed, as required by 821 * section 7.2.5 of RFC 2461. 822 * 823 * Note that the ncec itself probably has valid link-layer information for the 824 * nexthop, so that there is no reason to delete the ncec, as long as the 825 * ISROUTER flag is turned off. 
826 */ 827 static void 828 ncec_router_to_host(ncec_t *ncec) 829 { 830 ire_t *ire; 831 ip_stack_t *ipst = ncec->ncec_ipst; 832 833 mutex_enter(&ncec->ncec_lock); 834 ncec->ncec_flags &= ~NCE_F_ISROUTER; 835 mutex_exit(&ncec->ncec_lock); 836 837 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, 838 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL, 839 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL); 840 if (ire != NULL) { 841 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 842 ire_delete(ire); 843 ire_refrele(ire); 844 } 845 } 846 847 /* 848 * Process passed in parameters either from an incoming packet or via 849 * user ioctl. 850 */ 851 void 852 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) 853 { 854 ill_t *ill = ncec->ncec_ill; 855 uint32_t hw_addr_len = ill->ill_phys_addr_length; 856 boolean_t ll_updated = B_FALSE; 857 boolean_t ll_changed; 858 nce_t *nce; 859 860 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 861 /* 862 * No updates of link layer address or the neighbor state is 863 * allowed, when the cache is in NONUD state. This still 864 * allows for responding to reachability solicitation. 865 */ 866 mutex_enter(&ncec->ncec_lock); 867 if (ncec->ncec_state == ND_INCOMPLETE) { 868 if (hw_addr == NULL) { 869 mutex_exit(&ncec->ncec_lock); 870 return; 871 } 872 nce_set_ll(ncec, hw_addr); 873 /* 874 * Update ncec state and send the queued packets 875 * back to ip this time ire will be added. 
876 */ 877 if (flag & ND_NA_FLAG_SOLICITED) { 878 nce_update(ncec, ND_REACHABLE, NULL); 879 } else { 880 nce_update(ncec, ND_STALE, NULL); 881 } 882 mutex_exit(&ncec->ncec_lock); 883 nce = nce_fastpath(ncec, B_TRUE, NULL); 884 nce_resolv_ok(ncec); 885 if (nce != NULL) 886 nce_refrele(nce); 887 return; 888 } 889 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len); 890 if (!is_adv) { 891 /* If this is a SOLICITATION request only */ 892 if (ll_changed) 893 nce_update(ncec, ND_STALE, hw_addr); 894 mutex_exit(&ncec->ncec_lock); 895 ncec_cb_dispatch(ncec); 896 return; 897 } 898 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { 899 /* If in any other state than REACHABLE, ignore */ 900 if (ncec->ncec_state == ND_REACHABLE) { 901 nce_update(ncec, ND_STALE, NULL); 902 } 903 mutex_exit(&ncec->ncec_lock); 904 ncec_cb_dispatch(ncec); 905 return; 906 } else { 907 if (ll_changed) { 908 nce_update(ncec, ND_UNCHANGED, hw_addr); 909 ll_updated = B_TRUE; 910 } 911 if (flag & ND_NA_FLAG_SOLICITED) { 912 nce_update(ncec, ND_REACHABLE, NULL); 913 } else { 914 if (ll_updated) { 915 nce_update(ncec, ND_STALE, NULL); 916 } 917 } 918 mutex_exit(&ncec->ncec_lock); 919 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags & 920 NCE_F_ISROUTER)) { 921 ncec_router_to_host(ncec); 922 } else { 923 ncec_cb_dispatch(ncec); 924 } 925 } 926 } 927 928 /* 929 * Pass arg1 to the pfi supplied, along with each ncec in existence. 930 * ncec_walk() places a REFHOLD on the ncec and drops the lock when 931 * walking the hash list. 
932 */ 933 void 934 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, 935 boolean_t trace) 936 { 937 ncec_t *ncec; 938 ncec_t *ncec1; 939 ncec_t **ncep; 940 ncec_t *free_nce_list = NULL; 941 942 mutex_enter(&ndp->ndp_g_lock); 943 /* Prevent ncec_delete from unlink and free of NCE */ 944 ndp->ndp_g_walker++; 945 mutex_exit(&ndp->ndp_g_lock); 946 for (ncep = ndp->nce_hash_tbl; 947 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 948 for (ncec = *ncep; ncec != NULL; ncec = ncec1) { 949 ncec1 = ncec->ncec_next; 950 if (ill == NULL || ncec->ncec_ill == ill) { 951 if (trace) { 952 ncec_refhold(ncec); 953 (*pfi)(ncec, arg1); 954 ncec_refrele(ncec); 955 } else { 956 ncec_refhold_notr(ncec); 957 (*pfi)(ncec, arg1); 958 ncec_refrele_notr(ncec); 959 } 960 } 961 } 962 } 963 mutex_enter(&ndp->ndp_g_lock); 964 ndp->ndp_g_walker--; 965 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { 966 /* Time to delete condemned entries */ 967 for (ncep = ndp->nce_hash_tbl; 968 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 969 ncec = *ncep; 970 if (ncec != NULL) { 971 nce_remove(ndp, ncec, &free_nce_list); 972 } 973 } 974 ndp->ndp_g_walker_cleanup = B_FALSE; 975 } 976 977 mutex_exit(&ndp->ndp_g_lock); 978 979 if (free_nce_list != NULL) { 980 nce_cleanup_list(free_nce_list); 981 } 982 } 983 984 /* 985 * Walk everything. 986 * Note that ill can be NULL hence can't derive the ipst from it. 987 */ 988 void 989 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) 990 { 991 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); 992 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); 993 } 994 995 /* 996 * For each interface an entry is added for the unspecified multicast group. 997 * Here that mapping is used to form the multicast cache entry for a particular 998 * multicast destination. 
999 */ 1000 static int 1001 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, 1002 uint16_t flags, nce_t **newnce) 1003 { 1004 uchar_t *hw_addr; 1005 int err = 0; 1006 ip_stack_t *ipst = ill->ill_ipst; 1007 nce_t *nce; 1008 1009 ASSERT(ill != NULL); 1010 ASSERT(ill->ill_isv6); 1011 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); 1012 1013 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1014 nce = nce_lookup_addr(ill, dst); 1015 if (nce != NULL) { 1016 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1017 goto done; 1018 } 1019 if (ill->ill_net_type == IRE_IF_RESOLVER) { 1020 /* 1021 * For IRE_IF_RESOLVER a hardware mapping can be 1022 * generated. 1023 */ 1024 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); 1025 if (hw_addr == NULL) { 1026 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1027 return (ENOMEM); 1028 } 1029 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 1030 } else { 1031 /* 1032 * So no hw_addr is needed for IRE_IF_NORESOLVER. 1033 */ 1034 hw_addr = NULL; 1035 } 1036 ASSERT((flags & NCE_F_MCAST) != 0); 1037 ASSERT((flags & NCE_F_NONUD) != 0); 1038 /* nce_state will be computed by nce_add_common() */ 1039 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 1040 ND_UNCHANGED, &nce); 1041 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1042 if (err == 0) 1043 err = nce_add_v6_postprocess(nce); 1044 if (hw_addr != NULL) 1045 kmem_free(hw_addr, ill->ill_nd_lla_len); 1046 if (err != 0) { 1047 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); 1048 return (err); 1049 } 1050 done: 1051 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); 1052 if (newnce != NULL) 1053 *newnce = nce; 1054 else 1055 nce_refrele(nce); 1056 return (0); 1057 } 1058 1059 /* 1060 * Return the link layer address, and any flags of a ncec. 
1061 */ 1062 int 1063 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1064 { 1065 ncec_t *ncec; 1066 in6_addr_t *addr; 1067 sin6_t *sin6; 1068 1069 ASSERT(ill != NULL && ill->ill_isv6); 1070 sin6 = (sin6_t *)&lnr->lnr_addr; 1071 addr = &sin6->sin6_addr; 1072 1073 /* 1074 * NOTE: if the ill is an IPMP interface, then match against the whole 1075 * illgrp. This e.g. allows in.ndpd to retrieve the link layer 1076 * addresses for the data addresses on an IPMP interface even though 1077 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. 1078 */ 1079 ncec = ncec_lookup_illgrp_v6(ill, addr); 1080 if (ncec == NULL) 1081 return (ESRCH); 1082 /* If no link layer address is available yet, return ESRCH */ 1083 if (!NCE_ISREACHABLE(ncec)) { 1084 ncec_refrele(ncec); 1085 return (ESRCH); 1086 } 1087 lnr->lnr_hdw_len = ill->ill_phys_addr_length; 1088 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, 1089 lnr->lnr_hdw_len); 1090 if (ncec->ncec_flags & NCE_F_ISROUTER) 1091 lnr->lnr_flags = NDF_ISROUTER_ON; 1092 if (ncec->ncec_flags & NCE_F_ANYCAST) 1093 lnr->lnr_flags |= NDF_ANYCAST_ON; 1094 ncec_refrele(ncec); 1095 return (0); 1096 } 1097 1098 /* 1099 * Finish setting up the Enable/Disable multicast for the driver. 
1100 */ 1101 mblk_t * 1102 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1103 uint32_t hw_addr_offset, mblk_t *mp) 1104 { 1105 uchar_t *hw_addr; 1106 ipaddr_t v4group; 1107 uchar_t *addr; 1108 1109 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1110 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1111 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1112 1113 ASSERT(CLASSD(v4group)); 1114 ASSERT(!(ill->ill_isv6)); 1115 1116 addr = (uchar_t *)&v4group; 1117 } else { 1118 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1119 ASSERT(ill->ill_isv6); 1120 1121 addr = (uchar_t *)v6group; 1122 } 1123 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1124 if (hw_addr == NULL) { 1125 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1126 freemsg(mp); 1127 return (NULL); 1128 } 1129 1130 ip_mcast_mapping(ill, addr, hw_addr); 1131 return (mp); 1132 } 1133 1134 void 1135 ip_ndp_resolve(ncec_t *ncec) 1136 { 1137 in_addr_t sender4 = INADDR_ANY; 1138 in6_addr_t sender6 = ipv6_all_zeros; 1139 ill_t *src_ill; 1140 uint32_t ms; 1141 1142 src_ill = nce_resolve_src(ncec, &sender6); 1143 if (src_ill == NULL) { 1144 /* Make sure we try again later */ 1145 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1146 nce_restart_timer(ncec, (clock_t)ms); 1147 return; 1148 } 1149 if (ncec->ncec_ipversion == IPV4_VERSION) 1150 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1151 mutex_enter(&ncec->ncec_lock); 1152 if (ncec->ncec_ipversion == IPV6_VERSION) 1153 ms = ndp_solicit(ncec, sender6, src_ill); 1154 else 1155 ms = arp_request(ncec, sender4, src_ill); 1156 mutex_exit(&ncec->ncec_lock); 1157 if (ms == 0) { 1158 if (ncec->ncec_state != ND_REACHABLE) { 1159 if (ncec->ncec_ipversion == IPV6_VERSION) 1160 ndp_resolv_failed(ncec); 1161 else 1162 arp_resolv_failed(ncec); 1163 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1164 nce_make_unreachable(ncec); 1165 ncec_delete(ncec); 1166 } 1167 } else { 1168 nce_restart_timer(ncec, (clock_t)ms); 1169 } 1170 done: 1171 ill_refrele(src_ill); 1172 } 1173 
1174 /* 1175 * Send an IPv6 neighbor solicitation. 1176 * Returns number of milliseconds after which we should either rexmit or abort. 1177 * Return of zero means we should abort. 1178 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt. 1179 * The optional source address is used as a hint to ndp_solicit for 1180 * which source to use in the packet. 1181 * 1182 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending 1183 * the packet. 1184 */ 1185 uint32_t 1186 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill) 1187 { 1188 in6_addr_t dst; 1189 boolean_t dropped = B_FALSE; 1190 1191 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 1192 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 1193 1194 if (ncec->ncec_rcnt == 0) 1195 return (0); 1196 1197 dst = ncec->ncec_addr; 1198 ncec->ncec_rcnt--; 1199 mutex_exit(&ncec->ncec_lock); 1200 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr, 1201 ill->ill_phys_addr_length, &src, &dst, 0); 1202 mutex_enter(&ncec->ncec_lock); 1203 if (dropped) 1204 ncec->ncec_rcnt++; 1205 return (ncec->ncec_ill->ill_reachable_retrans_time); 1206 } 1207 1208 /* 1209 * Attempt to recover an address on an interface that's been marked as a 1210 * duplicate. Because NCEs are destroyed when the interface goes down, there's 1211 * no easy way to just probe the address and have the right thing happen if 1212 * it's no longer in use. Instead, we just bring it up normally and allow the 1213 * regular interface start-up logic to probe for a remaining duplicate and take 1214 * us back down if necessary. 1215 * Neither DHCP nor temporary addresses arrive here; they're excluded by 1216 * ip_ndp_excl. 
1217 */ 1218 /* ARGSUSED */ 1219 void 1220 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1221 { 1222 ill_t *ill = rq->q_ptr; 1223 ipif_t *ipif; 1224 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr; 1225 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr; 1226 boolean_t addr_equal; 1227 1228 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1229 /* 1230 * We do not support recovery of proxy ARP'd interfaces, 1231 * because the system lacks a complete proxy ARP mechanism. 1232 */ 1233 if (ill->ill_isv6) { 1234 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 1235 addr6); 1236 } else { 1237 addr_equal = (ipif->ipif_lcl_addr == *addr4); 1238 } 1239 1240 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal) 1241 continue; 1242 1243 /* 1244 * If we have already recovered or if the interface is going 1245 * away, then ignore. 1246 */ 1247 mutex_enter(&ill->ill_lock); 1248 if (!(ipif->ipif_flags & IPIF_DUPLICATE) || 1249 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1250 mutex_exit(&ill->ill_lock); 1251 continue; 1252 } 1253 1254 ipif->ipif_flags &= ~IPIF_DUPLICATE; 1255 ill->ill_ipif_dup_count--; 1256 mutex_exit(&ill->ill_lock); 1257 ipif->ipif_was_dup = B_TRUE; 1258 1259 if (ill->ill_isv6) { 1260 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); 1261 (void) ipif_up_done_v6(ipif); 1262 } else { 1263 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) != 1264 EINPROGRESS); 1265 (void) ipif_up_done(ipif); 1266 } 1267 } 1268 freeb(mp); 1269 } 1270 1271 /* 1272 * 1273 * Attempt to recover an IPv6 interface that's been shut down as a duplicate. 1274 * As long as someone else holds the address, the interface will stay down. 1275 * When that conflict goes away, the interface is brought back up. This is 1276 * done so that accidental shutdowns of addresses aren't made permanent. Your 1277 * server will recover from a failure. 1278 * 1279 * For DHCP and temporary addresses, recovery is not done in the kernel. 
1280 * Instead, it's handled by user space processes (dhcpagent and in.ndpd). 1281 * 1282 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 1283 */ 1284 void 1285 ipif_dup_recovery(void *arg) 1286 { 1287 ipif_t *ipif = arg; 1288 1289 ipif->ipif_recovery_id = 0; 1290 if (!(ipif->ipif_flags & IPIF_DUPLICATE)) 1291 return; 1292 1293 /* 1294 * No lock, because this is just an optimization. 1295 */ 1296 if (ipif->ipif_state_flags & IPIF_CONDEMNED) 1297 return; 1298 1299 /* If the link is down, we'll retry this later */ 1300 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 1301 return; 1302 1303 ipif_do_recovery(ipif); 1304 } 1305 1306 /* 1307 * Perform interface recovery by forcing the duplicate interfaces up and 1308 * allowing the system to determine which ones should stay up. 1309 * 1310 * Called both by recovery timer expiry and link-up notification. 1311 */ 1312 void 1313 ipif_do_recovery(ipif_t *ipif) 1314 { 1315 ill_t *ill = ipif->ipif_ill; 1316 mblk_t *mp; 1317 ip_stack_t *ipst = ill->ill_ipst; 1318 size_t mp_size; 1319 1320 if (ipif->ipif_isv6) 1321 mp_size = sizeof (ipif->ipif_v6lcl_addr); 1322 else 1323 mp_size = sizeof (ipif->ipif_lcl_addr); 1324 mp = allocb(mp_size, BPRI_MED); 1325 if (mp == NULL) { 1326 mutex_enter(&ill->ill_lock); 1327 if (ipst->ips_ip_dup_recovery > 0 && 1328 ipif->ipif_recovery_id == 0 && 1329 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1330 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1331 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1332 } 1333 mutex_exit(&ill->ill_lock); 1334 } else { 1335 /* 1336 * A recovery timer may still be running if we got here from 1337 * ill_restart_dad(); cancel that timer. 
1338 */ 1339 if (ipif->ipif_recovery_id != 0) 1340 (void) untimeout(ipif->ipif_recovery_id); 1341 ipif->ipif_recovery_id = 0; 1342 1343 if (ipif->ipif_isv6) { 1344 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, 1345 sizeof (ipif->ipif_v6lcl_addr)); 1346 } else { 1347 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr, 1348 sizeof (ipif->ipif_lcl_addr)); 1349 } 1350 ill_refhold(ill); 1351 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP, 1352 B_FALSE); 1353 } 1354 } 1355 1356 /* 1357 * Find the MAC and IP addresses in an NA/NS message. 1358 */ 1359 static void 1360 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill, 1361 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp) 1362 { 1363 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1364 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 1365 uchar_t *addr; 1366 int alen; 1367 1368 /* icmp_inbound_v6 ensures this */ 1369 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1370 1371 addr = ira->ira_l2src; 1372 alen = ill->ill_phys_addr_length; 1373 if (alen > 0) { 1374 *haddr = addr; 1375 *haddrlenp = alen; 1376 } else { 1377 *haddr = NULL; 1378 *haddrlenp = 0; 1379 } 1380 1381 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ 1382 *targp = ns->nd_ns_target; 1383 } 1384 1385 /* 1386 * This is for exclusive changes due to NDP duplicate address detection 1387 * failure. 
1388 */ 1389 /* ARGSUSED */ 1390 static void 1391 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1392 { 1393 ill_t *ill = rq->q_ptr; 1394 ipif_t *ipif; 1395 uchar_t *haddr; 1396 uint_t haddrlen; 1397 ip_stack_t *ipst = ill->ill_ipst; 1398 in6_addr_t targ; 1399 ip_recv_attr_t iras; 1400 mblk_t *attrmp; 1401 1402 attrmp = mp; 1403 mp = mp->b_cont; 1404 attrmp->b_cont = NULL; 1405 if (!ip_recv_attr_from_mblk(attrmp, &iras)) { 1406 /* The ill or ip_stack_t disappeared on us */ 1407 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1408 ip_drop_input("ip_recv_attr_from_mblk", mp, ill); 1409 freemsg(mp); 1410 ira_cleanup(&iras, B_TRUE); 1411 return; 1412 } 1413 1414 ASSERT(ill == iras.ira_rill); 1415 1416 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); 1417 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { 1418 /* 1419 * Ignore conflicts generated by misbehaving switches that 1420 * just reflect our own messages back to us. For IPMP, we may 1421 * see reflections across any ill in the illgrp. 1422 * 1423 * RFC2462 and revisions tried to detect both the case 1424 * when a statically configured IPv6 address is a duplicate, 1425 * and the case when the L2 address itself is a duplicate. The 1426 * later is important because, with stateles address autoconf, 1427 * if the L2 address is a duplicate, the resulting IPv6 1428 * address(es) would also be duplicates. We rely on DAD of the 1429 * IPv6 address itself to detect the latter case. 1430 */ 1431 /* For an under ill_grp can change under lock */ 1432 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1433 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || 1434 IS_UNDER_IPMP(ill) && 1435 ipmp_illgrp_find_ill(ill->ill_grp, haddr, 1436 haddrlen) != NULL) { 1437 rw_exit(&ipst->ips_ill_g_lock); 1438 goto ignore_conflict; 1439 } 1440 rw_exit(&ipst->ips_ill_g_lock); 1441 } 1442 1443 /* 1444 * Look up the appropriate ipif. 
1445 */ 1446 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); 1447 if (ipif == NULL) 1448 goto ignore_conflict; 1449 1450 /* Reload the ill to match the ipif */ 1451 ill = ipif->ipif_ill; 1452 1453 /* If it's already duplicate or ineligible, then don't do anything. */ 1454 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { 1455 ipif_refrele(ipif); 1456 goto ignore_conflict; 1457 } 1458 1459 /* 1460 * If this is a failure during duplicate recovery, then don't 1461 * complain. It may take a long time to recover. 1462 */ 1463 if (!ipif->ipif_was_dup) { 1464 char ibuf[LIFNAMSIZ]; 1465 char hbuf[MAC_STR_LEN]; 1466 char sbuf[INET6_ADDRSTRLEN]; 1467 1468 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 1469 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" 1470 " disabled", ibuf, 1471 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), 1472 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); 1473 } 1474 mutex_enter(&ill->ill_lock); 1475 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1476 ipif->ipif_flags |= IPIF_DUPLICATE; 1477 ill->ill_ipif_dup_count++; 1478 mutex_exit(&ill->ill_lock); 1479 (void) ipif_down(ipif, NULL, NULL); 1480 (void) ipif_down_tail(ipif); 1481 mutex_enter(&ill->ill_lock); 1482 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1483 ill->ill_net_type == IRE_IF_RESOLVER && 1484 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 1485 ipst->ips_ip_dup_recovery > 0) { 1486 ASSERT(ipif->ipif_recovery_id == 0); 1487 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1488 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1489 } 1490 mutex_exit(&ill->ill_lock); 1491 ipif_refrele(ipif); 1492 1493 ignore_conflict: 1494 freemsg(mp); 1495 ira_cleanup(&iras, B_TRUE); 1496 } 1497 1498 /* 1499 * Handle failure by tearing down the ipifs with the specified address. Note 1500 * that tearing down the ipif also means deleting the ncec through ipif_down, so 1501 * it's not possible to do recovery by just restarting the ncec timer. 
Instead, 1502 * we start a timer on the ipif. 1503 * Caller has to free mp; 1504 */ 1505 static void 1506 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) 1507 { 1508 const uchar_t *haddr; 1509 ill_t *ill = ira->ira_rill; 1510 1511 /* 1512 * Ignore conflicts generated by misbehaving switches that just 1513 * reflect our own messages back to us. 1514 */ 1515 1516 /* icmp_inbound_v6 ensures this */ 1517 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1518 haddr = ira->ira_l2src; 1519 if (haddr != NULL && 1520 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { 1521 return; 1522 } 1523 1524 if ((mp = copymsg(mp)) != NULL) { 1525 mblk_t *attrmp; 1526 1527 attrmp = ip_recv_attr_to_mblk(ira); 1528 if (attrmp == NULL) { 1529 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1530 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1531 freemsg(mp); 1532 } else { 1533 ASSERT(attrmp->b_cont == NULL); 1534 attrmp->b_cont = mp; 1535 mp = attrmp; 1536 ill_refhold(ill); 1537 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, 1538 B_FALSE); 1539 } 1540 } 1541 } 1542 1543 /* 1544 * Handle a discovered conflict: some other system is advertising that it owns 1545 * one of our IP addresses. We need to defend ourselves, or just shut down the 1546 * interface. 1547 * 1548 * Handles both IPv4 and IPv6 1549 */ 1550 boolean_t 1551 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) 1552 { 1553 ipif_t *ipif; 1554 clock_t now; 1555 uint_t maxdefense; 1556 uint_t defs; 1557 ill_t *ill = ira->ira_ill; 1558 ip_stack_t *ipst = ill->ill_ipst; 1559 uint32_t elapsed; 1560 boolean_t isv6 = ill->ill_isv6; 1561 ipaddr_t ncec_addr; 1562 1563 if (isv6) { 1564 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, 1565 ipst); 1566 } else { 1567 if (arp_no_defense) { 1568 /* 1569 * Yes, there is a conflict, but no, we do not 1570 * defend ourself. 
1571 */ 1572 return (B_TRUE); 1573 } 1574 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 1575 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, 1576 ipst); 1577 } 1578 if (ipif == NULL) 1579 return (B_FALSE); 1580 1581 /* 1582 * First, figure out if this address is disposable. 1583 */ 1584 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) 1585 maxdefense = ipst->ips_ip_max_temp_defend; 1586 else 1587 maxdefense = ipst->ips_ip_max_defend; 1588 1589 /* 1590 * Now figure out how many times we've defended ourselves. Ignore 1591 * defenses that happened long in the past. 1592 */ 1593 now = ddi_get_lbolt(); 1594 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; 1595 mutex_enter(&ncec->ncec_lock); 1596 if ((defs = ncec->ncec_defense_count) > 0 && 1597 elapsed > ipst->ips_ip_defend_interval) { 1598 /* 1599 * ip_defend_interval has elapsed. 1600 * reset the defense count. 1601 */ 1602 ncec->ncec_defense_count = defs = 0; 1603 } 1604 ncec->ncec_defense_count++; 1605 ncec->ncec_last_time_defended = now; 1606 mutex_exit(&ncec->ncec_lock); 1607 ipif_refrele(ipif); 1608 1609 /* 1610 * If we've defended ourselves too many times already, then give up and 1611 * tear down the interface(s) using this address. 1612 * Otherwise, caller has to defend by sending out an announce. 1613 */ 1614 if (defs >= maxdefense) { 1615 if (isv6) 1616 ndp_failure(mp, ira); 1617 else 1618 arp_failure(mp, ira); 1619 } else { 1620 return (B_TRUE); /* caller must defend this address */ 1621 } 1622 return (B_FALSE); 1623 } 1624 1625 /* 1626 * Handle reception of Neighbor Solicitation messages. 
1627 */ 1628 static void 1629 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) 1630 { 1631 ill_t *ill = ira->ira_ill, *under_ill; 1632 nd_neighbor_solicit_t *ns; 1633 uint32_t hlen = ill->ill_phys_addr_length; 1634 uchar_t *haddr = NULL; 1635 icmp6_t *icmp_nd; 1636 ip6_t *ip6h; 1637 ncec_t *our_ncec = NULL; 1638 in6_addr_t target; 1639 in6_addr_t src; 1640 int len; 1641 int flag = 0; 1642 nd_opt_hdr_t *opt = NULL; 1643 boolean_t bad_solicit = B_FALSE; 1644 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1645 boolean_t need_ill_refrele = B_FALSE; 1646 1647 ip6h = (ip6_t *)mp->b_rptr; 1648 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1649 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1650 src = ip6h->ip6_src; 1651 ns = (nd_neighbor_solicit_t *)icmp_nd; 1652 target = ns->nd_ns_target; 1653 if (IN6_IS_ADDR_MULTICAST(&target)) { 1654 if (ip_debug > 2) { 1655 /* ip1dbg */ 1656 pr_addr_dbg("ndp_input_solicit: Target is" 1657 " multicast! %s\n", AF_INET6, &target); 1658 } 1659 bad_solicit = B_TRUE; 1660 goto done; 1661 } 1662 if (len > sizeof (nd_neighbor_solicit_t)) { 1663 /* Options present */ 1664 opt = (nd_opt_hdr_t *)&ns[1]; 1665 len -= sizeof (nd_neighbor_solicit_t); 1666 if (!ndp_verify_optlen(opt, len)) { 1667 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1668 bad_solicit = B_TRUE; 1669 goto done; 1670 } 1671 } 1672 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1673 /* Check to see if this is a valid DAD solicitation */ 1674 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1675 if (ip_debug > 2) { 1676 /* ip1dbg */ 1677 pr_addr_dbg("ndp_input_solicit: IPv6 " 1678 "Destination is not solicited node " 1679 "multicast %s\n", AF_INET6, 1680 &ip6h->ip6_dst); 1681 } 1682 bad_solicit = B_TRUE; 1683 goto done; 1684 } 1685 } 1686 1687 /* 1688 * NOTE: with IPMP, it's possible the nominated multicast ill (which 1689 * received this packet if it's multicast) is not the ill tied to 1690 * e.g. the IPMP ill's data link-local. 
So we match across the illgrp 1691 * to ensure we find the associated NCE. 1692 */ 1693 our_ncec = ncec_lookup_illgrp_v6(ill, &target); 1694 /* 1695 * If this is a valid Solicitation for an address we are publishing, 1696 * then a PUBLISH entry should exist in the cache 1697 */ 1698 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { 1699 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1700 "ifname=%s ", ill->ill_name)); 1701 if (ip_debug > 2) { 1702 /* ip1dbg */ 1703 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1704 } 1705 if (our_ncec == NULL) 1706 bad_solicit = B_TRUE; 1707 goto done; 1708 } 1709 1710 /* At this point we should have a verified NS per spec */ 1711 if (opt != NULL) { 1712 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1713 if (opt != NULL) { 1714 haddr = (uchar_t *)&opt[1]; 1715 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1716 hlen == 0) { 1717 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1718 bad_solicit = B_TRUE; 1719 goto done; 1720 } 1721 } 1722 } 1723 1724 /* If sending directly to peer, set the unicast flag */ 1725 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1726 flag |= NDP_UNICAST; 1727 1728 /* 1729 * Create/update the entry for the soliciting node on the ipmp_ill. 1730 * or respond to outstanding queries, don't if 1731 * the source is unspecified address. 1732 */ 1733 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1734 int err; 1735 nce_t *nnce; 1736 1737 ASSERT(ill->ill_isv6); 1738 /* 1739 * Regular solicitations *must* include the Source Link-Layer 1740 * Address option. Ignore messages that do not. 1741 */ 1742 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1743 ip1dbg(("ndp_input_solicit: source link-layer address " 1744 "option missing with a specified source.\n")); 1745 bad_solicit = B_TRUE; 1746 goto done; 1747 } 1748 1749 /* 1750 * This is a regular solicitation. If we're still in the 1751 * process of verifying the address, then don't respond at all 1752 * and don't keep track of the sender. 
1753 */ 1754 if (our_ncec->ncec_state == ND_PROBE) 1755 goto done; 1756 1757 /* 1758 * If the solicitation doesn't have sender hardware address 1759 * (legal for unicast solicitation), then process without 1760 * installing the return NCE. Either we already know it, or 1761 * we'll be forced to look it up when (and if) we reply to the 1762 * packet. 1763 */ 1764 if (haddr == NULL) 1765 goto no_source; 1766 1767 under_ill = ill; 1768 if (IS_UNDER_IPMP(under_ill)) { 1769 ill = ipmp_ill_hold_ipmp_ill(under_ill); 1770 if (ill == NULL) 1771 ill = under_ill; 1772 else 1773 need_ill_refrele = B_TRUE; 1774 } 1775 err = nce_lookup_then_add_v6(ill, 1776 haddr, hlen, 1777 &src, /* Soliciting nodes address */ 1778 0, 1779 ND_STALE, 1780 &nnce); 1781 1782 if (need_ill_refrele) { 1783 ill_refrele(ill); 1784 ill = under_ill; 1785 need_ill_refrele = B_FALSE; 1786 } 1787 switch (err) { 1788 case 0: 1789 /* done with this entry */ 1790 nce_refrele(nnce); 1791 break; 1792 case EEXIST: 1793 /* 1794 * B_FALSE indicates this is not an an advertisement. 1795 */ 1796 nce_process(nnce->nce_common, haddr, 0, B_FALSE); 1797 nce_refrele(nnce); 1798 break; 1799 default: 1800 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 1801 err)); 1802 goto done; 1803 } 1804 no_source: 1805 flag |= NDP_SOLICITED; 1806 } else { 1807 /* 1808 * No source link layer address option should be present in a 1809 * valid DAD request. 1810 */ 1811 if (haddr != NULL) { 1812 ip1dbg(("ndp_input_solicit: source link-layer address " 1813 "option present with an unspecified source.\n")); 1814 bad_solicit = B_TRUE; 1815 goto done; 1816 } 1817 if (our_ncec->ncec_state == ND_PROBE) { 1818 /* 1819 * Internally looped-back probes will have 1820 * IRAF_L2SRC_LOOPBACK set so we can ignore our own 1821 * transmissions. 1822 */ 1823 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { 1824 /* 1825 * If someone else is probing our address, then 1826 * we've crossed wires. Declare failure. 
1827 */ 1828 ndp_failure(mp, ira); 1829 } 1830 goto done; 1831 } 1832 /* 1833 * This is a DAD probe. Multicast the advertisement to the 1834 * all-nodes address. 1835 */ 1836 src = ipv6_all_hosts_mcast; 1837 } 1838 flag |= nce_advert_flags(our_ncec); 1839 (void) ndp_xmit(ill, 1840 ND_NEIGHBOR_ADVERT, 1841 our_ncec->ncec_lladdr, 1842 our_ncec->ncec_lladdr_length, 1843 &target, /* Source and target of the advertisement pkt */ 1844 &src, /* IP Destination (source of original pkt) */ 1845 flag); 1846 done: 1847 if (bad_solicit) 1848 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); 1849 if (our_ncec != NULL) 1850 ncec_refrele(our_ncec); 1851 } 1852 1853 /* 1854 * Handle reception of Neighbor Solicitation messages 1855 */ 1856 void 1857 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) 1858 { 1859 ill_t *ill = ira->ira_ill; 1860 nd_neighbor_advert_t *na; 1861 uint32_t hlen = ill->ill_phys_addr_length; 1862 uchar_t *haddr = NULL; 1863 icmp6_t *icmp_nd; 1864 ip6_t *ip6h; 1865 ncec_t *dst_ncec = NULL; 1866 in6_addr_t target; 1867 nd_opt_hdr_t *opt = NULL; 1868 int len; 1869 ip_stack_t *ipst = ill->ill_ipst; 1870 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1871 1872 ip6h = (ip6_t *)mp->b_rptr; 1873 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1874 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1875 na = (nd_neighbor_advert_t *)icmp_nd; 1876 1877 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && 1878 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { 1879 ip1dbg(("ndp_input_advert: Target is multicast but the " 1880 "solicited flag is not zero\n")); 1881 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1882 return; 1883 } 1884 target = na->nd_na_target; 1885 if (IN6_IS_ADDR_MULTICAST(&target)) { 1886 ip1dbg(("ndp_input_advert: Target is multicast!\n")); 1887 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1888 return; 1889 } 1890 if (len > sizeof (nd_neighbor_advert_t)) { 1891 opt = (nd_opt_hdr_t *)&na[1]; 1892 if (!ndp_verify_optlen(opt, 1893 len - 
sizeof (nd_neighbor_advert_t))) { 1894 ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); 1895 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1896 return; 1897 } 1898 /* At this point we have a verified NA per spec */ 1899 len -= sizeof (nd_neighbor_advert_t); 1900 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); 1901 if (opt != NULL) { 1902 haddr = (uchar_t *)&opt[1]; 1903 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1904 hlen == 0) { 1905 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1906 BUMP_MIB(mib, 1907 ipv6IfIcmpInBadNeighborAdvertisements); 1908 return; 1909 } 1910 } 1911 } 1912 1913 /* 1914 * NOTE: we match across the illgrp since we need to do DAD for all of 1915 * our local addresses, and those are spread across all the active 1916 * ills in the group. 1917 */ 1918 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) 1919 return; 1920 1921 if (NCE_PUBLISH(dst_ncec)) { 1922 /* 1923 * Someone just advertised an addresses that we publish. First, 1924 * check it it was us -- if so, we can safely ignore it. 1925 * We don't get the haddr from the ira_l2src because, in the 1926 * case that the packet originated from us, on an IPMP group, 1927 * the ira_l2src may would be the link-layer address of the 1928 * cast_ill used to send the packet, which may not be the same 1929 * as the dst_ncec->ncec_lladdr of the address. 1930 */ 1931 if (haddr != NULL) { 1932 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) 1933 goto out; 1934 1935 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) 1936 goto out; /* from us -- no conflict */ 1937 1938 /* 1939 * If we're in an IPMP group, check if this is an echo 1940 * from another ill in the group. Use the double- 1941 * checked locking pattern to avoid grabbing 1942 * ill_g_lock in the non-IPMP case. 
1943 */ 1944 if (IS_UNDER_IPMP(ill)) { 1945 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1946 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( 1947 ill->ill_grp, haddr, hlen) != NULL) { 1948 rw_exit(&ipst->ips_ill_g_lock); 1949 goto out; 1950 } 1951 rw_exit(&ipst->ips_ill_g_lock); 1952 } 1953 } 1954 1955 /* 1956 * This appears to be a real conflict. If we're trying to 1957 * configure this NCE (ND_PROBE), then shut it down. 1958 * Otherwise, handle the discovered conflict. 1959 */ 1960 if (dst_ncec->ncec_state == ND_PROBE) { 1961 ndp_failure(mp, ira); 1962 } else { 1963 if (ip_nce_conflict(mp, ira, dst_ncec)) { 1964 char hbuf[MAC_STR_LEN]; 1965 char sbuf[INET6_ADDRSTRLEN]; 1966 1967 cmn_err(CE_WARN, 1968 "node '%s' is using %s on %s", 1969 inet_ntop(AF_INET6, &target, sbuf, 1970 sizeof (sbuf)), 1971 haddr == NULL ? "<none>" : 1972 mac_colon_addr(haddr, hlen, hbuf, 1973 sizeof (hbuf)), ill->ill_name); 1974 /* 1975 * RFC 4862, Section 5.4.4 does not mandate 1976 * any specific behavior when an NA matches 1977 * a non-tentative address assigned to the 1978 * receiver. We make the choice of defending 1979 * our address, based on the assumption that 1980 * the sender has not detected the Duplicate. 1981 * 1982 * ncec_last_time_defended has been adjusted 1983 * in ip_nce_conflict() 1984 */ 1985 (void) ndp_announce(dst_ncec); 1986 } 1987 } 1988 } else { 1989 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) 1990 dst_ncec->ncec_flags |= NCE_F_ISROUTER; 1991 1992 /* B_TRUE indicates this an advertisement */ 1993 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); 1994 } 1995 out: 1996 ncec_refrele(dst_ncec); 1997 } 1998 1999 /* 2000 * Process NDP neighbor solicitation/advertisement messages. 2001 * The checksum has already checked o.k before reaching here. 2002 * Information about the datalink header is contained in ira_l2src, but 2003 * that should be ignored for loopback packets. 
2004 */ 2005 void 2006 ndp_input(mblk_t *mp, ip_recv_attr_t *ira) 2007 { 2008 ill_t *ill = ira->ira_rill; 2009 icmp6_t *icmp_nd; 2010 ip6_t *ip6h; 2011 int len; 2012 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2013 ill_t *orig_ill = NULL; 2014 2015 /* 2016 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill 2017 * and make it be the IPMP upper so avoid being confused by a packet 2018 * addressed to a unicast address on a different ill. 2019 */ 2020 if (IS_UNDER_IPMP(ill)) { 2021 orig_ill = ill; 2022 ill = ipmp_ill_hold_ipmp_ill(orig_ill); 2023 if (ill == NULL) { 2024 ill = orig_ill; 2025 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2026 ip_drop_input("ipIfStatsInDiscards - IPMP ill", 2027 mp, ill); 2028 freemsg(mp); 2029 return; 2030 } 2031 ASSERT(ill != orig_ill); 2032 orig_ill = ira->ira_ill; 2033 ira->ira_ill = ill; 2034 mib = ill->ill_icmp6_mib; 2035 } 2036 if (!pullupmsg(mp, -1)) { 2037 ip1dbg(("ndp_input: pullupmsg failed\n")); 2038 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2039 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill); 2040 goto done; 2041 } 2042 ip6h = (ip6_t *)mp->b_rptr; 2043 if (ip6h->ip6_hops != IPV6_MAX_HOPS) { 2044 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); 2045 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill); 2046 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); 2047 goto done; 2048 } 2049 /* 2050 * NDP does not accept any extension headers between the 2051 * IP header and the ICMP header since e.g. a routing 2052 * header could be dangerous. 2053 * This assumes that any AH or ESP headers are removed 2054 * by ip prior to passing the packet to ndp_input. 
2055 */ 2056 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { 2057 ip1dbg(("ndp_input: Wrong next header 0x%x\n", 2058 ip6h->ip6_nxt)); 2059 ip_drop_input("Wrong next header", mp, ill); 2060 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2061 goto done; 2062 } 2063 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2064 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT || 2065 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); 2066 if (icmp_nd->icmp6_code != 0) { 2067 ip1dbg(("ndp_input: icmp6 code != 0 \n")); 2068 ip_drop_input("code non-zero", mp, ill); 2069 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2070 goto done; 2071 } 2072 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2073 /* 2074 * Make sure packet length is large enough for either 2075 * a NS or a NA icmp packet. 2076 */ 2077 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { 2078 ip1dbg(("ndp_input: packet too short\n")); 2079 ip_drop_input("packet too short", mp, ill); 2080 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2081 goto done; 2082 } 2083 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { 2084 ndp_input_solicit(mp, ira); 2085 } else { 2086 ndp_input_advert(mp, ira); 2087 } 2088 done: 2089 freemsg(mp); 2090 if (orig_ill != NULL) { 2091 ill_refrele(ill); 2092 ira->ira_ill = orig_ill; 2093 } 2094 } 2095 2096 /* 2097 * ndp_xmit is called to form and transmit a ND solicitation or 2098 * advertisement ICMP packet. 2099 * 2100 * If the source address is unspecified and this isn't a probe (used for 2101 * duplicate address detection), an appropriate source address and link layer 2102 * address will be chosen here. The link layer address option is included if 2103 * the source is specified (i.e., all non-probe packets), and omitted (per the 2104 * specification) otherwise. 2105 * 2106 * It returns B_FALSE only if it does a successful put() to the 2107 * corresponding ill's ill_wq otherwise returns B_TRUE. 
2108 */ 2109 static boolean_t 2110 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, 2111 const in6_addr_t *sender, const in6_addr_t *target, int flag) 2112 { 2113 uint32_t len; 2114 icmp6_t *icmp6; 2115 mblk_t *mp; 2116 ip6_t *ip6h; 2117 nd_opt_hdr_t *opt; 2118 uint_t plen; 2119 zoneid_t zoneid = GLOBAL_ZONEID; 2120 ill_t *hwaddr_ill = ill; 2121 ip_xmit_attr_t ixas; 2122 ip_stack_t *ipst = ill->ill_ipst; 2123 boolean_t need_refrele = B_FALSE; 2124 boolean_t probe = B_FALSE; 2125 2126 if (IS_UNDER_IPMP(ill)) { 2127 probe = ipif_lookup_testaddr_v6(ill, sender, NULL); 2128 /* 2129 * We send non-probe packets on the upper IPMP interface. 2130 * ip_output_simple() will use cast_ill for sending any 2131 * multicast packets. Note that we can't follow the same 2132 * logic for probe packets because all interfaces in the ipmp 2133 * group may have failed, so that we really want to only try 2134 * to send the ND packet on the ill corresponding to the src 2135 * address. 2136 */ 2137 if (!probe) { 2138 ill = ipmp_ill_hold_ipmp_ill(ill); 2139 if (ill != NULL) 2140 need_refrele = B_TRUE; 2141 else 2142 ill = hwaddr_ill; 2143 } 2144 } 2145 2146 /* 2147 * If we have a unspecified source(sender) address, select a 2148 * proper source address for the solicitation here itself so 2149 * that we can initialize the h/w address correctly. 2150 * 2151 * If the sender is specified then we use this address in order 2152 * to lookup the zoneid before calling ip_output_v6(). This is to 2153 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly 2154 * by IP (we cannot guarantee that the global zone has an interface 2155 * route to the destination). 2156 * 2157 * Note that the NA never comes here with the unspecified source 2158 * address. 2159 */ 2160 2161 /* 2162 * Probes will have unspec src at this point. 
2163 */ 2164 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { 2165 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); 2166 /* 2167 * It's possible for ipif_lookup_addr_zoneid_v6() to return 2168 * ALL_ZONES if it cannot find a matching ipif for the address 2169 * we are trying to use. In this case we err on the side of 2170 * trying to send the packet by defaulting to the GLOBAL_ZONEID. 2171 */ 2172 if (zoneid == ALL_ZONES) 2173 zoneid = GLOBAL_ZONEID; 2174 } 2175 2176 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; 2177 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; 2178 mp = allocb(len, BPRI_LO); 2179 if (mp == NULL) { 2180 if (need_refrele) 2181 ill_refrele(ill); 2182 return (B_TRUE); 2183 } 2184 2185 bzero((char *)mp->b_rptr, len); 2186 mp->b_wptr = mp->b_rptr + len; 2187 2188 bzero(&ixas, sizeof (ixas)); 2189 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM; 2190 2191 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 2192 ixas.ixa_ipst = ipst; 2193 ixas.ixa_cred = kcred; 2194 ixas.ixa_cpid = NOPID; 2195 ixas.ixa_tsl = NULL; 2196 ixas.ixa_zoneid = zoneid; 2197 2198 ip6h = (ip6_t *)mp->b_rptr; 2199 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2200 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2201 ip6h->ip6_nxt = IPPROTO_ICMPV6; 2202 ip6h->ip6_hops = IPV6_MAX_HOPS; 2203 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 2204 ip6h->ip6_dst = *target; 2205 icmp6 = (icmp6_t *)&ip6h[1]; 2206 2207 if (hw_addr_len != 0) { 2208 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + 2209 sizeof (nd_neighbor_advert_t)); 2210 } else { 2211 opt = NULL; 2212 } 2213 if (operation == ND_NEIGHBOR_SOLICIT) { 2214 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 2215 2216 if (opt != NULL && !(flag & NDP_PROBE)) { 2217 /* 2218 * Note that we don't send out SLLA for ND probes 2219 * per RFC 4862, even though we do send out the src 2220 * haddr for IPv4 DAD probes, even though both IPv4 2221 * and IPv6 go out with the unspecified/INADDR_ANY 2222 * src 
IP addr. 2223 */ 2224 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; 2225 } 2226 ip6h->ip6_src = *sender; 2227 ns->nd_ns_target = *target; 2228 if (!(flag & NDP_UNICAST)) { 2229 /* Form multicast address of the target */ 2230 ip6h->ip6_dst = ipv6_solicited_node_mcast; 2231 ip6h->ip6_dst.s6_addr32[3] |= 2232 ns->nd_ns_target.s6_addr32[3]; 2233 } 2234 } else { 2235 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; 2236 2237 ASSERT(!(flag & NDP_PROBE)); 2238 if (opt != NULL) 2239 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; 2240 ip6h->ip6_src = *sender; 2241 na->nd_na_target = *sender; 2242 if (flag & NDP_ISROUTER) 2243 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; 2244 if (flag & NDP_SOLICITED) 2245 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; 2246 if (flag & NDP_ORIDE) 2247 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; 2248 } 2249 2250 if (!(flag & NDP_PROBE)) { 2251 if (hw_addr != NULL && opt != NULL) { 2252 /* Fill in link layer address and option len */ 2253 opt->nd_opt_len = (uint8_t)plen; 2254 bcopy(hw_addr, &opt[1], hw_addr_len); 2255 } 2256 } 2257 if (opt != NULL && opt->nd_opt_type == 0) { 2258 /* If there's no link layer address option, then strip it. */ 2259 len -= plen * 8; 2260 mp->b_wptr = mp->b_rptr + len; 2261 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2262 } 2263 2264 icmp6->icmp6_type = (uint8_t)operation; 2265 icmp6->icmp6_code = 0; 2266 /* 2267 * Prepare for checksum by putting icmp length in the icmp 2268 * checksum field. The checksum is calculated in ip_output.c. 2269 */ 2270 icmp6->icmp6_cksum = ip6h->ip6_plen; 2271 2272 (void) ip_output_simple(mp, &ixas); 2273 ixa_cleanup(&ixas); 2274 if (need_refrele) 2275 ill_refrele(ill); 2276 return (B_FALSE); 2277 } 2278 2279 /* 2280 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 2281 * The datapath uses this as an indication that there 2282 * is a problem (as opposed to a NCE that was just 2283 * reclaimed due to lack of memory. 
2284 * Note that static ARP entries never become unreachable. 2285 */ 2286 void 2287 nce_make_unreachable(ncec_t *ncec) 2288 { 2289 mutex_enter(&ncec->ncec_lock); 2290 ncec->ncec_state = ND_UNREACHABLE; 2291 mutex_exit(&ncec->ncec_lock); 2292 } 2293 2294 /* 2295 * NCE retransmit timer. Common to IPv4 and IPv6. 2296 * This timer goes off when: 2297 * a. It is time to retransmit a resolution for resolver. 2298 * b. It is time to send reachability probes. 2299 */ 2300 void 2301 nce_timer(void *arg) 2302 { 2303 ncec_t *ncec = arg; 2304 ill_t *ill = ncec->ncec_ill, *src_ill; 2305 char addrbuf[INET6_ADDRSTRLEN]; 2306 boolean_t dropped = B_FALSE; 2307 ip_stack_t *ipst = ncec->ncec_ipst; 2308 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2309 in_addr_t sender4 = INADDR_ANY; 2310 in6_addr_t sender6 = ipv6_all_zeros; 2311 2312 /* 2313 * The timer has to be cancelled by ncec_delete before doing the final 2314 * refrele. So the NCE is guaranteed to exist when the timer runs 2315 * until it clears the timeout_id. Before clearing the timeout_id 2316 * bump up the refcnt so that we can continue to use the ncec 2317 */ 2318 ASSERT(ncec != NULL); 2319 mutex_enter(&ncec->ncec_lock); 2320 ncec_refhold_locked(ncec); 2321 ncec->ncec_timeout_id = 0; 2322 mutex_exit(&ncec->ncec_lock); 2323 2324 src_ill = nce_resolve_src(ncec, &sender6); 2325 /* if we could not find a sender address, return */ 2326 if (src_ill == NULL) { 2327 if (!isv6) { 2328 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); 2329 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, 2330 &sender4, addrbuf, sizeof (addrbuf)))); 2331 } else { 2332 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, 2333 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2334 } 2335 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2336 ncec_refrele(ncec); 2337 return; 2338 } 2339 if (!isv6) 2340 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 2341 2342 mutex_enter(&ncec->ncec_lock); 2343 /* 2344 * Check the reachability state. 
2345 */ 2346 switch (ncec->ncec_state) { 2347 case ND_DELAY: 2348 ASSERT(ncec->ncec_lladdr != NULL); 2349 ncec->ncec_state = ND_PROBE; 2350 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2351 if (isv6) { 2352 mutex_exit(&ncec->ncec_lock); 2353 (void) ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 2354 src_ill->ill_phys_addr, 2355 src_ill->ill_phys_addr_length, 2356 &sender6, &ncec->ncec_addr, 2357 NDP_UNICAST); 2358 } else { 2359 (void) arp_request(ncec, sender4, src_ill); 2360 mutex_exit(&ncec->ncec_lock); 2361 } 2362 if (ip_debug > 3) { 2363 /* ip2dbg */ 2364 pr_addr_dbg("nce_timer: state for %s changed " 2365 "to PROBE\n", AF_INET6, &ncec->ncec_addr); 2366 } 2367 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2368 break; 2369 case ND_PROBE: 2370 /* must be retransmit timer */ 2371 ASSERT(ncec->ncec_pcnt >= -1); 2372 if (ncec->ncec_pcnt > 0) { 2373 /* 2374 * As per RFC2461, the ncec gets deleted after 2375 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. 2376 * Note that the first unicast solicitation is sent 2377 * during the DELAY state. 2378 */ 2379 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2380 ncec->ncec_pcnt, 2381 inet_ntop((isv6? AF_INET6 : AF_INET), 2382 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2383 if (NCE_PUBLISH(ncec)) { 2384 mutex_exit(&ncec->ncec_lock); 2385 /* 2386 * send out a probe; note that src_ill 2387 * is ignored by nce_dad() for all 2388 * DAD message types other than IPv6 2389 * unicast probes 2390 */ 2391 nce_dad(ncec, src_ill, B_TRUE); 2392 } else { 2393 ASSERT(src_ill != NULL); 2394 ncec->ncec_pcnt--; 2395 if (isv6) { 2396 mutex_exit(&ncec->ncec_lock); 2397 (void) ndp_xmit(src_ill, 2398 ND_NEIGHBOR_SOLICIT, 2399 src_ill->ill_phys_addr, 2400 src_ill->ill_phys_addr_length, 2401 &sender6, &ncec->ncec_addr, 2402 NDP_UNICAST); 2403 } else { 2404 /* 2405 * since the nce is REACHABLE, 2406 * the ARP request will be sent out 2407 * as a link-layer unicast. 
2408 */ 2409 (void) arp_request(ncec, sender4, 2410 src_ill); 2411 mutex_exit(&ncec->ncec_lock); 2412 } 2413 nce_restart_timer(ncec, 2414 ill->ill_reachable_retrans_time); 2415 } 2416 } else if (ncec->ncec_pcnt < 0) { 2417 /* No hope, delete the ncec */ 2418 /* Tell datapath it went bad */ 2419 ncec->ncec_state = ND_UNREACHABLE; 2420 mutex_exit(&ncec->ncec_lock); 2421 if (ip_debug > 2) { 2422 /* ip1dbg */ 2423 pr_addr_dbg("nce_timer: Delete NCE for" 2424 " dst %s\n", (isv6? AF_INET6: AF_INET), 2425 &ncec->ncec_addr); 2426 } 2427 /* if static ARP can't delete. */ 2428 if ((ncec->ncec_flags & NCE_F_STATIC) == 0) 2429 ncec_delete(ncec); 2430 2431 } else if (!NCE_PUBLISH(ncec)) { 2432 /* 2433 * Probe count is 0 for a dynamic entry (one that we 2434 * ourselves are not publishing). We should never get 2435 * here if NONUD was requested, hence the ASSERT below. 2436 */ 2437 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); 2438 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2439 ncec->ncec_pcnt, inet_ntop(AF_INET6, 2440 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2441 ncec->ncec_pcnt--; 2442 mutex_exit(&ncec->ncec_lock); 2443 /* Wait one interval before killing */ 2444 nce_restart_timer(ncec, 2445 ill->ill_reachable_retrans_time); 2446 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { 2447 ipif_t *ipif; 2448 ipaddr_t ncec_addr; 2449 2450 /* 2451 * We're done probing, and we can now declare this 2452 * address to be usable. Let IP know that it's ok to 2453 * use. 
2454 */ 2455 ncec->ncec_state = ND_REACHABLE; 2456 ncec->ncec_flags &= ~NCE_F_UNVERIFIED; 2457 mutex_exit(&ncec->ncec_lock); 2458 if (isv6) { 2459 ipif = ipif_lookup_addr_exact_v6( 2460 &ncec->ncec_addr, ill, ipst); 2461 } else { 2462 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 2463 ncec_addr); 2464 ipif = ipif_lookup_addr_exact(ncec_addr, ill, 2465 ipst); 2466 } 2467 if (ipif != NULL) { 2468 if (ipif->ipif_was_dup) { 2469 char ibuf[LIFNAMSIZ + 10]; 2470 char sbuf[INET6_ADDRSTRLEN]; 2471 2472 ipif->ipif_was_dup = B_FALSE; 2473 (void) inet_ntop(AF_INET6, 2474 &ipif->ipif_v6lcl_addr, 2475 sbuf, sizeof (sbuf)); 2476 ipif_get_name(ipif, ibuf, 2477 sizeof (ibuf)); 2478 cmn_err(CE_NOTE, "recovered address " 2479 "%s on %s", sbuf, ibuf); 2480 } 2481 if ((ipif->ipif_flags & IPIF_UP) && 2482 !ipif->ipif_addr_ready) 2483 ipif_up_notify(ipif); 2484 ipif->ipif_addr_ready = 1; 2485 ipif_refrele(ipif); 2486 } 2487 if (!isv6 && arp_no_defense) 2488 break; 2489 /* Begin defending our new address */ 2490 if (ncec->ncec_unsolicit_count > 0) { 2491 ncec->ncec_unsolicit_count--; 2492 if (isv6) { 2493 dropped = ndp_announce(ncec); 2494 } else { 2495 dropped = arp_announce(ncec); 2496 } 2497 2498 if (dropped) 2499 ncec->ncec_unsolicit_count++; 2500 else 2501 ncec->ncec_last_time_defended = 2502 ddi_get_lbolt(); 2503 } 2504 if (ncec->ncec_unsolicit_count > 0) { 2505 nce_restart_timer(ncec, 2506 ANNOUNCE_INTERVAL(isv6)); 2507 } else if (DEFENSE_INTERVAL(isv6) != 0) { 2508 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2509 } 2510 } else { 2511 /* 2512 * This is an address we're probing to be our own, but 2513 * the ill is down. Wait until it comes back before 2514 * doing anything, but switch to reachable state so 2515 * that the restart will work. 
2516 */ 2517 ncec->ncec_state = ND_REACHABLE; 2518 mutex_exit(&ncec->ncec_lock); 2519 } 2520 break; 2521 case ND_INCOMPLETE: { 2522 mblk_t *mp, *nextmp; 2523 mblk_t **prevmpp; 2524 2525 /* 2526 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp 2527 * for any IPMP probe packets, and toss them. IPMP probe 2528 * packets will always be at the head of ncec_qd_mp, so that 2529 * we can stop at the first queued ND packet that is 2530 * not a probe packet. 2531 */ 2532 prevmpp = &ncec->ncec_qd_mp; 2533 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { 2534 nextmp = mp->b_next; 2535 2536 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { 2537 inet_freemsg(mp); 2538 ncec->ncec_nprobes--; 2539 *prevmpp = nextmp; 2540 } else { 2541 prevmpp = &mp->b_next; 2542 } 2543 } 2544 2545 /* 2546 * Must be resolver's retransmit timer. 2547 */ 2548 mutex_exit(&ncec->ncec_lock); 2549 ip_ndp_resolve(ncec); 2550 break; 2551 } 2552 case ND_REACHABLE: 2553 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && 2554 ncec->ncec_unsolicit_count != 0) || 2555 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { 2556 if (ncec->ncec_unsolicit_count > 0) { 2557 ncec->ncec_unsolicit_count--; 2558 mutex_exit(&ncec->ncec_lock); 2559 /* 2560 * When we get to zero announcements left, 2561 * switch to address defense 2562 */ 2563 } else { 2564 boolean_t rate_limit; 2565 2566 mutex_exit(&ncec->ncec_lock); 2567 rate_limit = ill_defend_rate_limit(ill, ncec); 2568 if (rate_limit) { 2569 nce_restart_timer(ncec, 2570 DEFENSE_INTERVAL(isv6)); 2571 break; 2572 } 2573 } 2574 if (isv6) { 2575 dropped = ndp_announce(ncec); 2576 } else { 2577 dropped = arp_announce(ncec); 2578 } 2579 mutex_enter(&ncec->ncec_lock); 2580 if (dropped) { 2581 ncec->ncec_unsolicit_count++; 2582 } else { 2583 ncec->ncec_last_time_defended = 2584 ddi_get_lbolt(); 2585 } 2586 mutex_exit(&ncec->ncec_lock); 2587 if (ncec->ncec_unsolicit_count != 0) { 2588 nce_restart_timer(ncec, 2589 ANNOUNCE_INTERVAL(isv6)); 2590 } else { 2591 
nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2592 } 2593 } else { 2594 mutex_exit(&ncec->ncec_lock); 2595 } 2596 break; 2597 default: 2598 mutex_exit(&ncec->ncec_lock); 2599 break; 2600 } 2601 done: 2602 ncec_refrele(ncec); 2603 ill_refrele(src_ill); 2604 } 2605 2606 /* 2607 * Set a link layer address from the ll_addr passed in. 2608 * Copy SAP from ill. 2609 */ 2610 static void 2611 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) 2612 { 2613 ill_t *ill = ncec->ncec_ill; 2614 2615 ASSERT(ll_addr != NULL); 2616 if (ill->ill_phys_addr_length > 0) { 2617 /* 2618 * The bcopy() below used to be called for the physical address 2619 * length rather than the link layer address length. For 2620 * ethernet and many other media, the phys_addr and lla are 2621 * identical. 2622 * 2623 * The phys_addr and lla may not be the same for devices that 2624 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently 2625 * no known instances of these. 2626 * 2627 * For PPP or other interfaces with a zero length 2628 * physical address, don't do anything here. 2629 * The bcopy() with a zero phys_addr length was previously 2630 * a no-op for interfaces with a zero-length physical address. 2631 * Using the lla for them would change the way they operate. 2632 * Doing nothing in such cases preserves expected behavior. 2633 */ 2634 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); 2635 } 2636 } 2637 2638 boolean_t 2639 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, 2640 uint32_t ll_addr_len) 2641 { 2642 ASSERT(ncec->ncec_lladdr != NULL); 2643 if (ll_addr == NULL) 2644 return (B_FALSE); 2645 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) 2646 return (B_TRUE); 2647 return (B_FALSE); 2648 } 2649 2650 /* 2651 * Updates the link layer address or the reachability state of 2652 * a cache entry. Reset probe counter if needed. 
2653 */ 2654 void 2655 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) 2656 { 2657 ill_t *ill = ncec->ncec_ill; 2658 boolean_t need_stop_timer = B_FALSE; 2659 boolean_t need_fastpath_update = B_FALSE; 2660 nce_t *nce = NULL; 2661 timeout_id_t tid; 2662 2663 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2664 /* 2665 * If this interface does not do NUD, there is no point 2666 * in allowing an update to the cache entry. Although 2667 * we will respond to NS. 2668 * The only time we accept an update for a resolver when 2669 * NUD is turned off is when it has just been created. 2670 * Non-Resolvers will always be created as REACHABLE. 2671 */ 2672 if (new_state != ND_UNCHANGED) { 2673 if ((ncec->ncec_flags & NCE_F_NONUD) && 2674 (ncec->ncec_state != ND_INCOMPLETE)) 2675 return; 2676 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); 2677 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); 2678 need_stop_timer = B_TRUE; 2679 if (new_state == ND_REACHABLE) 2680 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64()); 2681 else { 2682 /* We force NUD in this case */ 2683 ncec->ncec_last = 0; 2684 } 2685 ncec->ncec_state = new_state; 2686 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2687 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || 2688 new_state == ND_INCOMPLETE); 2689 } 2690 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2691 tid = ncec->ncec_timeout_id; 2692 ncec->ncec_timeout_id = 0; 2693 } 2694 /* 2695 * Re-trigger fastpath probe and 2696 * overwrite the DL_UNITDATA_REQ data, noting we'll lose 2697 * whatever packets that happens to be transmitting at the time. 
2698 */ 2699 if (new_ll_addr != NULL) { 2700 bcopy(new_ll_addr, ncec->ncec_lladdr, 2701 ill->ill_phys_addr_length); 2702 need_fastpath_update = B_TRUE; 2703 } 2704 mutex_exit(&ncec->ncec_lock); 2705 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2706 if (tid != 0) 2707 (void) untimeout(tid); 2708 } 2709 if (need_fastpath_update) { 2710 /* 2711 * Delete any existing existing dlur_mp and fp_mp information. 2712 * For IPMP interfaces, all underlying ill's must be checked 2713 * and purged. 2714 */ 2715 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 2716 /* 2717 * add the new dlur_mp and fp_mp 2718 */ 2719 nce = nce_fastpath(ncec, B_TRUE, NULL); 2720 if (nce != NULL) 2721 nce_refrele(nce); 2722 } 2723 mutex_enter(&ncec->ncec_lock); 2724 } 2725 2726 static void 2727 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2728 { 2729 uint_t count = 0; 2730 mblk_t **mpp, *tmp; 2731 2732 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2733 2734 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { 2735 if (++count > ncec->ncec_ill->ill_max_buf) { 2736 tmp = ncec->ncec_qd_mp->b_next; 2737 ncec->ncec_qd_mp->b_next = NULL; 2738 /* 2739 * if we never create data addrs on the under_ill 2740 * does this matter? 2741 */ 2742 BUMP_MIB(ncec->ncec_ill->ill_ip_mib, 2743 ipIfStatsOutDiscards); 2744 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, 2745 ncec->ncec_ill); 2746 freemsg(ncec->ncec_qd_mp); 2747 ncec->ncec_qd_mp = tmp; 2748 } 2749 } 2750 2751 if (head_insert) { 2752 ncec->ncec_nprobes++; 2753 mp->b_next = ncec->ncec_qd_mp; 2754 ncec->ncec_qd_mp = mp; 2755 } else { 2756 *mpp = mp; 2757 } 2758 } 2759 2760 /* 2761 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be 2762 * queued at the head or tail of the queue based on the input argument 2763 * 'head_insert'. The caller should specify this argument as B_TRUE if this 2764 * packet is an IPMP probe packet, in which case the following happens: 2765 * 2766 * 1. 
Insert it at the head of the ncec_qd_mp list. Consider the normal 2767 * (non-ipmp_probe) load-speading case where the source address of the ND 2768 * packet is not tied to ncec_ill. If the ill bound to the source address 2769 * cannot receive, the response to the ND packet will not be received. 2770 * However, if ND packets for ncec_ill's probes are queued behind that ND 2771 * packet, those probes will also fail to be sent, and thus in.mpathd will 2772 * erroneously conclude that ncec_ill has also failed. 2773 * 2774 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on 2775 * the first attempt. This ensures that ND problems do not manifest as 2776 * probe RTT spikes. 2777 * 2778 * We achieve this by inserting ipmp_probe() packets at the head of the 2779 * nce_queue. 2780 * 2781 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, 2782 * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 2783 */ 2784 void 2785 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2786 { 2787 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2788 nce_queue_mp_common(ncec, mp, head_insert); 2789 } 2790 2791 /* 2792 * Called when address resolution failed due to a timeout. 2793 * Send an ICMP unreachable in response to all queued packets. 2794 */ 2795 void 2796 ndp_resolv_failed(ncec_t *ncec) 2797 { 2798 mblk_t *mp, *nxt_mp; 2799 char buf[INET6_ADDRSTRLEN]; 2800 ill_t *ill = ncec->ncec_ill; 2801 ip_recv_attr_t iras; 2802 2803 bzero(&iras, sizeof (iras)); 2804 iras.ira_flags = 0; 2805 /* 2806 * we are setting the ira_rill to the ipmp_ill (instead of 2807 * the actual ill on which the packet was received), but this 2808 * is ok because we don't actually need the real ira_rill. 2809 * to send the icmp unreachable to the sender. 
2810 */ 2811 iras.ira_ill = iras.ira_rill = ill; 2812 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2813 iras.ira_rifindex = iras.ira_ruifindex; 2814 2815 ip1dbg(("ndp_resolv_failed: dst %s\n", 2816 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 2817 mutex_enter(&ncec->ncec_lock); 2818 mp = ncec->ncec_qd_mp; 2819 ncec->ncec_qd_mp = NULL; 2820 ncec->ncec_nprobes = 0; 2821 mutex_exit(&ncec->ncec_lock); 2822 while (mp != NULL) { 2823 nxt_mp = mp->b_next; 2824 mp->b_next = NULL; 2825 2826 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2827 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 2828 mp, ill); 2829 icmp_unreachable_v6(mp, 2830 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); 2831 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2832 mp = nxt_mp; 2833 } 2834 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 2835 } 2836 2837 /* 2838 * Handle the completion of NDP and ARP resolution. 2839 */ 2840 void 2841 nce_resolv_ok(ncec_t *ncec) 2842 { 2843 mblk_t *mp; 2844 uint_t pkt_len; 2845 iaflags_t ixaflags = IXAF_NO_TRACE; 2846 nce_t *nce; 2847 ill_t *ill = ncec->ncec_ill; 2848 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2849 ip_stack_t *ipst = ill->ill_ipst; 2850 2851 if (IS_IPMP(ncec->ncec_ill)) { 2852 nce_resolv_ipmp_ok(ncec); 2853 return; 2854 } 2855 /* non IPMP case */ 2856 2857 mutex_enter(&ncec->ncec_lock); 2858 ASSERT(ncec->ncec_nprobes == 0); 2859 mp = ncec->ncec_qd_mp; 2860 ncec->ncec_qd_mp = NULL; 2861 mutex_exit(&ncec->ncec_lock); 2862 2863 while (mp != NULL) { 2864 mblk_t *nxt_mp; 2865 2866 if (ill->ill_isv6) { 2867 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2868 2869 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 2870 } else { 2871 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2872 2873 ixaflags |= IXAF_IS_IPV4; 2874 pkt_len = ntohs(ipha->ipha_length); 2875 } 2876 nxt_mp = mp->b_next; 2877 mp->b_next = NULL; 2878 /* 2879 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no 2880 * longer available, but it's ok to 
drop this flag because TCP 2881 * has its own flow-control in effect, so TCP packets 2882 * are not likely to get here when flow-control is in effect. 2883 */ 2884 mutex_enter(&ill->ill_lock); 2885 nce = nce_lookup(ill, &ncec->ncec_addr); 2886 mutex_exit(&ill->ill_lock); 2887 2888 if (nce == NULL) { 2889 if (isv6) { 2890 BUMP_MIB(&ipst->ips_ip6_mib, 2891 ipIfStatsOutDiscards); 2892 } else { 2893 BUMP_MIB(&ipst->ips_ip_mib, 2894 ipIfStatsOutDiscards); 2895 } 2896 ip_drop_output("ipIfStatsOutDiscards - no nce", 2897 mp, NULL); 2898 freemsg(mp); 2899 } else { 2900 /* 2901 * We don't know the zoneid, but 2902 * ip_xmit does not care since IXAF_NO_TRACE 2903 * is set. (We traced the packet the first 2904 * time through ip_xmit.) 2905 */ 2906 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, 2907 ALL_ZONES, 0, NULL); 2908 nce_refrele(nce); 2909 } 2910 mp = nxt_mp; 2911 } 2912 2913 ncec_cb_dispatch(ncec); /* complete callbacks */ 2914 } 2915 2916 /* 2917 * Called by SIOCSNDP* ioctl to add/change an ncec entry 2918 * and the corresponding attributes. 2919 * Disallow states other than ND_REACHABLE or ND_STALE. 
2920 */ 2921 int 2922 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) 2923 { 2924 sin6_t *sin6; 2925 in6_addr_t *addr; 2926 ncec_t *ncec; 2927 nce_t *nce; 2928 int err = 0; 2929 uint16_t new_flags = 0; 2930 uint16_t old_flags = 0; 2931 int inflags = lnr->lnr_flags; 2932 ip_stack_t *ipst = ill->ill_ipst; 2933 boolean_t do_postprocess = B_FALSE; 2934 2935 ASSERT(ill->ill_isv6); 2936 if ((lnr->lnr_state_create != ND_REACHABLE) && 2937 (lnr->lnr_state_create != ND_STALE)) 2938 return (EINVAL); 2939 2940 sin6 = (sin6_t *)&lnr->lnr_addr; 2941 addr = &sin6->sin6_addr; 2942 2943 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 2944 ASSERT(!IS_UNDER_IPMP(ill)); 2945 nce = nce_lookup_addr(ill, addr); 2946 if (nce != NULL) 2947 new_flags = nce->nce_common->ncec_flags; 2948 2949 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { 2950 case NDF_ISROUTER_ON: 2951 new_flags |= NCE_F_ISROUTER; 2952 break; 2953 case NDF_ISROUTER_OFF: 2954 new_flags &= ~NCE_F_ISROUTER; 2955 break; 2956 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): 2957 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2958 if (nce != NULL) 2959 nce_refrele(nce); 2960 return (EINVAL); 2961 } 2962 2963 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { 2964 case NDF_ANYCAST_ON: 2965 new_flags |= NCE_F_ANYCAST; 2966 break; 2967 case NDF_ANYCAST_OFF: 2968 new_flags &= ~NCE_F_ANYCAST; 2969 break; 2970 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): 2971 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2972 if (nce != NULL) 2973 nce_refrele(nce); 2974 return (EINVAL); 2975 } 2976 2977 if (nce == NULL) { 2978 err = nce_add_v6(ill, 2979 (uchar_t *)lnr->lnr_hdw_addr, 2980 ill->ill_phys_addr_length, 2981 addr, 2982 new_flags, 2983 lnr->lnr_state_create, 2984 &nce); 2985 if (err != 0) { 2986 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2987 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); 2988 return (err); 2989 } else { 2990 do_postprocess = B_TRUE; 2991 } 2992 } 2993 ncec = nce->nce_common; 2994 old_flags = ncec->ncec_flags; 2995 if (old_flags & 
NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { 2996 ncec_router_to_host(ncec); 2997 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2998 if (do_postprocess) 2999 err = nce_add_v6_postprocess(nce); 3000 nce_refrele(nce); 3001 return (0); 3002 } 3003 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3004 3005 if (do_postprocess) 3006 err = nce_add_v6_postprocess(nce); 3007 /* 3008 * err cannot be anything other than 0 because we don't support 3009 * proxy arp of static addresses. 3010 */ 3011 ASSERT(err == 0); 3012 3013 mutex_enter(&ncec->ncec_lock); 3014 ncec->ncec_flags = new_flags; 3015 mutex_exit(&ncec->ncec_lock); 3016 /* 3017 * Note that we ignore the state at this point, which 3018 * should be either STALE or REACHABLE. Instead we let 3019 * the link layer address passed in to determine the state 3020 * much like incoming packets. 3021 */ 3022 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); 3023 nce_refrele(nce); 3024 return (0); 3025 } 3026 3027 /* 3028 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up 3029 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must 3030 * be held to ensure that they are in the same group. 3031 */ 3032 static nce_t * 3033 nce_fastpath_create(ill_t *ill, ncec_t *ncec) 3034 { 3035 3036 nce_t *nce; 3037 3038 nce = nce_ill_lookup_then_add(ill, ncec); 3039 3040 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3041 return (nce); 3042 3043 /* 3044 * hold the ncec_lock to synchronize with nce_update() so that, 3045 * at the end of this function, the contents of nce_dlur_mp are 3046 * consistent with ncec->ncec_lladdr, even though some intermediate 3047 * packet may have been sent out with a mangled address, which would 3048 * only be a transient condition. 
3049 */ 3050 mutex_enter(&ncec->ncec_lock); 3051 if (ncec->ncec_lladdr != NULL) { 3052 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr + 3053 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length); 3054 } else { 3055 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap, 3056 ill->ill_sap_length); 3057 } 3058 mutex_exit(&ncec->ncec_lock); 3059 return (nce); 3060 } 3061 3062 /* 3063 * we make nce_fp_mp to have an M_DATA prepend. 3064 * The caller ensures there is hold on ncec for this function. 3065 * Note that since ill_fastpath_probe() copies the mblk there is 3066 * no need to hold the nce or ncec beyond this function. 3067 * 3068 * If the caller has passed in a non-null ncec_nce to nce_faspath() that 3069 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill 3070 * and will be returned back by this function, so that no extra nce_refrele 3071 * is required for the caller. The calls from nce_add_common() use this 3072 * method. All other callers (that pass in NULL ncec_nce) will have to do a 3073 * nce_refrele of the returned nce (when it is non-null). 3074 */ 3075 nce_t * 3076 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) 3077 { 3078 nce_t *nce; 3079 ill_t *ill = ncec->ncec_ill; 3080 3081 ASSERT(ill != NULL); 3082 3083 if (IS_IPMP(ill) && trigger_fp_req) { 3084 trigger_fp_req = B_FALSE; 3085 ipmp_ncec_fastpath(ncec, ill); 3086 3087 } 3088 /* 3089 * If the caller already has the nce corresponding to the ill, use 3090 * that one. Otherwise we have to lookup/add the nce. Calls from 3091 * nce_add_common() fall in the former category, and have just done 3092 * the nce lookup/add that can be reused. 
3093 */ 3094 if (ncec_nce == NULL) 3095 nce = nce_fastpath_create(ill, ncec); 3096 else 3097 nce = ncec_nce; 3098 3099 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3100 return (nce); 3101 3102 if (trigger_fp_req) 3103 nce_fastpath_trigger(nce); 3104 return (nce); 3105 } 3106 3107 /* 3108 * Trigger fastpath on nce. No locks may be held. 3109 */ 3110 static void 3111 nce_fastpath_trigger(nce_t *nce) 3112 { 3113 int res; 3114 ill_t *ill = nce->nce_ill; 3115 ncec_t *ncec = nce->nce_common; 3116 3117 res = ill_fastpath_probe(ill, nce->nce_dlur_mp); 3118 /* 3119 * EAGAIN is an indication of a transient error 3120 * i.e. allocation failure etc. leave the ncec in the list it 3121 * will be updated when another probe happens for another ire 3122 * if not it will be taken out of the list when the ire is 3123 * deleted. 3124 */ 3125 if (res != 0 && res != EAGAIN && res != ENOTSUP) 3126 nce_fastpath_list_delete(ill, ncec, NULL); 3127 } 3128 3129 /* 3130 * Add ncec to the nce fastpath list on ill. 3131 */ 3132 static nce_t * 3133 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec) 3134 { 3135 nce_t *nce = NULL; 3136 3137 ASSERT(MUTEX_HELD(&ill->ill_lock)); 3138 /* 3139 * Atomically ensure that the ill is not CONDEMNED and is not going 3140 * down, before adding the NCE. 3141 */ 3142 if (ill->ill_state_flags & ILL_CONDEMNED) 3143 return (NULL); 3144 mutex_enter(&ncec->ncec_lock); 3145 /* 3146 * if ncec has not been deleted and 3147 * is not already in the list add it. 
3148 */ 3149 if (!NCE_ISCONDEMNED(ncec)) { 3150 nce = nce_lookup(ill, &ncec->ncec_addr); 3151 if (nce != NULL) 3152 goto done; 3153 nce = nce_add(ill, ncec); 3154 } 3155 done: 3156 mutex_exit(&ncec->ncec_lock); 3157 return (nce); 3158 } 3159 3160 nce_t * 3161 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) 3162 { 3163 nce_t *nce; 3164 3165 mutex_enter(&ill->ill_lock); 3166 nce = nce_ill_lookup_then_add_locked(ill, ncec); 3167 mutex_exit(&ill->ill_lock); 3168 return (nce); 3169 } 3170 3171 3172 /* 3173 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted 3174 * nce is added to the 'dead' list, and the caller must nce_refrele() the 3175 * entry after all locks have been dropped. 3176 */ 3177 void 3178 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead) 3179 { 3180 nce_t *nce; 3181 3182 ASSERT(ill != NULL); 3183 3184 /* first clean out any nce pointers in the under_ills */ 3185 if (IS_IPMP(ill)) 3186 ipmp_ncec_flush_nce(ncec); 3187 3188 /* now the ill itself */ 3189 mutex_enter(&ill->ill_lock); 3190 for (nce = list_head(&ill->ill_nce); nce != NULL; 3191 nce = list_next(&ill->ill_nce, nce)) { 3192 if (nce->nce_common == ncec) { 3193 nce_refhold(nce); 3194 nce_delete(nce); 3195 break; 3196 } 3197 } 3198 mutex_exit(&ill->ill_lock); 3199 if (nce != NULL) { 3200 if (dead == NULL) 3201 nce_refrele(nce); 3202 else 3203 list_insert_tail(dead, nce); 3204 } 3205 } 3206 3207 /* 3208 * when the fastpath response does not fit in the datab 3209 * associated with the existing nce_fp_mp, we delete and 3210 * add the nce to retrigger fastpath based on the information 3211 * in the ncec_t. 
3212 */ 3213 static nce_t * 3214 nce_delete_then_add(nce_t *nce) 3215 { 3216 ill_t *ill = nce->nce_ill; 3217 nce_t *newnce = NULL; 3218 3219 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3220 (void *)nce, ill->ill_name)); 3221 mutex_enter(&ill->ill_lock); 3222 mutex_enter(&nce->nce_common->ncec_lock); 3223 nce_delete(nce); 3224 /* 3225 * Make sure that ncec is not condemned before adding. We hold the 3226 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3227 * ipmp_ncec_flush_nce() 3228 */ 3229 if (!NCE_ISCONDEMNED(nce->nce_common)) 3230 newnce = nce_add(ill, nce->nce_common); 3231 mutex_exit(&nce->nce_common->ncec_lock); 3232 mutex_exit(&ill->ill_lock); 3233 nce_refrele(nce); 3234 return (newnce); /* could be null if nomem */ 3235 } 3236 3237 typedef struct nce_fp_match_s { 3238 nce_t *nce_fp_match_res; 3239 mblk_t *nce_fp_match_ack_mp; 3240 } nce_fp_match_t; 3241 3242 /* ARGSUSED */ 3243 static int 3244 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3245 { 3246 nce_fp_match_t *nce_fp_marg = arg; 3247 ncec_t *ncec = nce->nce_common; 3248 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3249 uchar_t *mp_rptr, *ud_mp_rptr; 3250 mblk_t *ud_mp = nce->nce_dlur_mp; 3251 ptrdiff_t cmplen; 3252 3253 /* 3254 * mp is the mp associated with the fastpath ack. 3255 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3256 * under consideration. If the contents match, then the 3257 * fastpath ack is used to update the nce. 3258 */ 3259 if (ud_mp == NULL) 3260 return (0); /* MH_WALK_CONTINUE */ 3261 mp_rptr = mp->b_rptr; 3262 cmplen = mp->b_wptr - mp_rptr; 3263 ASSERT(cmplen >= 0); 3264 3265 ud_mp_rptr = ud_mp->b_rptr; 3266 /* 3267 * The ncec is locked here to prevent any other threads from accessing 3268 * and changing nce_dlur_mp when the address becomes resolved to an 3269 * lla while we're in the middle of looking at and comparing the 3270 * hardware address (lla). 
It is also locked to prevent multiple 3271 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3272 * time. 3273 */ 3274 mutex_enter(&ncec->ncec_lock); 3275 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3276 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3277 nce_fp_marg->nce_fp_match_res = nce; 3278 mutex_exit(&ncec->ncec_lock); 3279 nce_refhold(nce); 3280 return (1); /* MH_WALK_TERMINATE */ 3281 } 3282 mutex_exit(&ncec->ncec_lock); 3283 return (0); /* MH_WALK_CONTINUE */ 3284 } 3285 3286 /* 3287 * Update all NCE's that are not in fastpath mode and 3288 * have an nce_fp_mp that matches mp. mp->b_cont contains 3289 * the fastpath header. 3290 * 3291 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3292 */ 3293 void 3294 nce_fastpath_update(ill_t *ill, mblk_t *mp) 3295 { 3296 nce_fp_match_t nce_fp_marg; 3297 nce_t *nce; 3298 mblk_t *nce_fp_mp, *fp_mp; 3299 3300 nce_fp_marg.nce_fp_match_res = NULL; 3301 nce_fp_marg.nce_fp_match_ack_mp = mp; 3302 3303 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); 3304 3305 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) 3306 return; 3307 3308 mutex_enter(&nce->nce_lock); 3309 nce_fp_mp = nce->nce_fp_mp; 3310 3311 if (nce_fp_mp != NULL) { 3312 fp_mp = mp->b_cont; 3313 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > 3314 nce_fp_mp->b_datap->db_lim) { 3315 mutex_exit(&nce->nce_lock); 3316 nce = nce_delete_then_add(nce); 3317 if (nce == NULL) { 3318 return; 3319 } 3320 mutex_enter(&nce->nce_lock); 3321 nce_fp_mp = nce->nce_fp_mp; 3322 } 3323 } 3324 3325 /* Matched - install mp as the fastpath mp */ 3326 if (nce_fp_mp == NULL) { 3327 fp_mp = dupb(mp->b_cont); 3328 nce->nce_fp_mp = fp_mp; 3329 } else { 3330 fp_mp = mp->b_cont; 3331 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); 3332 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr 3333 + MBLKL(fp_mp); 3334 } 3335 mutex_exit(&nce->nce_lock); 3336 nce_refrele(nce); 3337 } 3338 3339 /* 3340 * Return a pointer to a given option in the packet. 
3341 * Assumes that option part of the packet have already been validated. 3342 */ 3343 nd_opt_hdr_t * 3344 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) 3345 { 3346 while (optlen > 0) { 3347 if (opt->nd_opt_type == opt_type) 3348 return (opt); 3349 optlen -= 8 * opt->nd_opt_len; 3350 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3351 } 3352 return (NULL); 3353 } 3354 3355 /* 3356 * Verify all option lengths present are > 0, also check to see 3357 * if the option lengths and packet length are consistent. 3358 */ 3359 boolean_t 3360 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) 3361 { 3362 ASSERT(opt != NULL); 3363 while (optlen > 0) { 3364 if (opt->nd_opt_len == 0) 3365 return (B_FALSE); 3366 optlen -= 8 * opt->nd_opt_len; 3367 if (optlen < 0) 3368 return (B_FALSE); 3369 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3370 } 3371 return (B_TRUE); 3372 } 3373 3374 /* 3375 * ncec_walk function. 3376 * Free a fraction of the NCE cache entries. 3377 * 3378 * A possible optimization here would be to use ncec_last where possible, and 3379 * delete the least-frequently used entry, which would require more complex 3380 * computation as we walk through the ncec's (e.g., track ncec entries by 3381 * order of ncec_last and/or maintain state) 3382 */ 3383 static void 3384 ncec_cache_reclaim(ncec_t *ncec, char *arg) 3385 { 3386 ip_stack_t *ipst = ncec->ncec_ipst; 3387 uint_t fraction = *(uint_t *)arg; 3388 uint_t rand; 3389 3390 if ((ncec->ncec_flags & 3391 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { 3392 return; 3393 } 3394 3395 rand = (uint_t)ddi_get_lbolt() + 3396 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); 3397 if ((rand/fraction)*fraction == rand) { 3398 IP_STAT(ipst, ip_nce_reclaim_deleted); 3399 ncec_delete(ncec); 3400 } 3401 } 3402 3403 /* 3404 * kmem_cache callback to free up memory. 3405 * 3406 * For now we just delete a fixed fraction. 
3407 */ 3408 static void 3409 ip_nce_reclaim_stack(ip_stack_t *ipst) 3410 { 3411 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction; 3412 3413 IP_STAT(ipst, ip_nce_reclaim_calls); 3414 3415 ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst); 3416 3417 /* 3418 * Walk all CONNs that can have a reference on an ire, ncec or dce. 3419 * Get them to update any stale references to drop any refholds they 3420 * have. 3421 */ 3422 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 3423 } 3424 3425 /* 3426 * Called by the memory allocator subsystem directly, when the system 3427 * is running low on memory. 3428 */ 3429 /* ARGSUSED */ 3430 void 3431 ip_nce_reclaim(void *args) 3432 { 3433 netstack_handle_t nh; 3434 netstack_t *ns; 3435 3436 netstack_next_init(&nh); 3437 while ((ns = netstack_next(&nh)) != NULL) { 3438 ip_nce_reclaim_stack(ns->netstack_ip); 3439 netstack_rele(ns); 3440 } 3441 netstack_next_fini(&nh); 3442 } 3443 3444 #ifdef DEBUG 3445 void 3446 ncec_trace_ref(ncec_t *ncec) 3447 { 3448 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3449 3450 if (ncec->ncec_trace_disable) 3451 return; 3452 3453 if (!th_trace_ref(ncec, ncec->ncec_ipst)) { 3454 ncec->ncec_trace_disable = B_TRUE; 3455 ncec_trace_cleanup(ncec); 3456 } 3457 } 3458 3459 void 3460 ncec_untrace_ref(ncec_t *ncec) 3461 { 3462 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3463 3464 if (!ncec->ncec_trace_disable) 3465 th_trace_unref(ncec); 3466 } 3467 3468 static void 3469 ncec_trace_cleanup(const ncec_t *ncec) 3470 { 3471 th_trace_cleanup(ncec, ncec->ncec_trace_disable); 3472 } 3473 #endif 3474 3475 /* 3476 * Called when address resolution fails due to a timeout. 3477 * Send an ICMP unreachable in response to all queued packets. 
3478 */ 3479 void 3480 arp_resolv_failed(ncec_t *ncec) 3481 { 3482 mblk_t *mp, *nxt_mp; 3483 char buf[INET6_ADDRSTRLEN]; 3484 struct in_addr ipv4addr; 3485 ill_t *ill = ncec->ncec_ill; 3486 ip_stack_t *ipst = ncec->ncec_ipst; 3487 ip_recv_attr_t iras; 3488 3489 bzero(&iras, sizeof (iras)); 3490 iras.ira_flags = IRAF_IS_IPV4; 3491 /* 3492 * we are setting the ira_rill to the ipmp_ill (instead of 3493 * the actual ill on which the packet was received), but this 3494 * is ok because we don't actually need the real ira_rill. 3495 * to send the icmp unreachable to the sender. 3496 */ 3497 iras.ira_ill = iras.ira_rill = ill; 3498 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3499 iras.ira_rifindex = iras.ira_ruifindex; 3500 3501 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr); 3502 ip3dbg(("arp_resolv_failed: dst %s\n", 3503 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); 3504 mutex_enter(&ncec->ncec_lock); 3505 mp = ncec->ncec_qd_mp; 3506 ncec->ncec_qd_mp = NULL; 3507 ncec->ncec_nprobes = 0; 3508 mutex_exit(&ncec->ncec_lock); 3509 while (mp != NULL) { 3510 nxt_mp = mp->b_next; 3511 mp->b_next = NULL; 3512 3513 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3514 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 3515 mp, ill); 3516 if (ipst->ips_ip_arp_icmp_error) { 3517 ip3dbg(("arp_resolv_failed: " 3518 "Calling icmp_unreachable\n")); 3519 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 3520 } else { 3521 freemsg(mp); 3522 } 3523 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3524 mp = nxt_mp; 3525 } 3526 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 3527 } 3528 3529 /* 3530 * if ill is an under_ill, translate it to the ipmp_ill and add the 3531 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and 3532 * one on the underlying in_ill) will be created for the 3533 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill. 
3534 */ 3535 int 3536 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3537 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3538 { 3539 int err; 3540 in6_addr_t addr6; 3541 ip_stack_t *ipst = ill->ill_ipst; 3542 nce_t *nce, *upper_nce = NULL; 3543 ill_t *in_ill = ill, *under = NULL; 3544 boolean_t need_ill_refrele = B_FALSE; 3545 3546 if (flags & NCE_F_MCAST) { 3547 /* 3548 * hw_addr will be figured out in nce_set_multicast_v4; 3549 * caller needs to pass in the cast_ill for ipmp 3550 */ 3551 ASSERT(hw_addr == NULL); 3552 ASSERT(!IS_IPMP(ill)); 3553 err = nce_set_multicast_v4(ill, addr, flags, newnce); 3554 return (err); 3555 } 3556 3557 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 3558 ill = ipmp_ill_hold_ipmp_ill(ill); 3559 if (ill == NULL) 3560 return (ENXIO); 3561 need_ill_refrele = B_TRUE; 3562 } 3563 if ((flags & NCE_F_BCAST) != 0) { 3564 /* 3565 * IPv4 broadcast ncec: compute the hwaddr. 3566 */ 3567 if (IS_IPMP(ill)) { 3568 under = ipmp_ill_get_xmit_ill(ill, B_FALSE); 3569 if (under == NULL) { 3570 if (need_ill_refrele) 3571 ill_refrele(ill); 3572 return (ENETDOWN); 3573 } 3574 hw_addr = under->ill_bcast_mp->b_rptr + 3575 NCE_LL_ADDR_OFFSET(under); 3576 hw_addr_len = under->ill_phys_addr_length; 3577 } else { 3578 hw_addr = ill->ill_bcast_mp->b_rptr + 3579 NCE_LL_ADDR_OFFSET(ill), 3580 hw_addr_len = ill->ill_phys_addr_length; 3581 } 3582 } 3583 3584 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3585 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3586 nce = nce_lookup_addr(ill, &addr6); 3587 if (nce == NULL) { 3588 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, 3589 state, &nce); 3590 } else { 3591 err = EEXIST; 3592 } 3593 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3594 if (err == 0) 3595 err = nce_add_v4_postprocess(nce); 3596 3597 if (in_ill != ill && nce != NULL) { 3598 nce_t *under_nce; 3599 3600 /* 3601 * in_ill was the under_ill. Try to create the under_nce. 
3602 * Hold the ill_g_lock to prevent changes to group membership 3603 * until we are done. 3604 */ 3605 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3606 if (IS_IN_SAME_ILLGRP(in_ill, ill)) { 3607 under_nce = nce_fastpath_create(in_ill, 3608 nce->nce_common); 3609 upper_nce = nce; 3610 if ((nce = under_nce) == NULL) 3611 err = EINVAL; 3612 } 3613 rw_exit(&ipst->ips_ill_g_lock); 3614 if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common)) 3615 nce_fastpath_trigger(under_nce); 3616 } 3617 if (nce != NULL) { 3618 if (newnce != NULL) 3619 *newnce = nce; 3620 else 3621 nce_refrele(nce); 3622 } 3623 3624 if (under != NULL) 3625 ill_refrele(under); 3626 3627 if (upper_nce != NULL) 3628 nce_refrele(upper_nce); 3629 3630 if (need_ill_refrele) 3631 ill_refrele(ill); 3632 3633 return (err); 3634 } 3635 3636 /* 3637 * NDP Cache Entry creation routine for IPv4. 3638 * Mapped entries are handled in arp. 3639 * This routine must always be called with ndp4->ndp_g_lock held. 3640 * Prior to return, ncec_refcnt is incremented. 3641 * 3642 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 3643 * are always added pointing at the ipmp_ill. Thus, when the ill passed 3644 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 3645 * entries will be created, both pointing at the same ncec_t. The nce_t 3646 * entries will have their nce_ill set to the ipmp_ill and the under_ill 3647 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 3648 * Local addresses are always created on the ill passed to nce_add_v4. 
3649 */ 3650 int 3651 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3652 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3653 { 3654 int err; 3655 boolean_t is_multicast = (flags & NCE_F_MCAST); 3656 struct in6_addr addr6; 3657 nce_t *nce; 3658 3659 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 3660 ASSERT(!ill->ill_isv6); 3661 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); 3662 3663 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3664 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, 3665 &nce); 3666 ASSERT(newnce != NULL); 3667 *newnce = nce; 3668 return (err); 3669 } 3670 3671 /* 3672 * Post-processing routine to be executed after nce_add_v4(). This function 3673 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 3674 * and must be called without any locks held. 3675 * 3676 * Always returns 0, but we return an int to keep this symmetric with the 3677 * IPv6 counter-part. 3678 */ 3679 int 3680 nce_add_v4_postprocess(nce_t *nce) 3681 { 3682 ncec_t *ncec = nce->nce_common; 3683 uint16_t flags = ncec->ncec_flags; 3684 boolean_t ndp_need_dad = B_FALSE; 3685 boolean_t dropped; 3686 clock_t delay; 3687 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; 3688 uchar_t *hw_addr = ncec->ncec_lladdr; 3689 boolean_t trigger_fastpath = B_TRUE; 3690 3691 /* 3692 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 3693 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
3694 * We call nce_fastpath from nce_update if the link layer address of 3695 * the peer changes from nce_update 3696 */ 3697 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && 3698 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) 3699 trigger_fastpath = B_FALSE; 3700 3701 if (trigger_fastpath) 3702 nce_fastpath_trigger(nce); 3703 3704 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 3705 /* 3706 * Either the caller (by passing in ND_PROBE) 3707 * or nce_add_common() (by the internally computed state 3708 * based on ncec_addr and ill_net_type) has determined 3709 * that this unicast entry needs DAD. Trigger DAD. 3710 */ 3711 ndp_need_dad = B_TRUE; 3712 } else if (flags & NCE_F_UNSOL_ADV) { 3713 /* 3714 * We account for the transmit below by assigning one 3715 * less than the ndd variable. Subsequent decrements 3716 * are done in nce_timer. 3717 */ 3718 mutex_enter(&ncec->ncec_lock); 3719 ncec->ncec_unsolicit_count = 3720 ipst->ips_ip_arp_publish_count - 1; 3721 mutex_exit(&ncec->ncec_lock); 3722 dropped = arp_announce(ncec); 3723 mutex_enter(&ncec->ncec_lock); 3724 if (dropped) 3725 ncec->ncec_unsolicit_count++; 3726 else 3727 ncec->ncec_last_time_defended = ddi_get_lbolt(); 3728 if (ncec->ncec_unsolicit_count != 0) { 3729 nce_start_timer(ncec, 3730 ipst->ips_ip_arp_publish_interval); 3731 } 3732 mutex_exit(&ncec->ncec_lock); 3733 } 3734 3735 /* 3736 * If ncec_xmit_interval is 0, user has configured us to send the first 3737 * probe right away. Do so, and set up for the subsequent probes. 3738 */ 3739 if (ndp_need_dad) { 3740 mutex_enter(&ncec->ncec_lock); 3741 if (ncec->ncec_pcnt == 0) { 3742 /* 3743 * DAD probes and announce can be 3744 * administratively disabled by setting the 3745 * probe_count to zero. Restart the timer in 3746 * this case to mark the ipif as ready. 
3747 */ 3748 ncec->ncec_unsolicit_count = 0; 3749 mutex_exit(&ncec->ncec_lock); 3750 nce_restart_timer(ncec, 0); 3751 } else { 3752 mutex_exit(&ncec->ncec_lock); 3753 delay = ((ncec->ncec_flags & NCE_F_FAST) ? 3754 ipst->ips_arp_probe_delay : 3755 ipst->ips_arp_fastprobe_delay); 3756 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); 3757 } 3758 } 3759 return (0); 3760 } 3761 3762 /* 3763 * ncec_walk routine to update all entries that have a given destination or 3764 * gateway address and cached link layer (MAC) address. This is used when ARP 3765 * informs us that a network-to-link-layer mapping may have changed. 3766 */ 3767 void 3768 nce_update_hw_changed(ncec_t *ncec, void *arg) 3769 { 3770 nce_hw_map_t *hwm = arg; 3771 ipaddr_t ncec_addr; 3772 3773 if (ncec->ncec_state != ND_REACHABLE) 3774 return; 3775 3776 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 3777 if (ncec_addr != hwm->hwm_addr) 3778 return; 3779 3780 mutex_enter(&ncec->ncec_lock); 3781 if (hwm->hwm_flags != 0) 3782 ncec->ncec_flags = hwm->hwm_flags; 3783 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); 3784 mutex_exit(&ncec->ncec_lock); 3785 } 3786 3787 void 3788 ncec_refhold(ncec_t *ncec) 3789 { 3790 mutex_enter(&(ncec)->ncec_lock); 3791 (ncec)->ncec_refcnt++; 3792 ASSERT((ncec)->ncec_refcnt != 0); 3793 #ifdef DEBUG 3794 ncec_trace_ref(ncec); 3795 #endif 3796 mutex_exit(&(ncec)->ncec_lock); 3797 } 3798 3799 void 3800 ncec_refhold_notr(ncec_t *ncec) 3801 { 3802 mutex_enter(&(ncec)->ncec_lock); 3803 (ncec)->ncec_refcnt++; 3804 ASSERT((ncec)->ncec_refcnt != 0); 3805 mutex_exit(&(ncec)->ncec_lock); 3806 } 3807 3808 static void 3809 ncec_refhold_locked(ncec_t *ncec) 3810 { 3811 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); 3812 (ncec)->ncec_refcnt++; 3813 #ifdef DEBUG 3814 ncec_trace_ref(ncec); 3815 #endif 3816 } 3817 3818 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */ 3819 void 3820 ncec_refrele(ncec_t *ncec) 3821 { 3822 mutex_enter(&(ncec)->ncec_lock); 3823 #ifdef DEBUG 3824 
ncec_untrace_ref(ncec); 3825 #endif 3826 ASSERT((ncec)->ncec_refcnt != 0); 3827 if (--(ncec)->ncec_refcnt == 0) { 3828 ncec_inactive(ncec); 3829 } else { 3830 mutex_exit(&(ncec)->ncec_lock); 3831 } 3832 } 3833 3834 void 3835 ncec_refrele_notr(ncec_t *ncec) 3836 { 3837 mutex_enter(&(ncec)->ncec_lock); 3838 ASSERT((ncec)->ncec_refcnt != 0); 3839 if (--(ncec)->ncec_refcnt == 0) { 3840 ncec_inactive(ncec); 3841 } else { 3842 mutex_exit(&(ncec)->ncec_lock); 3843 } 3844 } 3845 3846 /* 3847 * Common to IPv4 and IPv6. 3848 */ 3849 void 3850 nce_restart_timer(ncec_t *ncec, uint_t ms) 3851 { 3852 timeout_id_t tid; 3853 3854 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock)); 3855 3856 /* First cancel any running timer */ 3857 mutex_enter(&ncec->ncec_lock); 3858 tid = ncec->ncec_timeout_id; 3859 ncec->ncec_timeout_id = 0; 3860 if (tid != 0) { 3861 mutex_exit(&ncec->ncec_lock); 3862 (void) untimeout(tid); 3863 mutex_enter(&ncec->ncec_lock); 3864 } 3865 3866 /* Restart timer */ 3867 nce_start_timer(ncec, ms); 3868 mutex_exit(&ncec->ncec_lock); 3869 } 3870 3871 static void 3872 nce_start_timer(ncec_t *ncec, uint_t ms) 3873 { 3874 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3875 /* 3876 * Don't start the timer if the ncec has been deleted, or if the timer 3877 * is already running 3878 */ 3879 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) { 3880 ncec->ncec_timeout_id = timeout(nce_timer, ncec, 3881 MSEC_TO_TICK(ms) == 0 ? 
1 : MSEC_TO_TICK(ms)); 3882 } 3883 } 3884 3885 int 3886 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 3887 uint16_t flags, nce_t **newnce) 3888 { 3889 uchar_t *hw_addr; 3890 int err = 0; 3891 ip_stack_t *ipst = ill->ill_ipst; 3892 in6_addr_t dst6; 3893 nce_t *nce; 3894 3895 ASSERT(!ill->ill_isv6); 3896 3897 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6); 3898 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3899 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) { 3900 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3901 goto done; 3902 } 3903 if (ill->ill_net_type == IRE_IF_RESOLVER) { 3904 /* 3905 * For IRE_IF_RESOLVER a hardware mapping can be 3906 * generated, for IRE_IF_NORESOLVER, resolution cookie 3907 * in the ill is copied in nce_add_v4(). 3908 */ 3909 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP); 3910 if (hw_addr == NULL) { 3911 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3912 return (ENOMEM); 3913 } 3914 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 3915 } else { 3916 /* 3917 * IRE_IF_NORESOLVER type simply copies the resolution 3918 * cookie passed in. So no hw_addr is needed. 3919 */ 3920 hw_addr = NULL; 3921 } 3922 ASSERT(flags & NCE_F_MCAST); 3923 ASSERT(flags & NCE_F_NONUD); 3924 /* nce_state will be computed by nce_add_common() */ 3925 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 3926 ND_UNCHANGED, &nce); 3927 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3928 if (err == 0) 3929 err = nce_add_v4_postprocess(nce); 3930 if (hw_addr != NULL) 3931 kmem_free(hw_addr, ill->ill_phys_addr_length); 3932 if (err != 0) { 3933 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); 3934 return (err); 3935 } 3936 done: 3937 if (newnce != NULL) 3938 *newnce = nce; 3939 else 3940 nce_refrele(nce); 3941 return (0); 3942 } 3943 3944 /* 3945 * This is used when scanning for "old" (least recently broadcast) NCEs. We 3946 * don't want to have to walk the list for every single one, so we gather up 3947 * batches at a time. 
3948 */ 3949 #define NCE_RESCHED_LIST_LEN 8 3950 3951 typedef struct { 3952 ill_t *ncert_ill; 3953 uint_t ncert_num; 3954 ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN]; 3955 } nce_resched_t; 3956 3957 /* 3958 * Pick the longest waiting NCEs for defense. 3959 */ 3960 /* ARGSUSED */ 3961 static int 3962 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg) 3963 { 3964 nce_resched_t *ncert = arg; 3965 ncec_t **ncecs; 3966 ncec_t **ncec_max; 3967 ncec_t *ncec_temp; 3968 ncec_t *ncec = nce->nce_common; 3969 3970 ASSERT(ncec->ncec_ill == ncert->ncert_ill); 3971 /* 3972 * Only reachable entries that are ready for announcement are eligible. 3973 */ 3974 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE) 3975 return (0); 3976 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) { 3977 ncec_refhold(ncec); 3978 ncert->ncert_nces[ncert->ncert_num++] = ncec; 3979 } else { 3980 ncecs = ncert->ncert_nces; 3981 ncec_max = ncecs + NCE_RESCHED_LIST_LEN; 3982 ncec_refhold(ncec); 3983 for (; ncecs < ncec_max; ncecs++) { 3984 ASSERT(ncec != NULL); 3985 if ((*ncecs)->ncec_last_time_defended > 3986 ncec->ncec_last_time_defended) { 3987 ncec_temp = *ncecs; 3988 *ncecs = ncec; 3989 ncec = ncec_temp; 3990 } 3991 } 3992 ncec_refrele(ncec); 3993 } 3994 return (0); 3995 } 3996 3997 /* 3998 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this 3999 * doesn't happen very often (if at all), and thus it needn't be highly 4000 * optimized. (Note, though, that it's actually O(N) complexity, because the 4001 * outer loop is bounded by a constant rather than by the length of the list.) 
4002 */ 4003 static void 4004 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert) 4005 { 4006 ncec_t *ncec; 4007 ip_stack_t *ipst = ill->ill_ipst; 4008 uint_t i, defend_rate; 4009 4010 i = ill->ill_defend_count; 4011 ill->ill_defend_count = 0; 4012 if (ill->ill_isv6) 4013 defend_rate = ipst->ips_ndp_defend_rate; 4014 else 4015 defend_rate = ipst->ips_arp_defend_rate; 4016 /* If none could be sitting around, then don't reschedule */ 4017 if (i < defend_rate) { 4018 DTRACE_PROBE1(reschedule_none, ill_t *, ill); 4019 return; 4020 } 4021 ncert->ncert_ill = ill; 4022 while (ill->ill_defend_count < defend_rate) { 4023 nce_walk_common(ill, ncec_reschedule, ncert); 4024 for (i = 0; i < ncert->ncert_num; i++) { 4025 4026 ncec = ncert->ncert_nces[i]; 4027 mutex_enter(&ncec->ncec_lock); 4028 ncec->ncec_flags |= NCE_F_DELAYED; 4029 mutex_exit(&ncec->ncec_lock); 4030 /* 4031 * we plan to schedule this ncec, so incr the 4032 * defend_count in anticipation. 4033 */ 4034 if (++ill->ill_defend_count >= defend_rate) 4035 break; 4036 } 4037 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) 4038 break; 4039 } 4040 } 4041 4042 /* 4043 * Check if the current rate-limiting parameters permit the sending 4044 * of another address defense announcement for both IPv4 and IPv6. 4045 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not 4046 * permitted), and B_FALSE otherwise. The `defend_rate' parameter 4047 * determines how many address defense announcements are permitted 4048 * in any `defense_perio' interval. 
4049 */ 4050 static boolean_t 4051 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec) 4052 { 4053 clock_t now = ddi_get_lbolt(); 4054 ip_stack_t *ipst = ill->ill_ipst; 4055 clock_t start = ill->ill_defend_start; 4056 uint32_t elapsed, defend_period, defend_rate; 4057 nce_resched_t ncert; 4058 boolean_t ret; 4059 int i; 4060 4061 if (ill->ill_isv6) { 4062 defend_period = ipst->ips_ndp_defend_period; 4063 defend_rate = ipst->ips_ndp_defend_rate; 4064 } else { 4065 defend_period = ipst->ips_arp_defend_period; 4066 defend_rate = ipst->ips_arp_defend_rate; 4067 } 4068 if (defend_rate == 0) 4069 return (B_TRUE); 4070 bzero(&ncert, sizeof (ncert)); 4071 mutex_enter(&ill->ill_lock); 4072 if (start > 0) { 4073 elapsed = now - start; 4074 if (elapsed > SEC_TO_TICK(defend_period)) { 4075 ill->ill_defend_start = now; 4076 /* 4077 * nce_ill_reschedule will attempt to 4078 * prevent starvation by reschduling the 4079 * oldest entries, which are marked with 4080 * the NCE_F_DELAYED flag. 4081 */ 4082 nce_ill_reschedule(ill, &ncert); 4083 } 4084 } else { 4085 ill->ill_defend_start = now; 4086 } 4087 ASSERT(ill->ill_defend_count <= defend_rate); 4088 mutex_enter(&ncec->ncec_lock); 4089 if (ncec->ncec_flags & NCE_F_DELAYED) { 4090 /* 4091 * This ncec was rescheduled as one of the really old 4092 * entries needing on-going defense. The 4093 * ill_defend_count was already incremented in 4094 * nce_ill_reschedule. Go ahead and send the announce. 4095 */ 4096 ncec->ncec_flags &= ~NCE_F_DELAYED; 4097 mutex_exit(&ncec->ncec_lock); 4098 ret = B_FALSE; 4099 goto done; 4100 } 4101 mutex_exit(&ncec->ncec_lock); 4102 if (ill->ill_defend_count < defend_rate) 4103 ill->ill_defend_count++; 4104 if (ill->ill_defend_count == defend_rate) { 4105 /* 4106 * we are no longer allowed to send unbidden defense 4107 * messages. Wait for rescheduling. 
4108 */ 4109 ret = B_TRUE; 4110 } else { 4111 ret = B_FALSE; 4112 } 4113 done: 4114 mutex_exit(&ill->ill_lock); 4115 /* 4116 * After all the locks have been dropped we can restart nce timer, 4117 * and refrele the delayed ncecs 4118 */ 4119 for (i = 0; i < ncert.ncert_num; i++) { 4120 clock_t xmit_interval; 4121 ncec_t *tmp; 4122 4123 tmp = ncert.ncert_nces[i]; 4124 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval, 4125 B_FALSE); 4126 nce_restart_timer(tmp, xmit_interval); 4127 ncec_refrele(tmp); 4128 } 4129 return (ret); 4130 } 4131 4132 boolean_t 4133 ndp_announce(ncec_t *ncec) 4134 { 4135 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr, 4136 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast, 4137 nce_advert_flags(ncec))); 4138 } 4139 4140 ill_t * 4141 nce_resolve_src(ncec_t *ncec, in6_addr_t *src) 4142 { 4143 mblk_t *mp; 4144 in6_addr_t src6; 4145 ipaddr_t src4; 4146 ill_t *ill = ncec->ncec_ill; 4147 ill_t *src_ill = NULL; 4148 ipif_t *ipif = NULL; 4149 boolean_t is_myaddr = NCE_MYADDR(ncec); 4150 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 4151 4152 ASSERT(src != NULL); 4153 ASSERT(IN6_IS_ADDR_UNSPECIFIED(src)); 4154 src6 = *src; 4155 if (is_myaddr) { 4156 src6 = ncec->ncec_addr; 4157 if (!isv6) 4158 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4); 4159 } else { 4160 /* 4161 * try to find one from the outgoing packet. 4162 */ 4163 mutex_enter(&ncec->ncec_lock); 4164 mp = ncec->ncec_qd_mp; 4165 if (mp != NULL) { 4166 if (isv6) { 4167 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 4168 4169 src6 = ip6h->ip6_src; 4170 } else { 4171 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4172 4173 src4 = ipha->ipha_src; 4174 IN6_IPADDR_TO_V4MAPPED(src4, &src6); 4175 } 4176 } 4177 mutex_exit(&ncec->ncec_lock); 4178 } 4179 4180 /* 4181 * For outgoing packets, if the src of outgoing packet is one 4182 * of the assigned interface addresses use it, otherwise we 4183 * will pick the source address below. 
4184 * For local addresses (is_myaddr) doing DAD, NDP announce 4185 * messages are mcast. So we use the (IPMP) cast_ill or the 4186 * (non-IPMP) ncec_ill for these message types. The only case 4187 * of unicast DAD messages are for IPv6 ND probes, for which 4188 * we find the ipif_bound_ill corresponding to the ncec_addr. 4189 */ 4190 if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) { 4191 if (isv6) { 4192 ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES, 4193 ill->ill_ipst); 4194 } else { 4195 ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES, 4196 ill->ill_ipst); 4197 } 4198 4199 /* 4200 * If no relevant ipif can be found, then it's not one of our 4201 * addresses. Reset to :: and try to find a src for the NS or 4202 * ARP request using ipif_select_source_v[4,6] below. 4203 * If an ipif can be found, but it's not yet done with 4204 * DAD verification, and we are not being invoked for 4205 * DAD (i.e., !is_myaddr), then just postpone this 4206 * transmission until later. 4207 */ 4208 if (ipif == NULL) { 4209 src6 = ipv6_all_zeros; 4210 src4 = INADDR_ANY; 4211 } else if (!ipif->ipif_addr_ready && !is_myaddr) { 4212 DTRACE_PROBE2(nce__resolve__ipif__not__ready, 4213 ncec_t *, ncec, ipif_t *, ipif); 4214 ipif_refrele(ipif); 4215 return (NULL); 4216 } 4217 } 4218 4219 if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) { 4220 /* 4221 * Pick a source address for this solicitation, but 4222 * restrict the selection to addresses assigned to the 4223 * output interface. We do this because the destination will 4224 * create a neighbor cache entry for the source address of 4225 * this packet, so the source address had better be a valid 4226 * neighbor. 
4227 */ 4228 if (isv6) { 4229 ipif = ipif_select_source_v6(ill, &ncec->ncec_addr, 4230 B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4231 B_FALSE, NULL); 4232 } else { 4233 ipaddr_t nce_addr; 4234 4235 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr); 4236 ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES, 4237 B_FALSE, NULL); 4238 } 4239 if (ipif == NULL && IS_IPMP(ill)) { 4240 ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE); 4241 4242 if (send_ill != NULL) { 4243 if (isv6) { 4244 ipif = ipif_select_source_v6(send_ill, 4245 &ncec->ncec_addr, B_TRUE, 4246 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4247 B_FALSE, NULL); 4248 } else { 4249 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 4250 src4); 4251 ipif = ipif_select_source_v4(send_ill, 4252 src4, ALL_ZONES, B_TRUE, NULL); 4253 } 4254 ill_refrele(send_ill); 4255 } 4256 } 4257 4258 if (ipif == NULL) { 4259 char buf[INET6_ADDRSTRLEN]; 4260 4261 ip1dbg(("nce_resolve_src: No source ipif for dst %s\n", 4262 inet_ntop((isv6 ? AF_INET6 : AF_INET), 4263 (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 4264 DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec); 4265 return (NULL); 4266 } 4267 src6 = ipif->ipif_v6lcl_addr; 4268 } 4269 *src = src6; 4270 if (ipif != NULL) { 4271 src_ill = ipif->ipif_ill; 4272 if (IS_IPMP(src_ill)) 4273 src_ill = ipmp_ipif_hold_bound_ill(ipif); 4274 else 4275 ill_refhold(src_ill); 4276 ipif_refrele(ipif); 4277 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec, 4278 ill_t *, src_ill); 4279 } 4280 return (src_ill); 4281 } 4282 4283 void 4284 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst, 4285 uchar_t *hwaddr, int hwaddr_len, int flags) 4286 { 4287 ill_t *ill; 4288 ncec_t *ncec; 4289 nce_t *nce; 4290 uint16_t new_state; 4291 4292 ill = (ipif ? 
ipif->ipif_ill : NULL); 4293 if (ill != NULL) { 4294 /* 4295 * only one ncec is possible 4296 */ 4297 nce = nce_lookup_v4(ill, addr); 4298 if (nce != NULL) { 4299 ncec = nce->nce_common; 4300 mutex_enter(&ncec->ncec_lock); 4301 if (NCE_ISREACHABLE(ncec)) 4302 new_state = ND_UNCHANGED; 4303 else 4304 new_state = ND_STALE; 4305 ncec->ncec_flags = flags; 4306 nce_update(ncec, new_state, hwaddr); 4307 mutex_exit(&ncec->ncec_lock); 4308 nce_refrele(nce); 4309 return; 4310 } 4311 } else { 4312 /* 4313 * ill is wildcard; clean up all ncec's and ire's 4314 * that match on addr. 4315 */ 4316 nce_hw_map_t hwm; 4317 4318 hwm.hwm_addr = *addr; 4319 hwm.hwm_hwlen = hwaddr_len; 4320 hwm.hwm_hwaddr = hwaddr; 4321 hwm.hwm_flags = flags; 4322 4323 ncec_walk_common(ipst->ips_ndp4, NULL, 4324 (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE); 4325 } 4326 } 4327 4328 /* 4329 * Common function to add ncec entries. 4330 * we always add the ncec with ncec_ill == ill, and always create 4331 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the 4332 * ncec is !reachable. 4333 * 4334 * When the caller passes in an nce_state of ND_UNCHANGED, 4335 * nce_add_common() will determine the state of the created nce based 4336 * on the ill_net_type and nce_flags used. Otherwise, the nce will 4337 * be created with state set to the passed in nce_state. 
4338 */ 4339 static int 4340 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 4341 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) 4342 { 4343 static ncec_t nce_nil; 4344 uchar_t *template = NULL; 4345 int err; 4346 ncec_t *ncec; 4347 ncec_t **ncep; 4348 ip_stack_t *ipst = ill->ill_ipst; 4349 uint16_t state; 4350 boolean_t fastprobe = B_FALSE; 4351 struct ndp_g_s *ndp; 4352 nce_t *nce = NULL; 4353 mblk_t *dlur_mp = NULL; 4354 4355 if (ill->ill_isv6) 4356 ndp = ill->ill_ipst->ips_ndp6; 4357 else 4358 ndp = ill->ill_ipst->ips_ndp4; 4359 4360 *retnce = NULL; 4361 4362 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 4363 4364 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 4365 ip0dbg(("nce_add_common: no addr\n")); 4366 return (EINVAL); 4367 } 4368 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 4369 ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); 4370 return (EINVAL); 4371 } 4372 4373 if (ill->ill_isv6) { 4374 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 4375 } else { 4376 ipaddr_t v4addr; 4377 4378 IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 4379 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); 4380 } 4381 4382 /* 4383 * The caller has ensured that there is no nce on ill, but there could 4384 * still be an nce_common_t for the address, so that we find exisiting 4385 * ncec_t strucutures first, and atomically add a new nce_t if 4386 * one is found. The ndp_g_lock ensures that we don't cross threads 4387 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not 4388 * compare for matches across the illgrp because this function is 4389 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, 4390 * with the nce_lookup_then_add_v* passing in the ipmp_ill where 4391 * appropriate. 
4392 */ 4393 ncec = *ncep; 4394 for (; ncec != NULL; ncec = ncec->ncec_next) { 4395 if (ncec->ncec_ill == ill) { 4396 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 4397 *retnce = nce_ill_lookup_then_add(ill, ncec); 4398 if (*retnce != NULL) 4399 break; 4400 } 4401 } 4402 } 4403 if (*retnce != NULL) { 4404 /* 4405 * We should never find *retnce to be MYADDR, since the caller 4406 * may then incorrectly restart a DAD timer that's already 4407 * running. 4408 */ 4409 ASSERT(!NCE_MYADDR(ncec)); 4410 /* caller must trigger fastpath on nce */ 4411 return (0); 4412 } 4413 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); 4414 if (ncec == NULL) 4415 return (ENOMEM); 4416 *ncec = nce_nil; 4417 ncec->ncec_ill = ill; 4418 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 4419 ncec->ncec_flags = flags; 4420 ncec->ncec_ipst = ipst; /* No netstack_hold */ 4421 4422 if (!ill->ill_isv6) { 4423 ipaddr_t addr4; 4424 4425 /* 4426 * DAD probe interval and probe count are set based on 4427 * fast/slow probe settings. If the underlying link doesn't 4428 * have reliably up/down notifications or if we're working 4429 * with IPv4 169.254.0.0/16 Link Local Address space, then 4430 * don't use the fast timers. Otherwise, use them. 
4431 */ 4432 ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 4433 IN6_V4MAPPED_TO_IPADDR(addr, addr4); 4434 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) 4435 fastprobe = B_TRUE; 4436 if (fastprobe) { 4437 ncec->ncec_xmit_interval = 4438 ipst->ips_arp_fastprobe_interval; 4439 ncec->ncec_pcnt = 4440 ipst->ips_arp_fastprobe_count; 4441 ncec->ncec_flags |= NCE_F_FAST; 4442 } else { 4443 ncec->ncec_xmit_interval = 4444 ipst->ips_arp_probe_interval; 4445 ncec->ncec_pcnt = 4446 ipst->ips_arp_probe_count; 4447 } 4448 if (NCE_PUBLISH(ncec)) { 4449 ncec->ncec_unsolicit_count = 4450 ipst->ips_ip_arp_publish_count; 4451 } 4452 } else { 4453 /* 4454 * probe interval is constant: ILL_PROBE_INTERVAL 4455 * probe count is constant: ND_MAX_UNICAST_SOLICIT 4456 */ 4457 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 4458 if (NCE_PUBLISH(ncec)) { 4459 ncec->ncec_unsolicit_count = 4460 ipst->ips_ip_ndp_unsolicit_count; 4461 } 4462 } 4463 ncec->ncec_rcnt = ill->ill_xmit_count; 4464 ncec->ncec_addr = *addr; 4465 ncec->ncec_qd_mp = NULL; 4466 ncec->ncec_refcnt = 1; /* for ncec getting created */ 4467 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); 4468 ncec->ncec_trace_disable = B_FALSE; 4469 4470 /* 4471 * ncec_lladdr holds link layer address 4472 */ 4473 if (hw_addr_len > 0) { 4474 template = kmem_alloc(hw_addr_len, KM_NOSLEEP); 4475 if (template == NULL) { 4476 err = ENOMEM; 4477 goto err_ret; 4478 } 4479 ncec->ncec_lladdr = template; 4480 ncec->ncec_lladdr_length = hw_addr_len; 4481 bzero(ncec->ncec_lladdr, hw_addr_len); 4482 } 4483 if ((flags & NCE_F_BCAST) != 0) { 4484 state = ND_REACHABLE; 4485 ASSERT(hw_addr_len > 0); 4486 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 4487 state = ND_INITIAL; 4488 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 4489 /* 4490 * NORESOLVER entries are always created in the REACHABLE 4491 * state. 
4492 */ 4493 state = ND_REACHABLE; 4494 if (ill->ill_phys_addr_length == IP_ADDR_LEN && 4495 ill->ill_mactype != DL_IPV4 && 4496 ill->ill_mactype != DL_6TO4) { 4497 /* 4498 * We create a nce_res_mp with the IP nexthop address 4499 * as the destination address if the physical length 4500 * is exactly 4 bytes for point-to-multipoint links 4501 * that do their own resolution from IP to link-layer 4502 * address (e.g. IP over X.25). 4503 */ 4504 bcopy((uchar_t *)addr, 4505 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4506 } 4507 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && 4508 ill->ill_mactype != DL_IPV6) { 4509 /* 4510 * We create a nce_res_mp with the IP nexthop address 4511 * as the destination address if the physical legnth 4512 * is exactly 16 bytes for point-to-multipoint links 4513 * that do their own resolution from IP to link-layer 4514 * address. 4515 */ 4516 bcopy((uchar_t *)addr, 4517 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4518 } 4519 /* 4520 * Since NUD is not part of the base IPv4 protocol definition, 4521 * IPv4 neighbor entries on NORESOLVER interfaces will never 4522 * age, and are marked NCE_F_NONUD. 4523 */ 4524 if (!ill->ill_isv6) 4525 ncec->ncec_flags |= NCE_F_NONUD; 4526 } else if (ill->ill_net_type == IRE_LOOPBACK) { 4527 state = ND_REACHABLE; 4528 } 4529 4530 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { 4531 /* 4532 * We are adding an ncec with a deterministic hw_addr, 4533 * so the state can only be one of {REACHABLE, STALE, PROBE}. 4534 * 4535 * if we are adding a unicast ncec for the local address 4536 * it would be REACHABLE; we would be adding a ND_STALE entry 4537 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own 4538 * addresses are added in PROBE to trigger DAD. 
4539 */ 4540 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || 4541 ill->ill_net_type == IRE_IF_NORESOLVER) 4542 state = ND_REACHABLE; 4543 else if (!NCE_PUBLISH(ncec)) 4544 state = ND_STALE; 4545 else 4546 state = ND_PROBE; 4547 if (hw_addr != NULL) 4548 nce_set_ll(ncec, hw_addr); 4549 } 4550 /* caller overrides internally computed state */ 4551 if (nce_state != ND_UNCHANGED) 4552 state = nce_state; 4553 4554 if (state == ND_PROBE) 4555 ncec->ncec_flags |= NCE_F_UNVERIFIED; 4556 4557 ncec->ncec_state = state; 4558 4559 if (state == ND_REACHABLE) { 4560 ncec->ncec_last = ncec->ncec_init_time = 4561 TICK_TO_MSEC(ddi_get_lbolt64()); 4562 } else { 4563 ncec->ncec_last = 0; 4564 if (state == ND_INITIAL) 4565 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64()); 4566 } 4567 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), 4568 offsetof(ncec_cb_t, ncec_cb_node)); 4569 /* 4570 * have all the memory allocations out of the way before taking locks 4571 * and adding the nce. 4572 */ 4573 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4574 if (nce == NULL) { 4575 err = ENOMEM; 4576 goto err_ret; 4577 } 4578 if (ncec->ncec_lladdr != NULL || 4579 ill->ill_net_type == IRE_IF_NORESOLVER) { 4580 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4581 ill->ill_phys_addr_length, ill->ill_sap, 4582 ill->ill_sap_length); 4583 if (dlur_mp == NULL) { 4584 err = ENOMEM; 4585 goto err_ret; 4586 } 4587 } 4588 4589 /* 4590 * Atomically ensure that the ill is not CONDEMNED, before 4591 * adding the NCE. 
4592 */ 4593 mutex_enter(&ill->ill_lock); 4594 if (ill->ill_state_flags & ILL_CONDEMNED) { 4595 mutex_exit(&ill->ill_lock); 4596 err = EINVAL; 4597 goto err_ret; 4598 } 4599 if (!NCE_MYADDR(ncec) && 4600 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { 4601 mutex_exit(&ill->ill_lock); 4602 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); 4603 err = EINVAL; 4604 goto err_ret; 4605 } 4606 /* 4607 * Acquire the ncec_lock even before adding the ncec to the list 4608 * so that it cannot get deleted after the ncec is added, but 4609 * before we add the nce. 4610 */ 4611 mutex_enter(&ncec->ncec_lock); 4612 if ((ncec->ncec_next = *ncep) != NULL) 4613 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; 4614 *ncep = ncec; 4615 ncec->ncec_ptpn = ncep; 4616 4617 /* Bump up the number of ncec's referencing this ill */ 4618 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4619 (char *), "ncec", (void *), ncec); 4620 ill->ill_ncec_cnt++; 4621 /* 4622 * Since we hold the ncec_lock at this time, the ncec cannot be 4623 * condemned, and we can safely add the nce. 4624 */ 4625 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); 4626 mutex_exit(&ncec->ncec_lock); 4627 mutex_exit(&ill->ill_lock); 4628 4629 /* caller must trigger fastpath on *retnce */ 4630 return (0); 4631 4632 err_ret: 4633 if (ncec != NULL) 4634 kmem_cache_free(ncec_cache, ncec); 4635 if (nce != NULL) 4636 kmem_cache_free(nce_cache, nce); 4637 freemsg(dlur_mp); 4638 if (template != NULL) 4639 kmem_free(template, ill->ill_phys_addr_length); 4640 return (err); 4641 } 4642 4643 /* 4644 * take a ref on the nce 4645 */ 4646 void 4647 nce_refhold(nce_t *nce) 4648 { 4649 mutex_enter(&nce->nce_lock); 4650 nce->nce_refcnt++; 4651 ASSERT((nce)->nce_refcnt != 0); 4652 mutex_exit(&nce->nce_lock); 4653 } 4654 4655 /* 4656 * release a ref on the nce; In general, this 4657 * cannot be called with locks held because nce_inactive 4658 * may result in nce_inactive which will take the ill_lock, 4659 * do ipif_ill_refrele_tail etc. 
Thus the one exception 4660 * where this can be called with locks held is when the caller 4661 * is certain that the nce_refcnt is sufficient to prevent 4662 * the invocation of nce_inactive. 4663 */ 4664 void 4665 nce_refrele(nce_t *nce) 4666 { 4667 ASSERT((nce)->nce_refcnt != 0); 4668 mutex_enter(&nce->nce_lock); 4669 if (--nce->nce_refcnt == 0) 4670 nce_inactive(nce); /* destroys the mutex */ 4671 else 4672 mutex_exit(&nce->nce_lock); 4673 } 4674 4675 /* 4676 * free the nce after all refs have gone away. 4677 */ 4678 static void 4679 nce_inactive(nce_t *nce) 4680 { 4681 ill_t *ill = nce->nce_ill; 4682 4683 ASSERT(nce->nce_refcnt == 0); 4684 4685 ncec_refrele_notr(nce->nce_common); 4686 nce->nce_common = NULL; 4687 freemsg(nce->nce_fp_mp); 4688 freemsg(nce->nce_dlur_mp); 4689 4690 mutex_enter(&ill->ill_lock); 4691 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 4692 (char *), "nce", (void *), nce); 4693 ill->ill_nce_cnt--; 4694 nce->nce_ill = NULL; 4695 /* 4696 * If the number of ncec's associated with this ill have dropped 4697 * to zero, check whether we need to restart any operation that 4698 * is waiting for this to happen. 4699 */ 4700 if (ILL_DOWN_OK(ill)) { 4701 /* ipif_ill_refrele_tail drops the ill_lock */ 4702 ipif_ill_refrele_tail(ill); 4703 } else { 4704 mutex_exit(&ill->ill_lock); 4705 } 4706 4707 mutex_destroy(&nce->nce_lock); 4708 kmem_cache_free(nce_cache, nce); 4709 } 4710 4711 /* 4712 * Add an nce to the ill_nce list. 
4713 */ 4714 static nce_t * 4715 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp) 4716 { 4717 bzero(nce, sizeof (*nce)); 4718 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); 4719 nce->nce_common = ncec; 4720 nce->nce_addr = ncec->ncec_addr; 4721 nce->nce_ill = ill; 4722 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4723 (char *), "nce", (void *), nce); 4724 ill->ill_nce_cnt++; 4725 4726 nce->nce_refcnt = 1; /* for the thread */ 4727 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */ 4728 nce->nce_dlur_mp = dlur_mp; 4729 4730 /* add nce to the ill's fastpath list. */ 4731 nce->nce_refcnt++; /* for the list */ 4732 list_insert_head(&ill->ill_nce, nce); 4733 return (nce); 4734 } 4735 4736 static nce_t * 4737 nce_add(ill_t *ill, ncec_t *ncec) 4738 { 4739 nce_t *nce; 4740 mblk_t *dlur_mp = NULL; 4741 4742 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4743 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 4744 4745 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4746 if (nce == NULL) 4747 return (NULL); 4748 if (ncec->ncec_lladdr != NULL || 4749 ill->ill_net_type == IRE_IF_NORESOLVER) { 4750 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4751 ill->ill_phys_addr_length, ill->ill_sap, 4752 ill->ill_sap_length); 4753 if (dlur_mp == NULL) { 4754 kmem_cache_free(nce_cache, nce); 4755 return (NULL); 4756 } 4757 } 4758 return (nce_add_impl(ill, ncec, nce, dlur_mp)); 4759 } 4760 4761 /* 4762 * remove the nce from the ill_faspath list 4763 */ 4764 void 4765 nce_delete(nce_t *nce) 4766 { 4767 ill_t *ill = nce->nce_ill; 4768 4769 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4770 4771 mutex_enter(&nce->nce_lock); 4772 if (nce->nce_is_condemned) { 4773 /* 4774 * some other thread has removed this nce from the ill_nce list 4775 */ 4776 mutex_exit(&nce->nce_lock); 4777 return; 4778 } 4779 nce->nce_is_condemned = B_TRUE; 4780 mutex_exit(&nce->nce_lock); 4781 4782 list_remove(&ill->ill_nce, nce); 4783 /* 4784 * even though we are holding the ill_lock, it is ok to 4785 * call nce_refrele 
here because we know that we should have 4786 * at least 2 refs on the nce: one for the thread, and one 4787 * for the list. The refrele below will release the one for 4788 * the list. 4789 */ 4790 nce_refrele(nce); 4791 } 4792 4793 nce_t * 4794 nce_lookup(ill_t *ill, const in6_addr_t *addr) 4795 { 4796 nce_t *nce = NULL; 4797 4798 ASSERT(ill != NULL); 4799 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4800 4801 for (nce = list_head(&ill->ill_nce); nce != NULL; 4802 nce = list_next(&ill->ill_nce, nce)) { 4803 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr)) 4804 break; 4805 } 4806 4807 /* 4808 * if we found the nce on the ill_nce list while holding 4809 * the ill_lock, then it cannot be condemned yet. 4810 */ 4811 if (nce != NULL) { 4812 ASSERT(!nce->nce_is_condemned); 4813 nce_refhold(nce); 4814 } 4815 return (nce); 4816 } 4817 4818 /* 4819 * Walk the ill_nce list on ill. The callback function func() cannot perform 4820 * any destructive actions. 4821 */ 4822 static void 4823 nce_walk_common(ill_t *ill, pfi_t func, void *arg) 4824 { 4825 nce_t *nce = NULL, *nce_next; 4826 4827 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4828 for (nce = list_head(&ill->ill_nce); nce != NULL; ) { 4829 nce_next = list_next(&ill->ill_nce, nce); 4830 if (func(ill, nce, arg) != 0) 4831 break; 4832 nce = nce_next; 4833 } 4834 } 4835 4836 void 4837 nce_walk(ill_t *ill, pfi_t func, void *arg) 4838 { 4839 mutex_enter(&ill->ill_lock); 4840 nce_walk_common(ill, func, arg); 4841 mutex_exit(&ill->ill_lock); 4842 } 4843 4844 void 4845 nce_flush(ill_t *ill, boolean_t flushall) 4846 { 4847 nce_t *nce, *nce_next; 4848 list_t dead; 4849 4850 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); 4851 mutex_enter(&ill->ill_lock); 4852 for (nce = list_head(&ill->ill_nce); nce != NULL; ) { 4853 nce_next = list_next(&ill->ill_nce, nce); 4854 if (!flushall && NCE_PUBLISH(nce->nce_common)) { 4855 nce = nce_next; 4856 continue; 4857 } 4858 /* 4859 * nce_delete requires that the caller should either not 4860 * be 
holding locks, or should hold a ref to ensure that 4861 * we wont hit ncec_inactive. So take a ref and clean up 4862 * after the list is flushed. 4863 */ 4864 nce_refhold(nce); 4865 nce_delete(nce); 4866 list_insert_tail(&dead, nce); 4867 nce = nce_next; 4868 } 4869 mutex_exit(&ill->ill_lock); 4870 while ((nce = list_head(&dead)) != NULL) { 4871 list_remove(&dead, nce); 4872 nce_refrele(nce); 4873 } 4874 ASSERT(list_is_empty(&dead)); 4875 list_destroy(&dead); 4876 } 4877 4878 /* Return an interval that is anywhere in the [1 .. intv] range */ 4879 static clock_t 4880 nce_fuzz_interval(clock_t intv, boolean_t initial_time) 4881 { 4882 clock_t rnd, frac; 4883 4884 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); 4885 /* Note that clock_t is signed; must chop off bits */ 4886 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; 4887 if (initial_time) { 4888 if (intv <= 0) 4889 intv = 1; 4890 else 4891 intv = (rnd % intv) + 1; 4892 } else { 4893 /* Compute 'frac' as 20% of the configured interval */ 4894 if ((frac = intv / 5) <= 1) 4895 frac = 2; 4896 /* Set intv randomly in the range [intv-frac .. 
intv+frac] */ 4897 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) 4898 intv = 1; 4899 } 4900 return (intv); 4901 } 4902 4903 void 4904 nce_resolv_ipmp_ok(ncec_t *ncec) 4905 { 4906 mblk_t *mp; 4907 uint_t pkt_len; 4908 iaflags_t ixaflags = IXAF_NO_TRACE; 4909 nce_t *under_nce; 4910 ill_t *ill = ncec->ncec_ill; 4911 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 4912 ipif_t *src_ipif = NULL; 4913 ip_stack_t *ipst = ill->ill_ipst; 4914 ill_t *send_ill; 4915 uint_t nprobes; 4916 4917 ASSERT(IS_IPMP(ill)); 4918 4919 mutex_enter(&ncec->ncec_lock); 4920 nprobes = ncec->ncec_nprobes; 4921 mp = ncec->ncec_qd_mp; 4922 ncec->ncec_qd_mp = NULL; 4923 ncec->ncec_nprobes = 0; 4924 mutex_exit(&ncec->ncec_lock); 4925 4926 while (mp != NULL) { 4927 mblk_t *nxt_mp; 4928 4929 nxt_mp = mp->b_next; 4930 mp->b_next = NULL; 4931 if (isv6) { 4932 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 4933 4934 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 4935 src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src, 4936 ill, ALL_ZONES, ipst); 4937 } else { 4938 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4939 4940 ixaflags |= IXAF_IS_IPV4; 4941 pkt_len = ntohs(ipha->ipha_length); 4942 src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src, 4943 ill, ALL_ZONES, ipst); 4944 } 4945 4946 /* 4947 * find a new nce based on an under_ill. The first IPMP probe 4948 * packet gets queued, so we could still find a src_ipif that 4949 * matches an IPMP test address. 4950 */ 4951 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) { 4952 /* 4953 * if src_ipif is null, this could be either a 4954 * forwarded packet or a probe whose src got deleted. 4955 * We identify the former case by looking for the 4956 * ncec_nprobes: the first ncec_nprobes packets are 4957 * probes; 4958 */ 4959 if (src_ipif == NULL && nprobes > 0) 4960 goto drop_pkt; 4961 4962 /* 4963 * For forwarded packets, we use the ipmp rotor 4964 * to find send_ill. 
4965 */ 4966 send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, 4967 B_TRUE); 4968 } else { 4969 send_ill = src_ipif->ipif_ill; 4970 ill_refhold(send_ill); 4971 } 4972 4973 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp, 4974 (ncec_t *), ncec, (ipif_t *), 4975 src_ipif, (ill_t *), send_ill); 4976 4977 if (send_ill == NULL) { 4978 if (src_ipif != NULL) 4979 ipif_refrele(src_ipif); 4980 goto drop_pkt; 4981 } 4982 /* create an under_nce on send_ill */ 4983 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4984 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill)) 4985 under_nce = nce_fastpath_create(send_ill, ncec); 4986 else 4987 under_nce = NULL; 4988 rw_exit(&ipst->ips_ill_g_lock); 4989 if (under_nce != NULL && NCE_ISREACHABLE(ncec)) 4990 nce_fastpath_trigger(under_nce); 4991 4992 ill_refrele(send_ill); 4993 if (src_ipif != NULL) 4994 ipif_refrele(src_ipif); 4995 4996 if (under_nce != NULL) { 4997 (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0, 4998 ALL_ZONES, 0, NULL); 4999 nce_refrele(under_nce); 5000 if (nprobes > 0) 5001 nprobes--; 5002 mp = nxt_mp; 5003 continue; 5004 } 5005 drop_pkt: 5006 if (isv6) { 5007 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 5008 } else { 5009 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 5010 } 5011 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL); 5012 freemsg(mp); 5013 if (nprobes > 0) 5014 nprobes--; 5015 mp = nxt_mp; 5016 } 5017 ncec_cb_dispatch(ncec); /* complete callbacks */ 5018 } 5019