1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/stream.h> 28 #include <sys/stropts.h> 29 #include <sys/strsun.h> 30 #include <sys/sysmacros.h> 31 #include <sys/errno.h> 32 #include <sys/dlpi.h> 33 #include <sys/socket.h> 34 #include <sys/ddi.h> 35 #include <sys/sunddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/debug.h> 38 #include <sys/vtrace.h> 39 #include <sys/kmem.h> 40 #include <sys/zone.h> 41 #include <sys/ethernet.h> 42 #include <sys/sdt.h> 43 #include <sys/mac.h> 44 45 #include <net/if.h> 46 #include <net/if_types.h> 47 #include <net/if_dl.h> 48 #include <net/route.h> 49 #include <netinet/in.h> 50 #include <netinet/ip6.h> 51 #include <netinet/icmp6.h> 52 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/nd.h> 57 #include <inet/ip.h> 58 #include <inet/ip_impl.h> 59 #include <inet/ipclassifier.h> 60 #include <inet/ip_if.h> 61 #include <inet/ip_ire.h> 62 #include <inet/ip_rts.h> 63 #include <inet/ip6.h> 64 #include <inet/ip_ndp.h> 65 #include <inet/sctp_ip.h> 66 
#include <inet/ip_arp.h>
#include <inet/ip2mac_impl.h>

/*
 * Announcement and defense intervals, selected by address family.
 * Both macros expand to a reference to a variable named `ipst`, which must
 * be in scope wherever they are used.  The argument is parenthesized so
 * that an arbitrary expression can be passed safely.
 */
#define	ANNOUNCE_INTERVAL(isv6)	\
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

#define	DEFENSE_INTERVAL(isv6)	\
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * The argument is parenthesized before the cast so that an expression such
 * as `p + 1` is evaluated in the caller's pointer type first and only then
 * reinterpreted as a byte pointer.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
	((uchar_t *)(ptr))[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK \
	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)

/*
 * Lock ordering:
 *
 *	ndp_g_lock -> ill_lock -> ncec_lock
 *
 * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
 * ncec_next.  ncec_lock protects the contents of the NCE (particularly
 * ncec_refcnt).
108 */ 109 110 static void nce_cleanup_list(ncec_t *ncec); 111 static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr); 112 static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *, 113 ncec_t *); 114 static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *); 115 static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr, 116 uint16_t ncec_flags, nce_t **newnce); 117 static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 118 uint16_t ncec_flags, nce_t **newnce); 119 static boolean_t ndp_xmit(ill_t *ill, uint32_t operation, 120 uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender, 121 const in6_addr_t *target, int flag); 122 static void ncec_refhold_locked(ncec_t *); 123 static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); 124 static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); 125 static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 126 uint16_t, uint16_t, nce_t **); 127 static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *); 128 static nce_t *nce_add(ill_t *, ncec_t *); 129 static void nce_inactive(nce_t *); 130 extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); 131 static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); 132 static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, 133 uint16_t, uint16_t, nce_t **); 134 static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *, 135 uint16_t, uint16_t, nce_t **); 136 static int nce_add_v6_postprocess(nce_t *); 137 static int nce_add_v4_postprocess(nce_t *); 138 static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *); 139 static clock_t nce_fuzz_interval(clock_t, boolean_t); 140 static void nce_resolv_ipmp_ok(ncec_t *); 141 static void nce_walk_common(ill_t *, pfi_t, void *); 142 static void nce_start_timer(ncec_t *, uint_t); 143 static nce_t *nce_fastpath_create(ill_t *, ncec_t *); 144 static void nce_fastpath_trigger(nce_t *); 145 static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *); 146 147 #ifdef DEBUG 
148 static void ncec_trace_cleanup(const ncec_t *); 149 #endif 150 151 #define NCE_HASH_PTR_V4(ipst, addr) \ 152 (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])) 153 154 #define NCE_HASH_PTR_V6(ipst, addr) \ 155 (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ 156 NCE_TABLE_SIZE)])) 157 158 extern kmem_cache_t *ncec_cache; 159 extern kmem_cache_t *nce_cache; 160 161 /* 162 * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe 163 * If src_ill is not null, the ncec_addr is bound to src_ill. The 164 * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where 165 * the probe is sent on the ncec_ill (in the non-IPMP case) or the 166 * IPMP cast_ill (in the IPMP case). 167 * 168 * Note that the probe interval is based on ncec->ncec_ill which 169 * may be the ipmp_ill. 170 */ 171 static void 172 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) 173 { 174 boolean_t dropped; 175 uint32_t probe_interval; 176 177 ASSERT(!(ncec->ncec_flags & NCE_F_MCAST)); 178 ASSERT(!(ncec->ncec_flags & NCE_F_BCAST)); 179 if (ncec->ncec_ipversion == IPV6_VERSION) { 180 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 181 ncec->ncec_lladdr, ncec->ncec_lladdr_length, 182 &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); 183 probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill); 184 } else { 185 /* IPv4 DAD delay the initial probe. */ 186 if (send_probe) 187 dropped = arp_probe(ncec); 188 else 189 dropped = B_TRUE; 190 probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval, 191 !send_probe); 192 } 193 if (!dropped) { 194 mutex_enter(&ncec->ncec_lock); 195 ncec->ncec_pcnt--; 196 mutex_exit(&ncec->ncec_lock); 197 } 198 nce_restart_timer(ncec, probe_interval); 199 } 200 201 /* 202 * Compute default flags to use for an advertisement of this ncec's address. 
203 */ 204 static int 205 nce_advert_flags(const ncec_t *ncec) 206 { 207 int flag = 0; 208 209 if (ncec->ncec_flags & NCE_F_ISROUTER) 210 flag |= NDP_ISROUTER; 211 if (!(ncec->ncec_flags & NCE_F_ANYCAST)) 212 flag |= NDP_ORIDE; 213 214 return (flag); 215 } 216 217 /* 218 * NDP Cache Entry creation routine. 219 * This routine must always be called with ndp6->ndp_g_lock held. 220 */ 221 int 222 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 223 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 224 { 225 int err; 226 nce_t *nce; 227 228 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 229 ASSERT(ill != NULL && ill->ill_isv6); 230 231 err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state, 232 &nce); 233 if (err != 0) 234 return (err); 235 ASSERT(newnce != NULL); 236 *newnce = nce; 237 return (err); 238 } 239 240 /* 241 * Post-processing routine to be executed after nce_add_v6(). This function 242 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 243 * and must be called without any locks held. 244 */ 245 int 246 nce_add_v6_postprocess(nce_t *nce) 247 { 248 ncec_t *ncec = nce->nce_common; 249 boolean_t dropped = B_FALSE; 250 uchar_t *hw_addr = ncec->ncec_lladdr; 251 uint_t hw_addr_len = ncec->ncec_lladdr_length; 252 ill_t *ill = ncec->ncec_ill; 253 int err = 0; 254 uint16_t flags = ncec->ncec_flags; 255 ip_stack_t *ipst = ill->ill_ipst; 256 boolean_t trigger_fastpath = B_TRUE; 257 258 /* 259 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 260 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
261 * We call nce_fastpath from nce_update if the link layer address of 262 * the peer changes from nce_update 263 */ 264 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || 265 (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER)) 266 trigger_fastpath = B_FALSE; 267 268 if (trigger_fastpath) 269 nce_fastpath_trigger(nce); 270 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 271 ill_t *hwaddr_ill; 272 /* 273 * Unicast entry that needs DAD. 274 */ 275 if (IS_IPMP(ill)) { 276 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 277 hw_addr, hw_addr_len); 278 } else { 279 hwaddr_ill = ill; 280 } 281 nce_dad(ncec, hwaddr_ill, B_TRUE); 282 err = EINPROGRESS; 283 } else if (flags & NCE_F_UNSOL_ADV) { 284 /* 285 * We account for the transmit below by assigning one 286 * less than the ndd variable. Subsequent decrements 287 * are done in nce_timer. 288 */ 289 mutex_enter(&ncec->ncec_lock); 290 ncec->ncec_unsolicit_count = 291 ipst->ips_ip_ndp_unsolicit_count - 1; 292 mutex_exit(&ncec->ncec_lock); 293 dropped = ndp_xmit(ill, 294 ND_NEIGHBOR_ADVERT, 295 hw_addr, 296 hw_addr_len, 297 &ncec->ncec_addr, /* Source and target of the adv */ 298 &ipv6_all_hosts_mcast, /* Destination of the packet */ 299 nce_advert_flags(ncec)); 300 mutex_enter(&ncec->ncec_lock); 301 if (dropped) 302 ncec->ncec_unsolicit_count++; 303 else 304 ncec->ncec_last_time_defended = ddi_get_lbolt(); 305 if (ncec->ncec_unsolicit_count != 0) { 306 nce_start_timer(ncec, 307 ipst->ips_ip_ndp_unsolicit_interval); 308 } 309 mutex_exit(&ncec->ncec_lock); 310 } 311 return (err); 312 } 313 314 /* 315 * Atomically lookup and add (if needed) Neighbor Cache information for 316 * an address. 317 * 318 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 319 * are always added pointing at the ipmp_ill. Thus, when the ill passed 320 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 321 * entries will be created, both pointing at the same ncec_t. 
The nce_t 322 * entries will have their nce_ill set to the ipmp_ill and the under_ill 323 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 324 * Local addresses are always created on the ill passed to nce_add_v6. 325 */ 326 int 327 nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 328 const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 329 { 330 int err = 0; 331 ip_stack_t *ipst = ill->ill_ipst; 332 nce_t *nce, *upper_nce = NULL; 333 ill_t *in_ill = ill; 334 boolean_t need_ill_refrele = B_FALSE; 335 336 if (flags & NCE_F_MCAST) { 337 /* 338 * hw_addr will be figured out in nce_set_multicast_v6; 339 * caller has to select the cast_ill 340 */ 341 ASSERT(hw_addr == NULL); 342 ASSERT(!IS_IPMP(ill)); 343 err = nce_set_multicast_v6(ill, addr, flags, newnce); 344 return (err); 345 } 346 ASSERT(ill->ill_isv6); 347 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 348 ill = ipmp_ill_hold_ipmp_ill(ill); 349 if (ill == NULL) 350 return (ENXIO); 351 need_ill_refrele = B_TRUE; 352 } 353 354 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 355 nce = nce_lookup_addr(ill, addr); 356 if (nce == NULL) { 357 err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, 358 &nce); 359 } else { 360 err = EEXIST; 361 } 362 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 363 if (err == 0) 364 err = nce_add_v6_postprocess(nce); 365 if (in_ill != ill && nce != NULL) { 366 nce_t *under_nce = NULL; 367 368 /* 369 * in_ill was the under_ill. Try to create the under_nce. 370 * Hold the ill_g_lock to prevent changes to group membership 371 * until we are done. 
372 */ 373 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 374 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 375 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 376 ill_t *, ill); 377 rw_exit(&ipst->ips_ill_g_lock); 378 err = ENXIO; 379 nce_refrele(nce); 380 nce = NULL; 381 goto bail; 382 } 383 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 384 if (under_nce == NULL) { 385 rw_exit(&ipst->ips_ill_g_lock); 386 err = EINVAL; 387 nce_refrele(nce); 388 nce = NULL; 389 goto bail; 390 } 391 rw_exit(&ipst->ips_ill_g_lock); 392 upper_nce = nce; 393 nce = under_nce; /* will be returned to caller */ 394 if (NCE_ISREACHABLE(nce->nce_common)) 395 nce_fastpath_trigger(under_nce); 396 } 397 /* nce_refrele is deferred until the lock is dropped */ 398 if (nce != NULL) { 399 if (newnce != NULL) 400 *newnce = nce; 401 else 402 nce_refrele(nce); 403 } 404 bail: 405 if (upper_nce != NULL) 406 nce_refrele(upper_nce); 407 if (need_ill_refrele) 408 ill_refrele(ill); 409 return (err); 410 } 411 412 /* 413 * Remove all the CONDEMNED nces from the appropriate hash table. 414 * We create a private list of NCEs, these may have ires pointing 415 * to them, so the list will be passed through to clean up dependent 416 * ires and only then we can do ncec_refrele() which can make NCE inactive. 417 */ 418 static void 419 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) 420 { 421 ncec_t *ncec1; 422 ncec_t **ptpn; 423 424 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 425 ASSERT(ndp->ndp_g_walker == 0); 426 for (; ncec; ncec = ncec1) { 427 ncec1 = ncec->ncec_next; 428 mutex_enter(&ncec->ncec_lock); 429 if (NCE_ISCONDEMNED(ncec)) { 430 ptpn = ncec->ncec_ptpn; 431 ncec1 = ncec->ncec_next; 432 if (ncec1 != NULL) 433 ncec1->ncec_ptpn = ptpn; 434 *ptpn = ncec1; 435 ncec->ncec_ptpn = NULL; 436 ncec->ncec_next = NULL; 437 ncec->ncec_next = *free_nce_list; 438 *free_nce_list = ncec; 439 } 440 mutex_exit(&ncec->ncec_lock); 441 } 442 } 443 444 /* 445 * 1. Mark the entry CONDEMNED. 
This ensures that no new nce_lookup() 446 * will return this NCE. Also no new timeouts will 447 * be started (See nce_restart_timer). 448 * 2. Cancel any currently running timeouts. 449 * 3. If there is an ndp walker, return. The walker will do the cleanup. 450 * This ensures that walkers see a consistent list of NCEs while walking. 451 * 4. Otherwise remove the NCE from the list of NCEs 452 */ 453 void 454 ncec_delete(ncec_t *ncec) 455 { 456 ncec_t **ptpn; 457 ncec_t *ncec1; 458 int ipversion = ncec->ncec_ipversion; 459 ndp_g_t *ndp; 460 ip_stack_t *ipst = ncec->ncec_ipst; 461 462 if (ipversion == IPV4_VERSION) 463 ndp = ipst->ips_ndp4; 464 else 465 ndp = ipst->ips_ndp6; 466 467 /* Serialize deletes */ 468 mutex_enter(&ncec->ncec_lock); 469 if (NCE_ISCONDEMNED(ncec)) { 470 /* Some other thread is doing the delete */ 471 mutex_exit(&ncec->ncec_lock); 472 return; 473 } 474 /* 475 * Caller has a refhold. Also 1 ref for being in the list. Thus 476 * refcnt has to be >= 2 477 */ 478 ASSERT(ncec->ncec_refcnt >= 2); 479 ncec->ncec_flags |= NCE_F_CONDEMNED; 480 mutex_exit(&ncec->ncec_lock); 481 482 /* Count how many condemned ires for kmem_cache callback */ 483 atomic_add_32(&ipst->ips_num_nce_condemned, 1); 484 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 485 486 /* Complete any waiting callbacks */ 487 ncec_cb_dispatch(ncec); 488 489 /* 490 * Cancel any running timer. Timeout can't be restarted 491 * since CONDEMNED is set. Can't hold ncec_lock across untimeout. 492 * Passing invalid timeout id is fine. 493 */ 494 if (ncec->ncec_timeout_id != 0) { 495 (void) untimeout(ncec->ncec_timeout_id); 496 ncec->ncec_timeout_id = 0; 497 } 498 499 mutex_enter(&ndp->ndp_g_lock); 500 if (ncec->ncec_ptpn == NULL) { 501 /* 502 * The last ndp walker has already removed this ncec from 503 * the list after we marked the ncec CONDEMNED and before 504 * we grabbed the global lock. 
505 */ 506 mutex_exit(&ndp->ndp_g_lock); 507 return; 508 } 509 if (ndp->ndp_g_walker > 0) { 510 /* 511 * Can't unlink. The walker will clean up 512 */ 513 ndp->ndp_g_walker_cleanup = B_TRUE; 514 mutex_exit(&ndp->ndp_g_lock); 515 return; 516 } 517 518 /* 519 * Now remove the ncec from the list. nce_restart_timer won't restart 520 * the timer since it is marked CONDEMNED. 521 */ 522 ptpn = ncec->ncec_ptpn; 523 ncec1 = ncec->ncec_next; 524 if (ncec1 != NULL) 525 ncec1->ncec_ptpn = ptpn; 526 *ptpn = ncec1; 527 ncec->ncec_ptpn = NULL; 528 ncec->ncec_next = NULL; 529 mutex_exit(&ndp->ndp_g_lock); 530 531 /* Removed from ncec_ptpn/ncec_next list */ 532 ncec_refrele_notr(ncec); 533 } 534 535 void 536 ncec_inactive(ncec_t *ncec) 537 { 538 mblk_t **mpp; 539 ill_t *ill = ncec->ncec_ill; 540 ip_stack_t *ipst = ncec->ncec_ipst; 541 542 ASSERT(ncec->ncec_refcnt == 0); 543 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 544 545 /* Count how many condemned nces for kmem_cache callback */ 546 if (NCE_ISCONDEMNED(ncec)) 547 atomic_add_32(&ipst->ips_num_nce_condemned, -1); 548 549 /* Free all allocated messages */ 550 mpp = &ncec->ncec_qd_mp; 551 while (*mpp != NULL) { 552 mblk_t *mp; 553 554 mp = *mpp; 555 *mpp = mp->b_next; 556 557 inet_freemsg(mp); 558 } 559 /* 560 * must have been cleaned up in ncec_delete 561 */ 562 ASSERT(list_is_empty(&ncec->ncec_cb)); 563 list_destroy(&ncec->ncec_cb); 564 /* 565 * free the ncec_lladdr if one was allocated in nce_add_common() 566 */ 567 if (ncec->ncec_lladdr_length > 0) 568 kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length); 569 570 #ifdef DEBUG 571 ncec_trace_cleanup(ncec); 572 #endif 573 574 mutex_enter(&ill->ill_lock); 575 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 576 (char *), "ncec", (void *), ncec); 577 ill->ill_ncec_cnt--; 578 ncec->ncec_ill = NULL; 579 /* 580 * If the number of ncec's associated with this ill have dropped 581 * to zero, check whether we need to restart any operation that 582 * is waiting for this to happen. 
583 */ 584 if (ILL_DOWN_OK(ill)) { 585 /* ipif_ill_refrele_tail drops the ill_lock */ 586 ipif_ill_refrele_tail(ill); 587 } else { 588 mutex_exit(&ill->ill_lock); 589 } 590 591 mutex_destroy(&ncec->ncec_lock); 592 kmem_cache_free(ncec_cache, ncec); 593 } 594 595 /* 596 * ncec_walk routine. Delete the ncec if it is associated with the ill 597 * that is going away. Always called as a writer. 598 */ 599 void 600 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg) 601 { 602 if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) { 603 ncec_delete(ncec); 604 } 605 } 606 607 /* 608 * Neighbor Cache cleanup logic for a list of ncec_t entries. 609 */ 610 static void 611 nce_cleanup_list(ncec_t *ncec) 612 { 613 ncec_t *ncec_next; 614 615 ASSERT(ncec != NULL); 616 while (ncec != NULL) { 617 ncec_next = ncec->ncec_next; 618 ncec->ncec_next = NULL; 619 620 /* 621 * It is possible for the last ndp walker (this thread) 622 * to come here after ncec_delete has marked the ncec CONDEMNED 623 * and before it has removed the ncec from the fastpath list 624 * or called untimeout. So we need to do it here. It is safe 625 * for both ncec_delete and this thread to do it twice or 626 * even simultaneously since each of the threads has a 627 * reference on the ncec. 628 */ 629 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 630 /* 631 * Cancel any running timer. Timeout can't be restarted 632 * since CONDEMNED is set. The ncec_lock can't be 633 * held across untimeout though passing invalid timeout 634 * id is fine. 635 */ 636 if (ncec->ncec_timeout_id != 0) { 637 (void) untimeout(ncec->ncec_timeout_id); 638 ncec->ncec_timeout_id = 0; 639 } 640 /* Removed from ncec_ptpn/ncec_next list */ 641 ncec_refrele_notr(ncec); 642 ncec = ncec_next; 643 } 644 } 645 646 /* 647 * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 
648 */ 649 boolean_t 650 nce_restart_dad(ncec_t *ncec) 651 { 652 boolean_t started; 653 ill_t *ill, *hwaddr_ill; 654 655 if (ncec == NULL) 656 return (B_FALSE); 657 ill = ncec->ncec_ill; 658 mutex_enter(&ncec->ncec_lock); 659 if (ncec->ncec_state == ND_PROBE) { 660 mutex_exit(&ncec->ncec_lock); 661 started = B_TRUE; 662 } else if (ncec->ncec_state == ND_REACHABLE) { 663 ASSERT(ncec->ncec_lladdr != NULL); 664 ncec->ncec_state = ND_PROBE; 665 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 666 /* 667 * Slight cheat here: we don't use the initial probe delay 668 * for IPv4 in this obscure case. 669 */ 670 mutex_exit(&ncec->ncec_lock); 671 if (IS_IPMP(ill)) { 672 hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, 673 ncec->ncec_lladdr, ncec->ncec_lladdr_length); 674 } else { 675 hwaddr_ill = ill; 676 } 677 nce_dad(ncec, hwaddr_ill, B_TRUE); 678 started = B_TRUE; 679 } else { 680 mutex_exit(&ncec->ncec_lock); 681 started = B_FALSE; 682 } 683 return (started); 684 } 685 686 /* 687 * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. 688 * If one is found, the refcnt on the ncec will be incremented. 689 */ 690 ncec_t * 691 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) 692 { 693 ncec_t *ncec; 694 ip_stack_t *ipst = ill->ill_ipst; 695 696 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 697 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 698 699 /* Get head of v6 hash table */ 700 ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 701 ncec = ncec_lookup_illgrp(ill, addr, ncec); 702 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 703 rw_exit(&ipst->ips_ill_g_lock); 704 return (ncec); 705 } 706 /* 707 * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. 708 * If one is found, the refcnt on the ncec will be incremented. 
709 */ 710 ncec_t * 711 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) 712 { 713 ncec_t *ncec = NULL; 714 in6_addr_t addr6; 715 ip_stack_t *ipst = ill->ill_ipst; 716 717 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 718 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 719 720 /* Get head of v4 hash table */ 721 ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); 722 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 723 ncec = ncec_lookup_illgrp(ill, &addr6, ncec); 724 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 725 rw_exit(&ipst->ips_ill_g_lock); 726 return (ncec); 727 } 728 729 /* 730 * Cache entry lookup. Try to find an ncec matching the parameters passed. 731 * If an ncec is found, increment the hold count on that ncec. 732 * The caller passes in the start of the appropriate hash table, and must 733 * be holding the appropriate global lock (ndp_g_lock). In addition, since 734 * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock 735 * must be held as reader. 736 * 737 * This function always matches across the ipmp group. 738 */ 739 ncec_t * 740 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) 741 { 742 ndp_g_t *ndp; 743 ip_stack_t *ipst = ill->ill_ipst; 744 745 if (ill->ill_isv6) 746 ndp = ipst->ips_ndp6; 747 else 748 ndp = ipst->ips_ndp4; 749 750 ASSERT(ill != NULL); 751 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 752 if (IN6_IS_ADDR_UNSPECIFIED(addr)) 753 return (NULL); 754 for (; ncec != NULL; ncec = ncec->ncec_next) { 755 if (ncec->ncec_ill == ill || 756 IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { 757 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 758 mutex_enter(&ncec->ncec_lock); 759 if (!NCE_ISCONDEMNED(ncec)) { 760 ncec_refhold_locked(ncec); 761 mutex_exit(&ncec->ncec_lock); 762 break; 763 } 764 mutex_exit(&ncec->ncec_lock); 765 } 766 } 767 } 768 return (ncec); 769 } 770 771 /* 772 * Find an nce_t on ill with nce_addr == addr. 
Lookup the nce_t 773 * entries for ill only, i.e., when ill is part of an ipmp group, 774 * nce_lookup_v4 will never try to match across the group. 775 */ 776 nce_t * 777 nce_lookup_v4(ill_t *ill, const in_addr_t *addr) 778 { 779 nce_t *nce; 780 in6_addr_t addr6; 781 ip_stack_t *ipst = ill->ill_ipst; 782 783 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 784 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 785 nce = nce_lookup_addr(ill, &addr6); 786 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 787 return (nce); 788 } 789 790 /* 791 * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t 792 * entries for ill only, i.e., when ill is part of an ipmp group, 793 * nce_lookup_v6 will never try to match across the group. 794 */ 795 nce_t * 796 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6) 797 { 798 nce_t *nce; 799 ip_stack_t *ipst = ill->ill_ipst; 800 801 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 802 nce = nce_lookup_addr(ill, addr6); 803 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 804 return (nce); 805 } 806 807 static nce_t * 808 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr) 809 { 810 nce_t *nce; 811 812 ASSERT(ill != NULL); 813 #ifdef DEBUG 814 if (ill->ill_isv6) 815 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); 816 else 817 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 818 #endif 819 mutex_enter(&ill->ill_lock); 820 nce = nce_lookup(ill, addr); 821 mutex_exit(&ill->ill_lock); 822 return (nce); 823 } 824 825 826 /* 827 * Router turned to host. We need to make sure that cached copies of the ncec 828 * are not used for forwarding packets if they were derived from the default 829 * route, and that the default route itself is removed, as required by 830 * section 7.2.5 of RFC 2461. 831 * 832 * Note that the ncec itself probably has valid link-layer information for the 833 * nexthop, so that there is no reason to delete the ncec, as long as the 834 * ISROUTER flag is turned off. 
835 */ 836 static void 837 ncec_router_to_host(ncec_t *ncec) 838 { 839 ire_t *ire; 840 ip_stack_t *ipst = ncec->ncec_ipst; 841 842 mutex_enter(&ncec->ncec_lock); 843 ncec->ncec_flags &= ~NCE_F_ISROUTER; 844 mutex_exit(&ncec->ncec_lock); 845 846 ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, 847 &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL, 848 MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL); 849 if (ire != NULL) { 850 ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); 851 ire_delete(ire); 852 ire_refrele(ire); 853 } 854 } 855 856 /* 857 * Process passed in parameters either from an incoming packet or via 858 * user ioctl. 859 */ 860 void 861 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) 862 { 863 ill_t *ill = ncec->ncec_ill; 864 uint32_t hw_addr_len = ill->ill_phys_addr_length; 865 boolean_t ll_updated = B_FALSE; 866 boolean_t ll_changed; 867 nce_t *nce; 868 869 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 870 /* 871 * No updates of link layer address or the neighbor state is 872 * allowed, when the cache is in NONUD state. This still 873 * allows for responding to reachability solicitation. 874 */ 875 mutex_enter(&ncec->ncec_lock); 876 if (ncec->ncec_state == ND_INCOMPLETE) { 877 if (hw_addr == NULL) { 878 mutex_exit(&ncec->ncec_lock); 879 return; 880 } 881 nce_set_ll(ncec, hw_addr); 882 /* 883 * Update ncec state and send the queued packets 884 * back to ip this time ire will be added. 
885 */ 886 if (flag & ND_NA_FLAG_SOLICITED) { 887 nce_update(ncec, ND_REACHABLE, NULL); 888 } else { 889 nce_update(ncec, ND_STALE, NULL); 890 } 891 mutex_exit(&ncec->ncec_lock); 892 nce = nce_fastpath(ncec, B_TRUE, NULL); 893 nce_resolv_ok(ncec); 894 if (nce != NULL) 895 nce_refrele(nce); 896 return; 897 } 898 ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len); 899 if (!is_adv) { 900 /* If this is a SOLICITATION request only */ 901 if (ll_changed) 902 nce_update(ncec, ND_STALE, hw_addr); 903 mutex_exit(&ncec->ncec_lock); 904 ncec_cb_dispatch(ncec); 905 return; 906 } 907 if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { 908 /* If in any other state than REACHABLE, ignore */ 909 if (ncec->ncec_state == ND_REACHABLE) { 910 nce_update(ncec, ND_STALE, NULL); 911 } 912 mutex_exit(&ncec->ncec_lock); 913 ncec_cb_dispatch(ncec); 914 return; 915 } else { 916 if (ll_changed) { 917 nce_update(ncec, ND_UNCHANGED, hw_addr); 918 ll_updated = B_TRUE; 919 } 920 if (flag & ND_NA_FLAG_SOLICITED) { 921 nce_update(ncec, ND_REACHABLE, NULL); 922 } else { 923 if (ll_updated) { 924 nce_update(ncec, ND_STALE, NULL); 925 } 926 } 927 mutex_exit(&ncec->ncec_lock); 928 if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags & 929 NCE_F_ISROUTER)) { 930 ncec_router_to_host(ncec); 931 } else { 932 ncec_cb_dispatch(ncec); 933 } 934 } 935 } 936 937 /* 938 * Pass arg1 to the pfi supplied, along with each ncec in existence. 939 * ncec_walk() places a REFHOLD on the ncec and drops the lock when 940 * walking the hash list. 
941 */ 942 void 943 ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, 944 boolean_t trace) 945 { 946 ncec_t *ncec; 947 ncec_t *ncec1; 948 ncec_t **ncep; 949 ncec_t *free_nce_list = NULL; 950 951 mutex_enter(&ndp->ndp_g_lock); 952 /* Prevent ncec_delete from unlink and free of NCE */ 953 ndp->ndp_g_walker++; 954 mutex_exit(&ndp->ndp_g_lock); 955 for (ncep = ndp->nce_hash_tbl; 956 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 957 for (ncec = *ncep; ncec != NULL; ncec = ncec1) { 958 ncec1 = ncec->ncec_next; 959 if (ill == NULL || ncec->ncec_ill == ill) { 960 if (trace) { 961 ncec_refhold(ncec); 962 (*pfi)(ncec, arg1); 963 ncec_refrele(ncec); 964 } else { 965 ncec_refhold_notr(ncec); 966 (*pfi)(ncec, arg1); 967 ncec_refrele_notr(ncec); 968 } 969 } 970 } 971 } 972 mutex_enter(&ndp->ndp_g_lock); 973 ndp->ndp_g_walker--; 974 if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { 975 /* Time to delete condemned entries */ 976 for (ncep = ndp->nce_hash_tbl; 977 ncep < A_END(ndp->nce_hash_tbl); ncep++) { 978 ncec = *ncep; 979 if (ncec != NULL) { 980 nce_remove(ndp, ncec, &free_nce_list); 981 } 982 } 983 ndp->ndp_g_walker_cleanup = B_FALSE; 984 } 985 986 mutex_exit(&ndp->ndp_g_lock); 987 988 if (free_nce_list != NULL) { 989 nce_cleanup_list(free_nce_list); 990 } 991 } 992 993 /* 994 * Walk everything. 995 * Note that ill can be NULL hence can't derive the ipst from it. 996 */ 997 void 998 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) 999 { 1000 ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); 1001 ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); 1002 } 1003 1004 /* 1005 * For each interface an entry is added for the unspecified multicast group. 1006 * Here that mapping is used to form the multicast cache entry for a particular 1007 * multicast destination. 
1008 */ 1009 static int 1010 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, 1011 uint16_t flags, nce_t **newnce) 1012 { 1013 uchar_t *hw_addr; 1014 int err = 0; 1015 ip_stack_t *ipst = ill->ill_ipst; 1016 nce_t *nce; 1017 1018 ASSERT(ill != NULL); 1019 ASSERT(ill->ill_isv6); 1020 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); 1021 1022 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 1023 nce = nce_lookup_addr(ill, dst); 1024 if (nce != NULL) { 1025 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1026 goto done; 1027 } 1028 if (ill->ill_net_type == IRE_IF_RESOLVER) { 1029 /* 1030 * For IRE_IF_RESOLVER a hardware mapping can be 1031 * generated. 1032 */ 1033 hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); 1034 if (hw_addr == NULL) { 1035 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1036 return (ENOMEM); 1037 } 1038 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 1039 } else { 1040 /* No hw_addr is needed for IRE_IF_NORESOLVER. */ 1041 hw_addr = NULL; 1042 } 1043 ASSERT((flags & NCE_F_MCAST) != 0); 1044 ASSERT((flags & NCE_F_NONUD) != 0); 1045 /* nce_state will be computed by nce_add_common() */ 1046 err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 1047 ND_UNCHANGED, &nce); 1048 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 1049 if (err == 0) 1050 err = nce_add_v6_postprocess(nce); 1051 if (hw_addr != NULL) 1052 kmem_free(hw_addr, ill->ill_nd_lla_len); 1053 if (err != 0) { 1054 ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); 1055 return (err); 1056 } 1057 done: 1058 ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); 1059 if (newnce != NULL) 1060 *newnce = nce; 1061 else 1062 nce_refrele(nce); 1063 return (0); 1064 } 1065 1066 /* 1067 * Return the link layer address, and any flags of a ncec. 
1068 */ 1069 int 1070 ndp_query(ill_t *ill, struct lif_nd_req *lnr) 1071 { 1072 ncec_t *ncec; 1073 in6_addr_t *addr; 1074 sin6_t *sin6; 1075 1076 ASSERT(ill != NULL && ill->ill_isv6); 1077 sin6 = (sin6_t *)&lnr->lnr_addr; 1078 addr = &sin6->sin6_addr; 1079 1080 /* 1081 * NOTE: if the ill is an IPMP interface, then match against the whole 1082 * illgrp. This e.g. allows in.ndpd to retrieve the link layer 1083 * addresses for the data addresses on an IPMP interface even though 1084 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. 1085 */ 1086 ncec = ncec_lookup_illgrp_v6(ill, addr); 1087 if (ncec == NULL) 1088 return (ESRCH); 1089 /* If no link layer address is available yet, return ESRCH */ 1090 if (!NCE_ISREACHABLE(ncec)) { 1091 ncec_refrele(ncec); 1092 return (ESRCH); 1093 } 1094 lnr->lnr_hdw_len = ill->ill_phys_addr_length; 1095 bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, 1096 lnr->lnr_hdw_len); 1097 if (ncec->ncec_flags & NCE_F_ISROUTER) 1098 lnr->lnr_flags = NDF_ISROUTER_ON; 1099 if (ncec->ncec_flags & NCE_F_ANYCAST) 1100 lnr->lnr_flags |= NDF_ANYCAST_ON; 1101 ncec_refrele(ncec); 1102 return (0); 1103 } 1104 1105 /* 1106 * Finish setting up the Enable/Disable multicast for the driver. 
1107 */ 1108 mblk_t * 1109 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, 1110 uint32_t hw_addr_offset, mblk_t *mp) 1111 { 1112 uchar_t *hw_addr; 1113 ipaddr_t v4group; 1114 uchar_t *addr; 1115 1116 ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); 1117 if (IN6_IS_ADDR_V4MAPPED(v6group)) { 1118 IN6_V4MAPPED_TO_IPADDR(v6group, v4group); 1119 1120 ASSERT(CLASSD(v4group)); 1121 ASSERT(!(ill->ill_isv6)); 1122 1123 addr = (uchar_t *)&v4group; 1124 } else { 1125 ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); 1126 ASSERT(ill->ill_isv6); 1127 1128 addr = (uchar_t *)v6group; 1129 } 1130 hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); 1131 if (hw_addr == NULL) { 1132 ip0dbg(("ndp_mcastreq NULL hw_addr\n")); 1133 freemsg(mp); 1134 return (NULL); 1135 } 1136 1137 ip_mcast_mapping(ill, addr, hw_addr); 1138 return (mp); 1139 } 1140 1141 void 1142 ip_ndp_resolve(ncec_t *ncec) 1143 { 1144 in_addr_t sender4 = INADDR_ANY; 1145 in6_addr_t sender6 = ipv6_all_zeros; 1146 ill_t *src_ill; 1147 uint32_t ms; 1148 1149 src_ill = nce_resolve_src(ncec, &sender6); 1150 if (src_ill == NULL) { 1151 /* Make sure we try again later */ 1152 ms = ncec->ncec_ill->ill_reachable_retrans_time; 1153 nce_restart_timer(ncec, (clock_t)ms); 1154 return; 1155 } 1156 if (ncec->ncec_ipversion == IPV4_VERSION) 1157 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 1158 mutex_enter(&ncec->ncec_lock); 1159 if (ncec->ncec_ipversion == IPV6_VERSION) 1160 ms = ndp_solicit(ncec, sender6, src_ill); 1161 else 1162 ms = arp_request(ncec, sender4, src_ill); 1163 mutex_exit(&ncec->ncec_lock); 1164 if (ms == 0) { 1165 if (ncec->ncec_state != ND_REACHABLE) { 1166 if (ncec->ncec_ipversion == IPV6_VERSION) 1167 ndp_resolv_failed(ncec); 1168 else 1169 arp_resolv_failed(ncec); 1170 ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); 1171 nce_make_unreachable(ncec); 1172 ncec_delete(ncec); 1173 } 1174 } else { 1175 nce_restart_timer(ncec, (clock_t)ms); 1176 } 1177 done: 1178 ill_refrele(src_ill); 1179 } 1180 
1181 /* 1182 * Send an IPv6 neighbor solicitation. 1183 * Returns number of milliseconds after which we should either rexmit or abort. 1184 * Return of zero means we should abort. 1185 * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt. 1186 * The optional source address is used as a hint to ndp_solicit for 1187 * which source to use in the packet. 1188 * 1189 * NOTE: This routine drops ncec_lock (and later reacquires it) when sending 1190 * the packet. 1191 */ 1192 uint32_t 1193 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill) 1194 { 1195 in6_addr_t dst; 1196 boolean_t dropped = B_FALSE; 1197 1198 ASSERT(ncec->ncec_ipversion == IPV6_VERSION); 1199 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 1200 1201 if (ncec->ncec_rcnt == 0) 1202 return (0); 1203 1204 dst = ncec->ncec_addr; 1205 ncec->ncec_rcnt--; 1206 mutex_exit(&ncec->ncec_lock); 1207 dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr, 1208 ill->ill_phys_addr_length, &src, &dst, 0); 1209 mutex_enter(&ncec->ncec_lock); 1210 if (dropped) 1211 ncec->ncec_rcnt++; 1212 return (ncec->ncec_ill->ill_reachable_retrans_time); 1213 } 1214 1215 /* 1216 * Attempt to recover an address on an interface that's been marked as a 1217 * duplicate. Because NCEs are destroyed when the interface goes down, there's 1218 * no easy way to just probe the address and have the right thing happen if 1219 * it's no longer in use. Instead, we just bring it up normally and allow the 1220 * regular interface start-up logic to probe for a remaining duplicate and take 1221 * us back down if necessary. 1222 * Neither DHCP nor temporary addresses arrive here; they're excluded by 1223 * ip_ndp_excl. 
1224 */ 1225 /* ARGSUSED */ 1226 void 1227 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1228 { 1229 ill_t *ill = rq->q_ptr; 1230 ipif_t *ipif; 1231 in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr; 1232 in_addr_t *addr4 = (in_addr_t *)mp->b_rptr; 1233 boolean_t addr_equal; 1234 1235 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 1236 /* 1237 * We do not support recovery of proxy ARP'd interfaces, 1238 * because the system lacks a complete proxy ARP mechanism. 1239 */ 1240 if (ill->ill_isv6) { 1241 addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 1242 addr6); 1243 } else { 1244 addr_equal = (ipif->ipif_lcl_addr == *addr4); 1245 } 1246 1247 if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal) 1248 continue; 1249 1250 /* 1251 * If we have already recovered or if the interface is going 1252 * away, then ignore. 1253 */ 1254 mutex_enter(&ill->ill_lock); 1255 if (!(ipif->ipif_flags & IPIF_DUPLICATE) || 1256 (ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1257 mutex_exit(&ill->ill_lock); 1258 continue; 1259 } 1260 1261 ipif->ipif_flags &= ~IPIF_DUPLICATE; 1262 ill->ill_ipif_dup_count--; 1263 mutex_exit(&ill->ill_lock); 1264 ipif->ipif_was_dup = B_TRUE; 1265 1266 if (ill->ill_isv6) { 1267 VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); 1268 (void) ipif_up_done_v6(ipif); 1269 } else { 1270 VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) != 1271 EINPROGRESS); 1272 (void) ipif_up_done(ipif); 1273 } 1274 } 1275 freeb(mp); 1276 } 1277 1278 /* 1279 * Attempt to recover an IPv6 interface that's been shut down as a duplicate. 1280 * As long as someone else holds the address, the interface will stay down. 1281 * When that conflict goes away, the interface is brought back up. This is 1282 * done so that accidental shutdowns of addresses aren't made permanent. Your 1283 * server will recover from a failure. 1284 * 1285 * For DHCP and temporary addresses, recovery is not done in the kernel. 
1286 * Instead, it's handled by user space processes (dhcpagent and in.ndpd). 1287 * 1288 * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 1289 */ 1290 void 1291 ipif_dup_recovery(void *arg) 1292 { 1293 ipif_t *ipif = arg; 1294 1295 ipif->ipif_recovery_id = 0; 1296 if (!(ipif->ipif_flags & IPIF_DUPLICATE)) 1297 return; 1298 1299 /* 1300 * No lock, because this is just an optimization. 1301 */ 1302 if (ipif->ipif_state_flags & IPIF_CONDEMNED) 1303 return; 1304 1305 /* If the link is down, we'll retry this later */ 1306 if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) 1307 return; 1308 1309 ipif_do_recovery(ipif); 1310 } 1311 1312 /* 1313 * Perform interface recovery by forcing the duplicate interfaces up and 1314 * allowing the system to determine which ones should stay up. 1315 * 1316 * Called both by recovery timer expiry and link-up notification. 1317 */ 1318 void 1319 ipif_do_recovery(ipif_t *ipif) 1320 { 1321 ill_t *ill = ipif->ipif_ill; 1322 mblk_t *mp; 1323 ip_stack_t *ipst = ill->ill_ipst; 1324 size_t mp_size; 1325 1326 if (ipif->ipif_isv6) 1327 mp_size = sizeof (ipif->ipif_v6lcl_addr); 1328 else 1329 mp_size = sizeof (ipif->ipif_lcl_addr); 1330 mp = allocb(mp_size, BPRI_MED); 1331 if (mp == NULL) { 1332 mutex_enter(&ill->ill_lock); 1333 if (ipst->ips_ip_dup_recovery > 0 && 1334 ipif->ipif_recovery_id == 0 && 1335 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 1336 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1337 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1338 } 1339 mutex_exit(&ill->ill_lock); 1340 } else { 1341 /* 1342 * A recovery timer may still be running if we got here from 1343 * ill_restart_dad(); cancel that timer. 
1344 */ 1345 if (ipif->ipif_recovery_id != 0) 1346 (void) untimeout(ipif->ipif_recovery_id); 1347 ipif->ipif_recovery_id = 0; 1348 1349 if (ipif->ipif_isv6) { 1350 bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, 1351 sizeof (ipif->ipif_v6lcl_addr)); 1352 } else { 1353 bcopy(&ipif->ipif_lcl_addr, mp->b_rptr, 1354 sizeof (ipif->ipif_lcl_addr)); 1355 } 1356 ill_refhold(ill); 1357 qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP, 1358 B_FALSE); 1359 } 1360 } 1361 1362 /* 1363 * Find the MAC and IP addresses in an NA/NS message. 1364 */ 1365 static void 1366 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill, 1367 in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp) 1368 { 1369 icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1370 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 1371 uchar_t *addr; 1372 int alen; 1373 1374 /* icmp_inbound_v6 ensures this */ 1375 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1376 1377 addr = ira->ira_l2src; 1378 alen = ill->ill_phys_addr_length; 1379 if (alen > 0) { 1380 *haddr = addr; 1381 *haddrlenp = alen; 1382 } else { 1383 *haddr = NULL; 1384 *haddrlenp = 0; 1385 } 1386 1387 /* nd_ns_target and nd_na_target are at the same offset, so we cheat */ 1388 *targp = ns->nd_ns_target; 1389 } 1390 1391 /* 1392 * This is for exclusive changes due to NDP duplicate address detection 1393 * failure. 
1394 */ 1395 /* ARGSUSED */ 1396 static void 1397 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) 1398 { 1399 ill_t *ill = rq->q_ptr; 1400 ipif_t *ipif; 1401 uchar_t *haddr; 1402 uint_t haddrlen; 1403 ip_stack_t *ipst = ill->ill_ipst; 1404 in6_addr_t targ; 1405 ip_recv_attr_t iras; 1406 mblk_t *attrmp; 1407 1408 attrmp = mp; 1409 mp = mp->b_cont; 1410 attrmp->b_cont = NULL; 1411 if (!ip_recv_attr_from_mblk(attrmp, &iras)) { 1412 /* The ill or ip_stack_t disappeared on us */ 1413 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1414 ip_drop_input("ip_recv_attr_from_mblk", mp, ill); 1415 freemsg(mp); 1416 ira_cleanup(&iras, B_TRUE); 1417 return; 1418 } 1419 1420 ASSERT(ill == iras.ira_rill); 1421 1422 ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); 1423 if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { 1424 /* 1425 * Ignore conflicts generated by misbehaving switches that 1426 * just reflect our own messages back to us. For IPMP, we may 1427 * see reflections across any ill in the illgrp. 1428 * 1429 * RFC2462 and revisions tried to detect both the case 1430 * when a statically configured IPv6 address is a duplicate, 1431 * and the case when the L2 address itself is a duplicate. The 1432 * later is important because, with stateles address autoconf, 1433 * if the L2 address is a duplicate, the resulting IPv6 1434 * address(es) would also be duplicates. We rely on DAD of the 1435 * IPv6 address itself to detect the latter case. 1436 */ 1437 /* For an under ill_grp can change under lock */ 1438 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1439 if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || 1440 IS_UNDER_IPMP(ill) && 1441 ipmp_illgrp_find_ill(ill->ill_grp, haddr, 1442 haddrlen) != NULL) { 1443 rw_exit(&ipst->ips_ill_g_lock); 1444 goto ignore_conflict; 1445 } 1446 rw_exit(&ipst->ips_ill_g_lock); 1447 } 1448 1449 /* 1450 * Look up the appropriate ipif. 
1451 */ 1452 ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); 1453 if (ipif == NULL) 1454 goto ignore_conflict; 1455 1456 /* Reload the ill to match the ipif */ 1457 ill = ipif->ipif_ill; 1458 1459 /* If it's already duplicate or ineligible, then don't do anything. */ 1460 if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { 1461 ipif_refrele(ipif); 1462 goto ignore_conflict; 1463 } 1464 1465 /* 1466 * If this is a failure during duplicate recovery, then don't 1467 * complain. It may take a long time to recover. 1468 */ 1469 if (!ipif->ipif_was_dup) { 1470 char ibuf[LIFNAMSIZ]; 1471 char hbuf[MAC_STR_LEN]; 1472 char sbuf[INET6_ADDRSTRLEN]; 1473 1474 ipif_get_name(ipif, ibuf, sizeof (ibuf)); 1475 cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" 1476 " disabled", ibuf, 1477 inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), 1478 mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf))); 1479 } 1480 mutex_enter(&ill->ill_lock); 1481 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); 1482 ipif->ipif_flags |= IPIF_DUPLICATE; 1483 ill->ill_ipif_dup_count++; 1484 mutex_exit(&ill->ill_lock); 1485 (void) ipif_down(ipif, NULL, NULL); 1486 (void) ipif_down_tail(ipif); 1487 mutex_enter(&ill->ill_lock); 1488 if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && 1489 ill->ill_net_type == IRE_IF_RESOLVER && 1490 !(ipif->ipif_state_flags & IPIF_CONDEMNED) && 1491 ipst->ips_ip_dup_recovery > 0) { 1492 ASSERT(ipif->ipif_recovery_id == 0); 1493 ipif->ipif_recovery_id = timeout(ipif_dup_recovery, 1494 ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); 1495 } 1496 mutex_exit(&ill->ill_lock); 1497 ipif_refrele(ipif); 1498 1499 ignore_conflict: 1500 freemsg(mp); 1501 ira_cleanup(&iras, B_TRUE); 1502 } 1503 1504 /* 1505 * Handle failure by tearing down the ipifs with the specified address. Note 1506 * that tearing down the ipif also means deleting the ncec through ipif_down, so 1507 * it's not possible to do recovery by just restarting the ncec timer. 
Instead, 1508 * we start a timer on the ipif. 1509 * Caller has to free mp; 1510 */ 1511 static void 1512 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) 1513 { 1514 const uchar_t *haddr; 1515 ill_t *ill = ira->ira_rill; 1516 1517 /* 1518 * Ignore conflicts generated by misbehaving switches that just 1519 * reflect our own messages back to us. 1520 */ 1521 1522 /* icmp_inbound_v6 ensures this */ 1523 ASSERT(ira->ira_flags & IRAF_L2SRC_SET); 1524 haddr = ira->ira_l2src; 1525 if (haddr != NULL && 1526 bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { 1527 return; 1528 } 1529 1530 if ((mp = copymsg(mp)) != NULL) { 1531 mblk_t *attrmp; 1532 1533 attrmp = ip_recv_attr_to_mblk(ira); 1534 if (attrmp == NULL) { 1535 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 1536 ip_drop_input("ipIfStatsInDiscards", mp, ill); 1537 freemsg(mp); 1538 } else { 1539 ASSERT(attrmp->b_cont == NULL); 1540 attrmp->b_cont = mp; 1541 mp = attrmp; 1542 ill_refhold(ill); 1543 qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, 1544 B_FALSE); 1545 } 1546 } 1547 } 1548 1549 /* 1550 * Handle a discovered conflict: some other system is advertising that it owns 1551 * one of our IP addresses. We need to defend ourselves, or just shut down the 1552 * interface. 1553 * 1554 * Handles both IPv4 and IPv6 1555 */ 1556 boolean_t 1557 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) 1558 { 1559 ipif_t *ipif; 1560 clock_t now; 1561 uint_t maxdefense; 1562 uint_t defs; 1563 ill_t *ill = ira->ira_ill; 1564 ip_stack_t *ipst = ill->ill_ipst; 1565 uint32_t elapsed; 1566 boolean_t isv6 = ill->ill_isv6; 1567 ipaddr_t ncec_addr; 1568 1569 if (isv6) { 1570 ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, 1571 ipst); 1572 } else { 1573 if (arp_no_defense) { 1574 /* 1575 * Yes, there is a conflict, but no, we do not 1576 * defend ourself. 
1577 */ 1578 return (B_TRUE); 1579 } 1580 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 1581 ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, 1582 ipst); 1583 } 1584 if (ipif == NULL) 1585 return (B_FALSE); 1586 1587 /* 1588 * First, figure out if this address is disposable. 1589 */ 1590 if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) 1591 maxdefense = ipst->ips_ip_max_temp_defend; 1592 else 1593 maxdefense = ipst->ips_ip_max_defend; 1594 1595 /* 1596 * Now figure out how many times we've defended ourselves. Ignore 1597 * defenses that happened long in the past. 1598 */ 1599 now = ddi_get_lbolt(); 1600 elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; 1601 mutex_enter(&ncec->ncec_lock); 1602 if ((defs = ncec->ncec_defense_count) > 0 && 1603 elapsed > ipst->ips_ip_defend_interval) { 1604 /* 1605 * ip_defend_interval has elapsed. 1606 * reset the defense count. 1607 */ 1608 ncec->ncec_defense_count = defs = 0; 1609 } 1610 ncec->ncec_defense_count++; 1611 ncec->ncec_last_time_defended = now; 1612 mutex_exit(&ncec->ncec_lock); 1613 ipif_refrele(ipif); 1614 1615 /* 1616 * If we've defended ourselves too many times already, then give up and 1617 * tear down the interface(s) using this address. 1618 * Otherwise, caller has to defend by sending out an announce. 1619 */ 1620 if (defs >= maxdefense) { 1621 if (isv6) 1622 ndp_failure(mp, ira); 1623 else 1624 arp_failure(mp, ira); 1625 } else { 1626 return (B_TRUE); /* caller must defend this address */ 1627 } 1628 return (B_FALSE); 1629 } 1630 1631 /* 1632 * Handle reception of Neighbor Solicitation messages. 
1633 */ 1634 static void 1635 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) 1636 { 1637 ill_t *ill = ira->ira_ill, *under_ill; 1638 nd_neighbor_solicit_t *ns; 1639 uint32_t hlen = ill->ill_phys_addr_length; 1640 uchar_t *haddr = NULL; 1641 icmp6_t *icmp_nd; 1642 ip6_t *ip6h; 1643 ncec_t *our_ncec = NULL; 1644 in6_addr_t target; 1645 in6_addr_t src; 1646 int len; 1647 int flag = 0; 1648 nd_opt_hdr_t *opt = NULL; 1649 boolean_t bad_solicit = B_FALSE; 1650 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1651 boolean_t need_ill_refrele = B_FALSE; 1652 1653 ip6h = (ip6_t *)mp->b_rptr; 1654 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1655 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1656 src = ip6h->ip6_src; 1657 ns = (nd_neighbor_solicit_t *)icmp_nd; 1658 target = ns->nd_ns_target; 1659 if (IN6_IS_ADDR_MULTICAST(&target)) { 1660 if (ip_debug > 2) { 1661 /* ip1dbg */ 1662 pr_addr_dbg("ndp_input_solicit: Target is" 1663 " multicast! %s\n", AF_INET6, &target); 1664 } 1665 bad_solicit = B_TRUE; 1666 goto done; 1667 } 1668 if (len > sizeof (nd_neighbor_solicit_t)) { 1669 /* Options present */ 1670 opt = (nd_opt_hdr_t *)&ns[1]; 1671 len -= sizeof (nd_neighbor_solicit_t); 1672 if (!ndp_verify_optlen(opt, len)) { 1673 ip1dbg(("ndp_input_solicit: Bad opt len\n")); 1674 bad_solicit = B_TRUE; 1675 goto done; 1676 } 1677 } 1678 if (IN6_IS_ADDR_UNSPECIFIED(&src)) { 1679 /* Check to see if this is a valid DAD solicitation */ 1680 if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { 1681 if (ip_debug > 2) { 1682 /* ip1dbg */ 1683 pr_addr_dbg("ndp_input_solicit: IPv6 " 1684 "Destination is not solicited node " 1685 "multicast %s\n", AF_INET6, 1686 &ip6h->ip6_dst); 1687 } 1688 bad_solicit = B_TRUE; 1689 goto done; 1690 } 1691 } 1692 1693 /* 1694 * NOTE: with IPMP, it's possible the nominated multicast ill (which 1695 * received this packet if it's multicast) is not the ill tied to 1696 * e.g. the IPMP ill's data link-local. 
So we match across the illgrp 1697 * to ensure we find the associated NCE. 1698 */ 1699 our_ncec = ncec_lookup_illgrp_v6(ill, &target); 1700 /* 1701 * If this is a valid Solicitation for an address we are publishing, 1702 * then a PUBLISH entry should exist in the cache 1703 */ 1704 if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { 1705 ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 1706 "ifname=%s ", ill->ill_name)); 1707 if (ip_debug > 2) { 1708 /* ip1dbg */ 1709 pr_addr_dbg(" dst %s\n", AF_INET6, &target); 1710 } 1711 if (our_ncec == NULL) 1712 bad_solicit = B_TRUE; 1713 goto done; 1714 } 1715 1716 /* At this point we should have a verified NS per spec */ 1717 if (opt != NULL) { 1718 opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); 1719 if (opt != NULL) { 1720 haddr = (uchar_t *)&opt[1]; 1721 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1722 hlen == 0) { 1723 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1724 bad_solicit = B_TRUE; 1725 goto done; 1726 } 1727 } 1728 } 1729 1730 /* If sending directly to peer, set the unicast flag */ 1731 if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) 1732 flag |= NDP_UNICAST; 1733 1734 /* 1735 * Create/update the entry for the soliciting node on the ipmp_ill. 1736 * or respond to outstanding queries, don't if 1737 * the source is unspecified address. 1738 */ 1739 if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { 1740 int err; 1741 nce_t *nnce; 1742 1743 ASSERT(ill->ill_isv6); 1744 /* 1745 * Regular solicitations *must* include the Source Link-Layer 1746 * Address option. Ignore messages that do not. 1747 */ 1748 if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { 1749 ip1dbg(("ndp_input_solicit: source link-layer address " 1750 "option missing with a specified source.\n")); 1751 bad_solicit = B_TRUE; 1752 goto done; 1753 } 1754 1755 /* 1756 * This is a regular solicitation. If we're still in the 1757 * process of verifying the address, then don't respond at all 1758 * and don't keep track of the sender. 
1759 */ 1760 if (our_ncec->ncec_state == ND_PROBE) 1761 goto done; 1762 1763 /* 1764 * If the solicitation doesn't have sender hardware address 1765 * (legal for unicast solicitation), then process without 1766 * installing the return NCE. Either we already know it, or 1767 * we'll be forced to look it up when (and if) we reply to the 1768 * packet. 1769 */ 1770 if (haddr == NULL) 1771 goto no_source; 1772 1773 under_ill = ill; 1774 if (IS_UNDER_IPMP(under_ill)) { 1775 ill = ipmp_ill_hold_ipmp_ill(under_ill); 1776 if (ill == NULL) 1777 ill = under_ill; 1778 else 1779 need_ill_refrele = B_TRUE; 1780 } 1781 err = nce_lookup_then_add_v6(ill, 1782 haddr, hlen, 1783 &src, /* Soliciting nodes address */ 1784 0, 1785 ND_STALE, 1786 &nnce); 1787 1788 if (need_ill_refrele) { 1789 ill_refrele(ill); 1790 ill = under_ill; 1791 need_ill_refrele = B_FALSE; 1792 } 1793 switch (err) { 1794 case 0: 1795 /* done with this entry */ 1796 nce_refrele(nnce); 1797 break; 1798 case EEXIST: 1799 /* 1800 * B_FALSE indicates this is not an an advertisement. 1801 */ 1802 nce_process(nnce->nce_common, haddr, 0, B_FALSE); 1803 nce_refrele(nnce); 1804 break; 1805 default: 1806 ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", 1807 err)); 1808 goto done; 1809 } 1810 no_source: 1811 flag |= NDP_SOLICITED; 1812 } else { 1813 /* 1814 * No source link layer address option should be present in a 1815 * valid DAD request. 1816 */ 1817 if (haddr != NULL) { 1818 ip1dbg(("ndp_input_solicit: source link-layer address " 1819 "option present with an unspecified source.\n")); 1820 bad_solicit = B_TRUE; 1821 goto done; 1822 } 1823 if (our_ncec->ncec_state == ND_PROBE) { 1824 /* 1825 * Internally looped-back probes will have 1826 * IRAF_L2SRC_LOOPBACK set so we can ignore our own 1827 * transmissions. 1828 */ 1829 if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { 1830 /* 1831 * If someone else is probing our address, then 1832 * we've crossed wires. Declare failure. 
1833 */ 1834 ndp_failure(mp, ira); 1835 } 1836 goto done; 1837 } 1838 /* 1839 * This is a DAD probe. Multicast the advertisement to the 1840 * all-nodes address. 1841 */ 1842 src = ipv6_all_hosts_mcast; 1843 } 1844 flag |= nce_advert_flags(our_ncec); 1845 (void) ndp_xmit(ill, 1846 ND_NEIGHBOR_ADVERT, 1847 our_ncec->ncec_lladdr, 1848 our_ncec->ncec_lladdr_length, 1849 &target, /* Source and target of the advertisement pkt */ 1850 &src, /* IP Destination (source of original pkt) */ 1851 flag); 1852 done: 1853 if (bad_solicit) 1854 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); 1855 if (our_ncec != NULL) 1856 ncec_refrele(our_ncec); 1857 } 1858 1859 /* 1860 * Handle reception of Neighbor Solicitation messages 1861 */ 1862 void 1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) 1864 { 1865 ill_t *ill = ira->ira_ill; 1866 nd_neighbor_advert_t *na; 1867 uint32_t hlen = ill->ill_phys_addr_length; 1868 uchar_t *haddr = NULL; 1869 icmp6_t *icmp_nd; 1870 ip6_t *ip6h; 1871 ncec_t *dst_ncec = NULL; 1872 in6_addr_t target; 1873 nd_opt_hdr_t *opt = NULL; 1874 int len; 1875 ip_stack_t *ipst = ill->ill_ipst; 1876 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 1877 1878 ip6h = (ip6_t *)mp->b_rptr; 1879 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 1880 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 1881 na = (nd_neighbor_advert_t *)icmp_nd; 1882 1883 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && 1884 (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { 1885 ip1dbg(("ndp_input_advert: Target is multicast but the " 1886 "solicited flag is not zero\n")); 1887 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1888 return; 1889 } 1890 target = na->nd_na_target; 1891 if (IN6_IS_ADDR_MULTICAST(&target)) { 1892 ip1dbg(("ndp_input_advert: Target is multicast!\n")); 1893 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1894 return; 1895 } 1896 if (len > sizeof (nd_neighbor_advert_t)) { 1897 opt = (nd_opt_hdr_t *)&na[1]; 1898 if (!ndp_verify_optlen(opt, 1899 len - 
sizeof (nd_neighbor_advert_t))) { 1900 ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); 1901 BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); 1902 return; 1903 } 1904 /* At this point we have a verified NA per spec */ 1905 len -= sizeof (nd_neighbor_advert_t); 1906 opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); 1907 if (opt != NULL) { 1908 haddr = (uchar_t *)&opt[1]; 1909 if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || 1910 hlen == 0) { 1911 ip1dbg(("ndp_input_advert: bad SLLA\n")); 1912 BUMP_MIB(mib, 1913 ipv6IfIcmpInBadNeighborAdvertisements); 1914 return; 1915 } 1916 } 1917 } 1918 1919 /* 1920 * NOTE: we match across the illgrp since we need to do DAD for all of 1921 * our local addresses, and those are spread across all the active 1922 * ills in the group. 1923 */ 1924 if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) 1925 return; 1926 1927 if (NCE_PUBLISH(dst_ncec)) { 1928 /* 1929 * Someone just advertised an addresses that we publish. First, 1930 * check it it was us -- if so, we can safely ignore it. 1931 * We don't get the haddr from the ira_l2src because, in the 1932 * case that the packet originated from us, on an IPMP group, 1933 * the ira_l2src may would be the link-layer address of the 1934 * cast_ill used to send the packet, which may not be the same 1935 * as the dst_ncec->ncec_lladdr of the address. 1936 */ 1937 if (haddr != NULL) { 1938 if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) 1939 goto out; 1940 1941 if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) 1942 goto out; /* from us -- no conflict */ 1943 1944 /* 1945 * If we're in an IPMP group, check if this is an echo 1946 * from another ill in the group. Use the double- 1947 * checked locking pattern to avoid grabbing 1948 * ill_g_lock in the non-IPMP case. 
1949 */ 1950 if (IS_UNDER_IPMP(ill)) { 1951 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 1952 if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill( 1953 ill->ill_grp, haddr, hlen) != NULL) { 1954 rw_exit(&ipst->ips_ill_g_lock); 1955 goto out; 1956 } 1957 rw_exit(&ipst->ips_ill_g_lock); 1958 } 1959 } 1960 1961 /* 1962 * This appears to be a real conflict. If we're trying to 1963 * configure this NCE (ND_PROBE), then shut it down. 1964 * Otherwise, handle the discovered conflict. 1965 */ 1966 if (dst_ncec->ncec_state == ND_PROBE) { 1967 ndp_failure(mp, ira); 1968 } else { 1969 if (ip_nce_conflict(mp, ira, dst_ncec)) { 1970 char hbuf[MAC_STR_LEN]; 1971 char sbuf[INET6_ADDRSTRLEN]; 1972 1973 cmn_err(CE_WARN, 1974 "node '%s' is using %s on %s", 1975 inet_ntop(AF_INET6, &target, sbuf, 1976 sizeof (sbuf)), 1977 haddr == NULL ? "<none>" : 1978 mac_colon_addr(haddr, hlen, hbuf, 1979 sizeof (hbuf)), ill->ill_name); 1980 /* 1981 * RFC 4862, Section 5.4.4 does not mandate 1982 * any specific behavior when an NA matches 1983 * a non-tentative address assigned to the 1984 * receiver. We make the choice of defending 1985 * our address, based on the assumption that 1986 * the sender has not detected the Duplicate. 1987 * 1988 * ncec_last_time_defended has been adjusted 1989 * in ip_nce_conflict() 1990 */ 1991 (void) ndp_announce(dst_ncec); 1992 } 1993 } 1994 } else { 1995 if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) 1996 dst_ncec->ncec_flags |= NCE_F_ISROUTER; 1997 1998 /* B_TRUE indicates this an advertisement */ 1999 nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); 2000 } 2001 out: 2002 ncec_refrele(dst_ncec); 2003 } 2004 2005 /* 2006 * Process NDP neighbor solicitation/advertisement messages. 2007 * The checksum has already checked o.k before reaching here. 2008 * Information about the datalink header is contained in ira_l2src, but 2009 * that should be ignored for loopback packets. 
2010 */ 2011 void 2012 ndp_input(mblk_t *mp, ip_recv_attr_t *ira) 2013 { 2014 ill_t *ill = ira->ira_rill; 2015 icmp6_t *icmp_nd; 2016 ip6_t *ip6h; 2017 int len; 2018 mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; 2019 ill_t *orig_ill = NULL; 2020 2021 /* 2022 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill 2023 * and make it be the IPMP upper so avoid being confused by a packet 2024 * addressed to a unicast address on a different ill. 2025 */ 2026 if (IS_UNDER_IPMP(ill)) { 2027 orig_ill = ill; 2028 ill = ipmp_ill_hold_ipmp_ill(orig_ill); 2029 if (ill == NULL) { 2030 ill = orig_ill; 2031 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2032 ip_drop_input("ipIfStatsInDiscards - IPMP ill", 2033 mp, ill); 2034 freemsg(mp); 2035 return; 2036 } 2037 ASSERT(ill != orig_ill); 2038 orig_ill = ira->ira_ill; 2039 ira->ira_ill = ill; 2040 mib = ill->ill_icmp6_mib; 2041 } 2042 if (!pullupmsg(mp, -1)) { 2043 ip1dbg(("ndp_input: pullupmsg failed\n")); 2044 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); 2045 ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill); 2046 goto done; 2047 } 2048 ip6h = (ip6_t *)mp->b_rptr; 2049 if (ip6h->ip6_hops != IPV6_MAX_HOPS) { 2050 ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); 2051 ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill); 2052 BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); 2053 goto done; 2054 } 2055 /* 2056 * NDP does not accept any extension headers between the 2057 * IP header and the ICMP header since e.g. a routing 2058 * header could be dangerous. 2059 * This assumes that any AH or ESP headers are removed 2060 * by ip prior to passing the packet to ndp_input. 
2061 */ 2062 if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { 2063 ip1dbg(("ndp_input: Wrong next header 0x%x\n", 2064 ip6h->ip6_nxt)); 2065 ip_drop_input("Wrong next header", mp, ill); 2066 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2067 goto done; 2068 } 2069 icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); 2070 ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT || 2071 icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); 2072 if (icmp_nd->icmp6_code != 0) { 2073 ip1dbg(("ndp_input: icmp6 code != 0 \n")); 2074 ip_drop_input("code non-zero", mp, ill); 2075 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2076 goto done; 2077 } 2078 len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; 2079 /* 2080 * Make sure packet length is large enough for either 2081 * a NS or a NA icmp packet. 2082 */ 2083 if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { 2084 ip1dbg(("ndp_input: packet too short\n")); 2085 ip_drop_input("packet too short", mp, ill); 2086 BUMP_MIB(mib, ipv6IfIcmpInErrors); 2087 goto done; 2088 } 2089 if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { 2090 ndp_input_solicit(mp, ira); 2091 } else { 2092 ndp_input_advert(mp, ira); 2093 } 2094 done: 2095 freemsg(mp); 2096 if (orig_ill != NULL) { 2097 ill_refrele(ill); 2098 ira->ira_ill = orig_ill; 2099 } 2100 } 2101 2102 /* 2103 * ndp_xmit is called to form and transmit a ND solicitation or 2104 * advertisement ICMP packet. 2105 * 2106 * If the source address is unspecified and this isn't a probe (used for 2107 * duplicate address detection), an appropriate source address and link layer 2108 * address will be chosen here. The link layer address option is included if 2109 * the source is specified (i.e., all non-probe packets), and omitted (per the 2110 * specification) otherwise. 2111 * 2112 * It returns B_FALSE only if it does a successful put() to the 2113 * corresponding ill's ill_wq otherwise returns B_TRUE. 
2114 */ 2115 static boolean_t 2116 ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, 2117 const in6_addr_t *sender, const in6_addr_t *target, int flag) 2118 { 2119 uint32_t len; 2120 icmp6_t *icmp6; 2121 mblk_t *mp; 2122 ip6_t *ip6h; 2123 nd_opt_hdr_t *opt; 2124 uint_t plen; 2125 zoneid_t zoneid = GLOBAL_ZONEID; 2126 ill_t *hwaddr_ill = ill; 2127 ip_xmit_attr_t ixas; 2128 ip_stack_t *ipst = ill->ill_ipst; 2129 boolean_t need_refrele = B_FALSE; 2130 boolean_t probe = B_FALSE; 2131 2132 if (IS_UNDER_IPMP(ill)) { 2133 probe = ipif_lookup_testaddr_v6(ill, sender, NULL); 2134 /* 2135 * We send non-probe packets on the upper IPMP interface. 2136 * ip_output_simple() will use cast_ill for sending any 2137 * multicast packets. Note that we can't follow the same 2138 * logic for probe packets because all interfaces in the ipmp 2139 * group may have failed, so that we really want to only try 2140 * to send the ND packet on the ill corresponding to the src 2141 * address. 2142 */ 2143 if (!probe) { 2144 ill = ipmp_ill_hold_ipmp_ill(ill); 2145 if (ill != NULL) 2146 need_refrele = B_TRUE; 2147 else 2148 ill = hwaddr_ill; 2149 } 2150 } 2151 2152 /* 2153 * If we have a unspecified source(sender) address, select a 2154 * proper source address for the solicitation here itself so 2155 * that we can initialize the h/w address correctly. 2156 * 2157 * If the sender is specified then we use this address in order 2158 * to lookup the zoneid before calling ip_output_v6(). This is to 2159 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly 2160 * by IP (we cannot guarantee that the global zone has an interface 2161 * route to the destination). 2162 * 2163 * Note that the NA never comes here with the unspecified source 2164 * address. 2165 */ 2166 2167 /* 2168 * Probes will have unspec src at this point. 
2169 */ 2170 if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { 2171 zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); 2172 /* 2173 * It's possible for ipif_lookup_addr_zoneid_v6() to return 2174 * ALL_ZONES if it cannot find a matching ipif for the address 2175 * we are trying to use. In this case we err on the side of 2176 * trying to send the packet by defaulting to the GLOBAL_ZONEID. 2177 */ 2178 if (zoneid == ALL_ZONES) 2179 zoneid = GLOBAL_ZONEID; 2180 } 2181 2182 plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; 2183 len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; 2184 mp = allocb(len, BPRI_LO); 2185 if (mp == NULL) { 2186 if (need_refrele) 2187 ill_refrele(ill); 2188 return (B_TRUE); 2189 } 2190 2191 bzero((char *)mp->b_rptr, len); 2192 mp->b_wptr = mp->b_rptr + len; 2193 2194 bzero(&ixas, sizeof (ixas)); 2195 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM; 2196 2197 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 2198 ixas.ixa_ipst = ipst; 2199 ixas.ixa_cred = kcred; 2200 ixas.ixa_cpid = NOPID; 2201 ixas.ixa_tsl = NULL; 2202 ixas.ixa_zoneid = zoneid; 2203 2204 ip6h = (ip6_t *)mp->b_rptr; 2205 ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 2206 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2207 ip6h->ip6_nxt = IPPROTO_ICMPV6; 2208 ip6h->ip6_hops = IPV6_MAX_HOPS; 2209 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 2210 ip6h->ip6_dst = *target; 2211 icmp6 = (icmp6_t *)&ip6h[1]; 2212 2213 if (hw_addr_len != 0) { 2214 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + 2215 sizeof (nd_neighbor_advert_t)); 2216 } else { 2217 opt = NULL; 2218 } 2219 if (operation == ND_NEIGHBOR_SOLICIT) { 2220 nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; 2221 2222 if (opt != NULL && !(flag & NDP_PROBE)) { 2223 /* 2224 * Note that we don't send out SLLA for ND probes 2225 * per RFC 4862, even though we do send out the src 2226 * haddr for IPv4 DAD probes, even though both IPv4 2227 * and IPv6 go out with the unspecified/INADDR_ANY 2228 * src 
IP addr. 2229 */ 2230 opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; 2231 } 2232 ip6h->ip6_src = *sender; 2233 ns->nd_ns_target = *target; 2234 if (!(flag & NDP_UNICAST)) { 2235 /* Form multicast address of the target */ 2236 ip6h->ip6_dst = ipv6_solicited_node_mcast; 2237 ip6h->ip6_dst.s6_addr32[3] |= 2238 ns->nd_ns_target.s6_addr32[3]; 2239 } 2240 } else { 2241 nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; 2242 2243 ASSERT(!(flag & NDP_PROBE)); 2244 if (opt != NULL) 2245 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; 2246 ip6h->ip6_src = *sender; 2247 na->nd_na_target = *sender; 2248 if (flag & NDP_ISROUTER) 2249 na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; 2250 if (flag & NDP_SOLICITED) 2251 na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; 2252 if (flag & NDP_ORIDE) 2253 na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; 2254 } 2255 2256 if (!(flag & NDP_PROBE)) { 2257 if (hw_addr != NULL && opt != NULL) { 2258 /* Fill in link layer address and option len */ 2259 opt->nd_opt_len = (uint8_t)plen; 2260 bcopy(hw_addr, &opt[1], hw_addr_len); 2261 } 2262 } 2263 if (opt != NULL && opt->nd_opt_type == 0) { 2264 /* If there's no link layer address option, then strip it. */ 2265 len -= plen * 8; 2266 mp->b_wptr = mp->b_rptr + len; 2267 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 2268 } 2269 2270 icmp6->icmp6_type = (uint8_t)operation; 2271 icmp6->icmp6_code = 0; 2272 /* 2273 * Prepare for checksum by putting icmp length in the icmp 2274 * checksum field. The checksum is calculated in ip_output.c. 2275 */ 2276 icmp6->icmp6_cksum = ip6h->ip6_plen; 2277 2278 (void) ip_output_simple(mp, &ixas); 2279 ixa_cleanup(&ixas); 2280 if (need_refrele) 2281 ill_refrele(ill); 2282 return (B_FALSE); 2283 } 2284 2285 /* 2286 * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 2287 * The datapath uses this as an indication that there 2288 * is a problem (as opposed to a NCE that was just 2289 * reclaimed due to lack of memory. 
2290 * Note that static ARP entries never become unreachable. 2291 */ 2292 void 2293 nce_make_unreachable(ncec_t *ncec) 2294 { 2295 mutex_enter(&ncec->ncec_lock); 2296 ncec->ncec_state = ND_UNREACHABLE; 2297 mutex_exit(&ncec->ncec_lock); 2298 } 2299 2300 /* 2301 * NCE retransmit timer. Common to IPv4 and IPv6. 2302 * This timer goes off when: 2303 * a. It is time to retransmit a resolution for resolver. 2304 * b. It is time to send reachability probes. 2305 */ 2306 void 2307 nce_timer(void *arg) 2308 { 2309 ncec_t *ncec = arg; 2310 ill_t *ill = ncec->ncec_ill, *src_ill; 2311 char addrbuf[INET6_ADDRSTRLEN]; 2312 boolean_t dropped = B_FALSE; 2313 ip_stack_t *ipst = ncec->ncec_ipst; 2314 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2315 in_addr_t sender4 = INADDR_ANY; 2316 in6_addr_t sender6 = ipv6_all_zeros; 2317 2318 /* 2319 * The timer has to be cancelled by ncec_delete before doing the final 2320 * refrele. So the NCE is guaranteed to exist when the timer runs 2321 * until it clears the timeout_id. Before clearing the timeout_id 2322 * bump up the refcnt so that we can continue to use the ncec 2323 */ 2324 ASSERT(ncec != NULL); 2325 mutex_enter(&ncec->ncec_lock); 2326 ncec_refhold_locked(ncec); 2327 ncec->ncec_timeout_id = 0; 2328 mutex_exit(&ncec->ncec_lock); 2329 2330 src_ill = nce_resolve_src(ncec, &sender6); 2331 /* if we could not find a sender address, return */ 2332 if (src_ill == NULL) { 2333 if (!isv6) { 2334 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); 2335 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, 2336 &sender4, addrbuf, sizeof (addrbuf)))); 2337 } else { 2338 ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, 2339 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2340 } 2341 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2342 ncec_refrele(ncec); 2343 return; 2344 } 2345 if (!isv6) 2346 IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); 2347 2348 mutex_enter(&ncec->ncec_lock); 2349 /* 2350 * Check the reachability state. 
2351 */ 2352 switch (ncec->ncec_state) { 2353 case ND_DELAY: 2354 ASSERT(ncec->ncec_lladdr != NULL); 2355 ncec->ncec_state = ND_PROBE; 2356 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2357 if (isv6) { 2358 mutex_exit(&ncec->ncec_lock); 2359 dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, 2360 src_ill->ill_phys_addr, 2361 src_ill->ill_phys_addr_length, 2362 &sender6, &ncec->ncec_addr, 2363 NDP_UNICAST); 2364 } else { 2365 dropped = arp_request(ncec, sender4, src_ill); 2366 mutex_exit(&ncec->ncec_lock); 2367 } 2368 if (!dropped) { 2369 mutex_enter(&ncec->ncec_lock); 2370 ncec->ncec_pcnt--; 2371 mutex_exit(&ncec->ncec_lock); 2372 } 2373 if (ip_debug > 3) { 2374 /* ip2dbg */ 2375 pr_addr_dbg("nce_timer: state for %s changed " 2376 "to PROBE\n", AF_INET6, &ncec->ncec_addr); 2377 } 2378 nce_restart_timer(ncec, ill->ill_reachable_retrans_time); 2379 break; 2380 case ND_PROBE: 2381 /* must be retransmit timer */ 2382 ASSERT(ncec->ncec_pcnt >= -1); 2383 if (ncec->ncec_pcnt > 0) { 2384 /* 2385 * As per RFC2461, the ncec gets deleted after 2386 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. 2387 * Note that the first unicast solicitation is sent 2388 * during the DELAY state. 2389 */ 2390 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2391 ncec->ncec_pcnt, 2392 inet_ntop((isv6? 
AF_INET6 : AF_INET), 2393 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2394 if (NCE_PUBLISH(ncec)) { 2395 mutex_exit(&ncec->ncec_lock); 2396 /* 2397 * send out a probe; note that src_ill 2398 * is ignored by nce_dad() for all 2399 * DAD message types other than IPv6 2400 * unicast probes 2401 */ 2402 nce_dad(ncec, src_ill, B_TRUE); 2403 } else { 2404 ASSERT(src_ill != NULL); 2405 if (isv6) { 2406 mutex_exit(&ncec->ncec_lock); 2407 dropped = ndp_xmit(src_ill, 2408 ND_NEIGHBOR_SOLICIT, 2409 src_ill->ill_phys_addr, 2410 src_ill->ill_phys_addr_length, 2411 &sender6, &ncec->ncec_addr, 2412 NDP_UNICAST); 2413 } else { 2414 /* 2415 * since the nce is REACHABLE, 2416 * the ARP request will be sent out 2417 * as a link-layer unicast. 2418 */ 2419 dropped = arp_request(ncec, sender4, 2420 src_ill); 2421 mutex_exit(&ncec->ncec_lock); 2422 } 2423 if (!dropped) { 2424 mutex_enter(&ncec->ncec_lock); 2425 ncec->ncec_pcnt--; 2426 mutex_exit(&ncec->ncec_lock); 2427 } 2428 nce_restart_timer(ncec, 2429 ill->ill_reachable_retrans_time); 2430 } 2431 } else if (ncec->ncec_pcnt < 0) { 2432 /* No hope, delete the ncec */ 2433 /* Tell datapath it went bad */ 2434 ncec->ncec_state = ND_UNREACHABLE; 2435 mutex_exit(&ncec->ncec_lock); 2436 if (ip_debug > 2) { 2437 /* ip1dbg */ 2438 pr_addr_dbg("nce_timer: Delete NCE for" 2439 " dst %s\n", (isv6? AF_INET6: AF_INET), 2440 &ncec->ncec_addr); 2441 } 2442 /* if static ARP can't delete. */ 2443 if ((ncec->ncec_flags & NCE_F_STATIC) == 0) 2444 ncec_delete(ncec); 2445 2446 } else if (!NCE_PUBLISH(ncec)) { 2447 /* 2448 * Probe count is 0 for a dynamic entry (one that we 2449 * ourselves are not publishing). We should never get 2450 * here if NONUD was requested, hence the ASSERT below. 
2451 */ 2452 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); 2453 ip2dbg(("nce_timer: pcount=%x dst %s\n", 2454 ncec->ncec_pcnt, inet_ntop(AF_INET6, 2455 &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); 2456 ncec->ncec_pcnt--; 2457 mutex_exit(&ncec->ncec_lock); 2458 /* Wait one interval before killing */ 2459 nce_restart_timer(ncec, 2460 ill->ill_reachable_retrans_time); 2461 } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { 2462 ipif_t *ipif; 2463 ipaddr_t ncec_addr; 2464 2465 /* 2466 * We're done probing, and we can now declare this 2467 * address to be usable. Let IP know that it's ok to 2468 * use. 2469 */ 2470 ncec->ncec_state = ND_REACHABLE; 2471 ncec->ncec_flags &= ~NCE_F_UNVERIFIED; 2472 mutex_exit(&ncec->ncec_lock); 2473 if (isv6) { 2474 ipif = ipif_lookup_addr_exact_v6( 2475 &ncec->ncec_addr, ill, ipst); 2476 } else { 2477 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 2478 ncec_addr); 2479 ipif = ipif_lookup_addr_exact(ncec_addr, ill, 2480 ipst); 2481 } 2482 if (ipif != NULL) { 2483 if (ipif->ipif_was_dup) { 2484 char ibuf[LIFNAMSIZ]; 2485 char sbuf[INET6_ADDRSTRLEN]; 2486 2487 ipif->ipif_was_dup = B_FALSE; 2488 (void) inet_ntop(AF_INET6, 2489 &ipif->ipif_v6lcl_addr, 2490 sbuf, sizeof (sbuf)); 2491 ipif_get_name(ipif, ibuf, 2492 sizeof (ibuf)); 2493 cmn_err(CE_NOTE, "recovered address " 2494 "%s on %s", sbuf, ibuf); 2495 } 2496 if ((ipif->ipif_flags & IPIF_UP) && 2497 !ipif->ipif_addr_ready) 2498 ipif_up_notify(ipif); 2499 ipif->ipif_addr_ready = 1; 2500 ipif_refrele(ipif); 2501 } 2502 if (!isv6 && arp_no_defense) 2503 break; 2504 /* Begin defending our new address */ 2505 if (ncec->ncec_unsolicit_count > 0) { 2506 ncec->ncec_unsolicit_count--; 2507 if (isv6) { 2508 dropped = ndp_announce(ncec); 2509 } else { 2510 dropped = arp_announce(ncec); 2511 } 2512 2513 if (dropped) 2514 ncec->ncec_unsolicit_count++; 2515 else 2516 ncec->ncec_last_time_defended = 2517 ddi_get_lbolt(); 2518 } 2519 if (ncec->ncec_unsolicit_count > 0) { 2520 nce_restart_timer(ncec, 
2521 ANNOUNCE_INTERVAL(isv6)); 2522 } else if (DEFENSE_INTERVAL(isv6) != 0) { 2523 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2524 } 2525 } else { 2526 /* 2527 * This is an address we're probing to be our own, but 2528 * the ill is down. Wait until it comes back before 2529 * doing anything, but switch to reachable state so 2530 * that the restart will work. 2531 */ 2532 ncec->ncec_state = ND_REACHABLE; 2533 mutex_exit(&ncec->ncec_lock); 2534 } 2535 break; 2536 case ND_INCOMPLETE: { 2537 mblk_t *mp, *nextmp; 2538 mblk_t **prevmpp; 2539 2540 /* 2541 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp 2542 * for any IPMP probe packets, and toss them. IPMP probe 2543 * packets will always be at the head of ncec_qd_mp, so that 2544 * we can stop at the first queued ND packet that is 2545 * not a probe packet. 2546 */ 2547 prevmpp = &ncec->ncec_qd_mp; 2548 for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { 2549 nextmp = mp->b_next; 2550 2551 if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { 2552 inet_freemsg(mp); 2553 ncec->ncec_nprobes--; 2554 *prevmpp = nextmp; 2555 } else { 2556 prevmpp = &mp->b_next; 2557 } 2558 } 2559 2560 /* 2561 * Must be resolver's retransmit timer. 
2562 */ 2563 mutex_exit(&ncec->ncec_lock); 2564 ip_ndp_resolve(ncec); 2565 break; 2566 } 2567 case ND_REACHABLE: 2568 if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && 2569 ncec->ncec_unsolicit_count != 0) || 2570 (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { 2571 if (ncec->ncec_unsolicit_count > 0) { 2572 ncec->ncec_unsolicit_count--; 2573 mutex_exit(&ncec->ncec_lock); 2574 /* 2575 * When we get to zero announcements left, 2576 * switch to address defense 2577 */ 2578 } else { 2579 boolean_t rate_limit; 2580 2581 mutex_exit(&ncec->ncec_lock); 2582 rate_limit = ill_defend_rate_limit(ill, ncec); 2583 if (rate_limit) { 2584 nce_restart_timer(ncec, 2585 DEFENSE_INTERVAL(isv6)); 2586 break; 2587 } 2588 } 2589 if (isv6) { 2590 dropped = ndp_announce(ncec); 2591 } else { 2592 dropped = arp_announce(ncec); 2593 } 2594 mutex_enter(&ncec->ncec_lock); 2595 if (dropped) { 2596 ncec->ncec_unsolicit_count++; 2597 } else { 2598 ncec->ncec_last_time_defended = 2599 ddi_get_lbolt(); 2600 } 2601 mutex_exit(&ncec->ncec_lock); 2602 if (ncec->ncec_unsolicit_count != 0) { 2603 nce_restart_timer(ncec, 2604 ANNOUNCE_INTERVAL(isv6)); 2605 } else { 2606 nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); 2607 } 2608 } else { 2609 mutex_exit(&ncec->ncec_lock); 2610 } 2611 break; 2612 default: 2613 mutex_exit(&ncec->ncec_lock); 2614 break; 2615 } 2616 done: 2617 ncec_refrele(ncec); 2618 ill_refrele(src_ill); 2619 } 2620 2621 /* 2622 * Set a link layer address from the ll_addr passed in. 2623 * Copy SAP from ill. 2624 */ 2625 static void 2626 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) 2627 { 2628 ill_t *ill = ncec->ncec_ill; 2629 2630 ASSERT(ll_addr != NULL); 2631 if (ill->ill_phys_addr_length > 0) { 2632 /* 2633 * The bcopy() below used to be called for the physical address 2634 * length rather than the link layer address length. For 2635 * ethernet and many other media, the phys_addr and lla are 2636 * identical. 
2637 * 2638 * The phys_addr and lla may not be the same for devices that 2639 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently 2640 * no known instances of these. 2641 * 2642 * For PPP or other interfaces with a zero length 2643 * physical address, don't do anything here. 2644 * The bcopy() with a zero phys_addr length was previously 2645 * a no-op for interfaces with a zero-length physical address. 2646 * Using the lla for them would change the way they operate. 2647 * Doing nothing in such cases preserves expected behavior. 2648 */ 2649 bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); 2650 } 2651 } 2652 2653 boolean_t 2654 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, 2655 uint32_t ll_addr_len) 2656 { 2657 ASSERT(ncec->ncec_lladdr != NULL); 2658 if (ll_addr == NULL) 2659 return (B_FALSE); 2660 if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) 2661 return (B_TRUE); 2662 return (B_FALSE); 2663 } 2664 2665 /* 2666 * Updates the link layer address or the reachability state of 2667 * a cache entry. Reset probe counter if needed. 2668 */ 2669 void 2670 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) 2671 { 2672 ill_t *ill = ncec->ncec_ill; 2673 boolean_t need_stop_timer = B_FALSE; 2674 boolean_t need_fastpath_update = B_FALSE; 2675 nce_t *nce = NULL; 2676 timeout_id_t tid; 2677 2678 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2679 /* 2680 * If this interface does not do NUD, there is no point 2681 * in allowing an update to the cache entry. Although 2682 * we will respond to NS. 2683 * The only time we accept an update for a resolver when 2684 * NUD is turned off is when it has just been created. 2685 * Non-Resolvers will always be created as REACHABLE. 
2686 */ 2687 if (new_state != ND_UNCHANGED) { 2688 if ((ncec->ncec_flags & NCE_F_NONUD) && 2689 (ncec->ncec_state != ND_INCOMPLETE)) 2690 return; 2691 ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); 2692 ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); 2693 need_stop_timer = B_TRUE; 2694 if (new_state == ND_REACHABLE) 2695 ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64()); 2696 else { 2697 /* We force NUD in this case */ 2698 ncec->ncec_last = 0; 2699 } 2700 ncec->ncec_state = new_state; 2701 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 2702 ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || 2703 new_state == ND_INCOMPLETE); 2704 } 2705 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2706 tid = ncec->ncec_timeout_id; 2707 ncec->ncec_timeout_id = 0; 2708 } 2709 /* 2710 * Re-trigger fastpath probe and 2711 * overwrite the DL_UNITDATA_REQ data, noting we'll lose 2712 * whatever packets that happens to be transmitting at the time. 2713 */ 2714 if (new_ll_addr != NULL) { 2715 bcopy(new_ll_addr, ncec->ncec_lladdr, 2716 ill->ill_phys_addr_length); 2717 need_fastpath_update = B_TRUE; 2718 } 2719 mutex_exit(&ncec->ncec_lock); 2720 if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { 2721 if (tid != 0) 2722 (void) untimeout(tid); 2723 } 2724 if (need_fastpath_update) { 2725 /* 2726 * Delete any existing existing dlur_mp and fp_mp information. 2727 * For IPMP interfaces, all underlying ill's must be checked 2728 * and purged. 
2729 */ 2730 nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); 2731 /* 2732 * add the new dlur_mp and fp_mp 2733 */ 2734 nce = nce_fastpath(ncec, B_TRUE, NULL); 2735 if (nce != NULL) 2736 nce_refrele(nce); 2737 } 2738 mutex_enter(&ncec->ncec_lock); 2739 } 2740 2741 static void 2742 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2743 { 2744 uint_t count = 0; 2745 mblk_t **mpp, *tmp; 2746 2747 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2748 2749 for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { 2750 if (++count > ncec->ncec_ill->ill_max_buf) { 2751 tmp = ncec->ncec_qd_mp->b_next; 2752 ncec->ncec_qd_mp->b_next = NULL; 2753 /* 2754 * if we never create data addrs on the under_ill 2755 * does this matter? 2756 */ 2757 BUMP_MIB(ncec->ncec_ill->ill_ip_mib, 2758 ipIfStatsOutDiscards); 2759 ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, 2760 ncec->ncec_ill); 2761 freemsg(ncec->ncec_qd_mp); 2762 ncec->ncec_qd_mp = tmp; 2763 } 2764 } 2765 2766 if (head_insert) { 2767 ncec->ncec_nprobes++; 2768 mp->b_next = ncec->ncec_qd_mp; 2769 ncec->ncec_qd_mp = mp; 2770 } else { 2771 *mpp = mp; 2772 } 2773 } 2774 2775 /* 2776 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be 2777 * queued at the head or tail of the queue based on the input argument 2778 * 'head_insert'. The caller should specify this argument as B_TRUE if this 2779 * packet is an IPMP probe packet, in which case the following happens: 2780 * 2781 * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal 2782 * (non-ipmp_probe) load-speading case where the source address of the ND 2783 * packet is not tied to ncec_ill. If the ill bound to the source address 2784 * cannot receive, the response to the ND packet will not be received. 
2785 * However, if ND packets for ncec_ill's probes are queued behind that ND 2786 * packet, those probes will also fail to be sent, and thus in.mpathd will 2787 * erroneously conclude that ncec_ill has also failed. 2788 * 2789 * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on 2790 * the first attempt. This ensures that ND problems do not manifest as 2791 * probe RTT spikes. 2792 * 2793 * We achieve this by inserting ipmp_probe() packets at the head of the 2794 * nce_queue. 2795 * 2796 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, 2797 * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 2798 */ 2799 void 2800 nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) 2801 { 2802 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 2803 nce_queue_mp_common(ncec, mp, head_insert); 2804 } 2805 2806 /* 2807 * Called when address resolution failed due to a timeout. 2808 * Send an ICMP unreachable in response to all queued packets. 2809 */ 2810 void 2811 ndp_resolv_failed(ncec_t *ncec) 2812 { 2813 mblk_t *mp, *nxt_mp; 2814 char buf[INET6_ADDRSTRLEN]; 2815 ill_t *ill = ncec->ncec_ill; 2816 ip_recv_attr_t iras; 2817 2818 bzero(&iras, sizeof (iras)); 2819 iras.ira_flags = 0; 2820 /* 2821 * we are setting the ira_rill to the ipmp_ill (instead of 2822 * the actual ill on which the packet was received), but this 2823 * is ok because we don't actually need the real ira_rill. 2824 * to send the icmp unreachable to the sender. 
2825 */ 2826 iras.ira_ill = iras.ira_rill = ill; 2827 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 2828 iras.ira_rifindex = iras.ira_ruifindex; 2829 2830 ip1dbg(("ndp_resolv_failed: dst %s\n", 2831 inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 2832 mutex_enter(&ncec->ncec_lock); 2833 mp = ncec->ncec_qd_mp; 2834 ncec->ncec_qd_mp = NULL; 2835 ncec->ncec_nprobes = 0; 2836 mutex_exit(&ncec->ncec_lock); 2837 while (mp != NULL) { 2838 nxt_mp = mp->b_next; 2839 mp->b_next = NULL; 2840 2841 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 2842 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 2843 mp, ill); 2844 icmp_unreachable_v6(mp, 2845 ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); 2846 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2847 mp = nxt_mp; 2848 } 2849 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 2850 } 2851 2852 /* 2853 * Handle the completion of NDP and ARP resolution. 2854 */ 2855 void 2856 nce_resolv_ok(ncec_t *ncec) 2857 { 2858 mblk_t *mp; 2859 uint_t pkt_len; 2860 iaflags_t ixaflags = IXAF_NO_TRACE; 2861 nce_t *nce; 2862 ill_t *ill = ncec->ncec_ill; 2863 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 2864 ip_stack_t *ipst = ill->ill_ipst; 2865 2866 if (IS_IPMP(ncec->ncec_ill)) { 2867 nce_resolv_ipmp_ok(ncec); 2868 return; 2869 } 2870 /* non IPMP case */ 2871 2872 mutex_enter(&ncec->ncec_lock); 2873 ASSERT(ncec->ncec_nprobes == 0); 2874 mp = ncec->ncec_qd_mp; 2875 ncec->ncec_qd_mp = NULL; 2876 mutex_exit(&ncec->ncec_lock); 2877 2878 while (mp != NULL) { 2879 mblk_t *nxt_mp; 2880 2881 if (ill->ill_isv6) { 2882 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 2883 2884 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 2885 } else { 2886 ipha_t *ipha = (ipha_t *)mp->b_rptr; 2887 2888 ixaflags |= IXAF_IS_IPV4; 2889 pkt_len = ntohs(ipha->ipha_length); 2890 } 2891 nxt_mp = mp->b_next; 2892 mp->b_next = NULL; 2893 /* 2894 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no 2895 * longer available, but it's ok to 
drop this flag because TCP 2896 * has its own flow-control in effect, so TCP packets 2897 * are not likely to get here when flow-control is in effect. 2898 */ 2899 mutex_enter(&ill->ill_lock); 2900 nce = nce_lookup(ill, &ncec->ncec_addr); 2901 mutex_exit(&ill->ill_lock); 2902 2903 if (nce == NULL) { 2904 if (isv6) { 2905 BUMP_MIB(&ipst->ips_ip6_mib, 2906 ipIfStatsOutDiscards); 2907 } else { 2908 BUMP_MIB(&ipst->ips_ip_mib, 2909 ipIfStatsOutDiscards); 2910 } 2911 ip_drop_output("ipIfStatsOutDiscards - no nce", 2912 mp, NULL); 2913 freemsg(mp); 2914 } else { 2915 /* 2916 * We don't know the zoneid, but 2917 * ip_xmit does not care since IXAF_NO_TRACE 2918 * is set. (We traced the packet the first 2919 * time through ip_xmit.) 2920 */ 2921 (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, 2922 ALL_ZONES, 0, NULL); 2923 nce_refrele(nce); 2924 } 2925 mp = nxt_mp; 2926 } 2927 2928 ncec_cb_dispatch(ncec); /* complete callbacks */ 2929 } 2930 2931 /* 2932 * Called by SIOCSNDP* ioctl to add/change an ncec entry 2933 * and the corresponding attributes. 2934 * Disallow states other than ND_REACHABLE or ND_STALE. 
2935 */ 2936 int 2937 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) 2938 { 2939 sin6_t *sin6; 2940 in6_addr_t *addr; 2941 ncec_t *ncec; 2942 nce_t *nce; 2943 int err = 0; 2944 uint16_t new_flags = 0; 2945 uint16_t old_flags = 0; 2946 int inflags = lnr->lnr_flags; 2947 ip_stack_t *ipst = ill->ill_ipst; 2948 boolean_t do_postprocess = B_FALSE; 2949 2950 ASSERT(ill->ill_isv6); 2951 if ((lnr->lnr_state_create != ND_REACHABLE) && 2952 (lnr->lnr_state_create != ND_STALE)) 2953 return (EINVAL); 2954 2955 sin6 = (sin6_t *)&lnr->lnr_addr; 2956 addr = &sin6->sin6_addr; 2957 2958 mutex_enter(&ipst->ips_ndp6->ndp_g_lock); 2959 ASSERT(!IS_UNDER_IPMP(ill)); 2960 nce = nce_lookup_addr(ill, addr); 2961 if (nce != NULL) 2962 new_flags = nce->nce_common->ncec_flags; 2963 2964 switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { 2965 case NDF_ISROUTER_ON: 2966 new_flags |= NCE_F_ISROUTER; 2967 break; 2968 case NDF_ISROUTER_OFF: 2969 new_flags &= ~NCE_F_ISROUTER; 2970 break; 2971 case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): 2972 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2973 if (nce != NULL) 2974 nce_refrele(nce); 2975 return (EINVAL); 2976 } 2977 2978 switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { 2979 case NDF_ANYCAST_ON: 2980 new_flags |= NCE_F_ANYCAST; 2981 break; 2982 case NDF_ANYCAST_OFF: 2983 new_flags &= ~NCE_F_ANYCAST; 2984 break; 2985 case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): 2986 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 2987 if (nce != NULL) 2988 nce_refrele(nce); 2989 return (EINVAL); 2990 } 2991 2992 if (nce == NULL) { 2993 err = nce_add_v6(ill, 2994 (uchar_t *)lnr->lnr_hdw_addr, 2995 ill->ill_phys_addr_length, 2996 addr, 2997 new_flags, 2998 lnr->lnr_state_create, 2999 &nce); 3000 if (err != 0) { 3001 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3002 ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); 3003 return (err); 3004 } else { 3005 do_postprocess = B_TRUE; 3006 } 3007 } 3008 ncec = nce->nce_common; 3009 old_flags = ncec->ncec_flags; 3010 if (old_flags & 
NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { 3011 ncec_router_to_host(ncec); 3012 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3013 if (do_postprocess) 3014 err = nce_add_v6_postprocess(nce); 3015 nce_refrele(nce); 3016 return (0); 3017 } 3018 mutex_exit(&ipst->ips_ndp6->ndp_g_lock); 3019 3020 if (do_postprocess) 3021 err = nce_add_v6_postprocess(nce); 3022 /* 3023 * err cannot be anything other than 0 because we don't support 3024 * proxy arp of static addresses. 3025 */ 3026 ASSERT(err == 0); 3027 3028 mutex_enter(&ncec->ncec_lock); 3029 ncec->ncec_flags = new_flags; 3030 mutex_exit(&ncec->ncec_lock); 3031 /* 3032 * Note that we ignore the state at this point, which 3033 * should be either STALE or REACHABLE. Instead we let 3034 * the link layer address passed in to determine the state 3035 * much like incoming packets. 3036 */ 3037 nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); 3038 nce_refrele(nce); 3039 return (0); 3040 } 3041 3042 /* 3043 * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up 3044 * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must 3045 * be held to ensure that they are in the same group. 3046 */ 3047 static nce_t * 3048 nce_fastpath_create(ill_t *ill, ncec_t *ncec) 3049 { 3050 3051 nce_t *nce; 3052 3053 nce = nce_ill_lookup_then_add(ill, ncec); 3054 3055 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3056 return (nce); 3057 3058 /* 3059 * hold the ncec_lock to synchronize with nce_update() so that, 3060 * at the end of this function, the contents of nce_dlur_mp are 3061 * consistent with ncec->ncec_lladdr, even though some intermediate 3062 * packet may have been sent out with a mangled address, which would 3063 * only be a transient condition. 
3064 */ 3065 mutex_enter(&ncec->ncec_lock); 3066 if (ncec->ncec_lladdr != NULL) { 3067 bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr + 3068 NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length); 3069 } else { 3070 nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap, 3071 ill->ill_sap_length); 3072 } 3073 mutex_exit(&ncec->ncec_lock); 3074 return (nce); 3075 } 3076 3077 /* 3078 * we make nce_fp_mp to have an M_DATA prepend. 3079 * The caller ensures there is hold on ncec for this function. 3080 * Note that since ill_fastpath_probe() copies the mblk there is 3081 * no need to hold the nce or ncec beyond this function. 3082 * 3083 * If the caller has passed in a non-null ncec_nce to nce_faspath() that 3084 * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill 3085 * and will be returned back by this function, so that no extra nce_refrele 3086 * is required for the caller. The calls from nce_add_common() use this 3087 * method. All other callers (that pass in NULL ncec_nce) will have to do a 3088 * nce_refrele of the returned nce (when it is non-null). 3089 */ 3090 nce_t * 3091 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) 3092 { 3093 nce_t *nce; 3094 ill_t *ill = ncec->ncec_ill; 3095 3096 ASSERT(ill != NULL); 3097 3098 if (IS_IPMP(ill) && trigger_fp_req) { 3099 trigger_fp_req = B_FALSE; 3100 ipmp_ncec_fastpath(ncec, ill); 3101 3102 } 3103 /* 3104 * If the caller already has the nce corresponding to the ill, use 3105 * that one. Otherwise we have to lookup/add the nce. Calls from 3106 * nce_add_common() fall in the former category, and have just done 3107 * the nce lookup/add that can be reused. 
3108 */ 3109 if (ncec_nce == NULL) 3110 nce = nce_fastpath_create(ill, ncec); 3111 else 3112 nce = ncec_nce; 3113 3114 if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) 3115 return (nce); 3116 3117 if (trigger_fp_req) 3118 nce_fastpath_trigger(nce); 3119 return (nce); 3120 } 3121 3122 /* 3123 * Trigger fastpath on nce. No locks may be held. 3124 */ 3125 static void 3126 nce_fastpath_trigger(nce_t *nce) 3127 { 3128 int res; 3129 ill_t *ill = nce->nce_ill; 3130 ncec_t *ncec = nce->nce_common; 3131 3132 res = ill_fastpath_probe(ill, nce->nce_dlur_mp); 3133 /* 3134 * EAGAIN is an indication of a transient error 3135 * i.e. allocation failure etc. leave the ncec in the list it 3136 * will be updated when another probe happens for another ire 3137 * if not it will be taken out of the list when the ire is 3138 * deleted. 3139 */ 3140 if (res != 0 && res != EAGAIN && res != ENOTSUP) 3141 nce_fastpath_list_delete(ill, ncec, NULL); 3142 } 3143 3144 /* 3145 * Add ncec to the nce fastpath list on ill. 3146 */ 3147 static nce_t * 3148 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec) 3149 { 3150 nce_t *nce = NULL; 3151 3152 ASSERT(MUTEX_HELD(&ill->ill_lock)); 3153 /* 3154 * Atomically ensure that the ill is not CONDEMNED and is not going 3155 * down, before adding the NCE. 3156 */ 3157 if (ill->ill_state_flags & ILL_CONDEMNED) 3158 return (NULL); 3159 mutex_enter(&ncec->ncec_lock); 3160 /* 3161 * if ncec has not been deleted and 3162 * is not already in the list add it. 
3163 */ 3164 if (!NCE_ISCONDEMNED(ncec)) { 3165 nce = nce_lookup(ill, &ncec->ncec_addr); 3166 if (nce != NULL) 3167 goto done; 3168 nce = nce_add(ill, ncec); 3169 } 3170 done: 3171 mutex_exit(&ncec->ncec_lock); 3172 return (nce); 3173 } 3174 3175 nce_t * 3176 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) 3177 { 3178 nce_t *nce; 3179 3180 mutex_enter(&ill->ill_lock); 3181 nce = nce_ill_lookup_then_add_locked(ill, ncec); 3182 mutex_exit(&ill->ill_lock); 3183 return (nce); 3184 } 3185 3186 3187 /* 3188 * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted 3189 * nce is added to the 'dead' list, and the caller must nce_refrele() the 3190 * entry after all locks have been dropped. 3191 */ 3192 void 3193 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead) 3194 { 3195 nce_t *nce; 3196 3197 ASSERT(ill != NULL); 3198 3199 /* first clean out any nce pointers in the under_ills */ 3200 if (IS_IPMP(ill)) 3201 ipmp_ncec_flush_nce(ncec); 3202 3203 /* now the ill itself */ 3204 mutex_enter(&ill->ill_lock); 3205 for (nce = list_head(&ill->ill_nce); nce != NULL; 3206 nce = list_next(&ill->ill_nce, nce)) { 3207 if (nce->nce_common == ncec) { 3208 nce_refhold(nce); 3209 nce_delete(nce); 3210 break; 3211 } 3212 } 3213 mutex_exit(&ill->ill_lock); 3214 if (nce != NULL) { 3215 if (dead == NULL) 3216 nce_refrele(nce); 3217 else 3218 list_insert_tail(dead, nce); 3219 } 3220 } 3221 3222 /* 3223 * when the fastpath response does not fit in the datab 3224 * associated with the existing nce_fp_mp, we delete and 3225 * add the nce to retrigger fastpath based on the information 3226 * in the ncec_t. 
3227 */ 3228 static nce_t * 3229 nce_delete_then_add(nce_t *nce) 3230 { 3231 ill_t *ill = nce->nce_ill; 3232 nce_t *newnce = NULL; 3233 3234 ip0dbg(("nce_delete_then_add nce %p ill %s\n", 3235 (void *)nce, ill->ill_name)); 3236 mutex_enter(&ill->ill_lock); 3237 mutex_enter(&nce->nce_common->ncec_lock); 3238 nce_delete(nce); 3239 /* 3240 * Make sure that ncec is not condemned before adding. We hold the 3241 * ill_lock and ncec_lock to synchronize with ncec_delete() and 3242 * ipmp_ncec_flush_nce() 3243 */ 3244 if (!NCE_ISCONDEMNED(nce->nce_common)) 3245 newnce = nce_add(ill, nce->nce_common); 3246 mutex_exit(&nce->nce_common->ncec_lock); 3247 mutex_exit(&ill->ill_lock); 3248 nce_refrele(nce); 3249 return (newnce); /* could be null if nomem */ 3250 } 3251 3252 typedef struct nce_fp_match_s { 3253 nce_t *nce_fp_match_res; 3254 mblk_t *nce_fp_match_ack_mp; 3255 } nce_fp_match_t; 3256 3257 /* ARGSUSED */ 3258 static int 3259 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) 3260 { 3261 nce_fp_match_t *nce_fp_marg = arg; 3262 ncec_t *ncec = nce->nce_common; 3263 mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; 3264 uchar_t *mp_rptr, *ud_mp_rptr; 3265 mblk_t *ud_mp = nce->nce_dlur_mp; 3266 ptrdiff_t cmplen; 3267 3268 /* 3269 * mp is the mp associated with the fastpath ack. 3270 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t 3271 * under consideration. If the contents match, then the 3272 * fastpath ack is used to update the nce. 3273 */ 3274 if (ud_mp == NULL) 3275 return (0); 3276 mp_rptr = mp->b_rptr; 3277 cmplen = mp->b_wptr - mp_rptr; 3278 ASSERT(cmplen >= 0); 3279 3280 ud_mp_rptr = ud_mp->b_rptr; 3281 /* 3282 * The ncec is locked here to prevent any other threads from accessing 3283 * and changing nce_dlur_mp when the address becomes resolved to an 3284 * lla while we're in the middle of looking at and comparing the 3285 * hardware address (lla). 
It is also locked to prevent multiple 3286 * threads in nce_fastpath() from examining nce_dlur_mp at the same 3287 * time. 3288 */ 3289 mutex_enter(&ncec->ncec_lock); 3290 if (ud_mp->b_wptr - ud_mp_rptr != cmplen || 3291 bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { 3292 nce_fp_marg->nce_fp_match_res = nce; 3293 mutex_exit(&ncec->ncec_lock); 3294 nce_refhold(nce); 3295 return (1); 3296 } 3297 mutex_exit(&ncec->ncec_lock); 3298 return (0); 3299 } 3300 3301 /* 3302 * Update all NCE's that are not in fastpath mode and 3303 * have an nce_fp_mp that matches mp. mp->b_cont contains 3304 * the fastpath header. 3305 * 3306 * Returns TRUE if entry should be dequeued, or FALSE otherwise. 3307 */ 3308 void 3309 nce_fastpath_update(ill_t *ill, mblk_t *mp) 3310 { 3311 nce_fp_match_t nce_fp_marg; 3312 nce_t *nce; 3313 mblk_t *nce_fp_mp, *fp_mp; 3314 3315 nce_fp_marg.nce_fp_match_res = NULL; 3316 nce_fp_marg.nce_fp_match_ack_mp = mp; 3317 3318 nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); 3319 3320 if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) 3321 return; 3322 3323 mutex_enter(&nce->nce_lock); 3324 nce_fp_mp = nce->nce_fp_mp; 3325 3326 if (nce_fp_mp != NULL) { 3327 fp_mp = mp->b_cont; 3328 if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > 3329 nce_fp_mp->b_datap->db_lim) { 3330 mutex_exit(&nce->nce_lock); 3331 nce = nce_delete_then_add(nce); 3332 if (nce == NULL) { 3333 return; 3334 } 3335 mutex_enter(&nce->nce_lock); 3336 nce_fp_mp = nce->nce_fp_mp; 3337 } 3338 } 3339 3340 /* Matched - install mp as the fastpath mp */ 3341 if (nce_fp_mp == NULL) { 3342 fp_mp = dupb(mp->b_cont); 3343 nce->nce_fp_mp = fp_mp; 3344 } else { 3345 fp_mp = mp->b_cont; 3346 bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); 3347 nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr 3348 + MBLKL(fp_mp); 3349 } 3350 mutex_exit(&nce->nce_lock); 3351 nce_refrele(nce); 3352 } 3353 3354 /* 3355 * Return a pointer to a given option in the packet. 
3356 * Assumes that option part of the packet have already been validated. 3357 */ 3358 nd_opt_hdr_t * 3359 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) 3360 { 3361 while (optlen > 0) { 3362 if (opt->nd_opt_type == opt_type) 3363 return (opt); 3364 optlen -= 8 * opt->nd_opt_len; 3365 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3366 } 3367 return (NULL); 3368 } 3369 3370 /* 3371 * Verify all option lengths present are > 0, also check to see 3372 * if the option lengths and packet length are consistent. 3373 */ 3374 boolean_t 3375 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) 3376 { 3377 ASSERT(opt != NULL); 3378 while (optlen > 0) { 3379 if (opt->nd_opt_len == 0) 3380 return (B_FALSE); 3381 optlen -= 8 * opt->nd_opt_len; 3382 if (optlen < 0) 3383 return (B_FALSE); 3384 opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); 3385 } 3386 return (B_TRUE); 3387 } 3388 3389 /* 3390 * ncec_walk function. 3391 * Free a fraction of the NCE cache entries. 3392 * 3393 * A possible optimization here would be to use ncec_last where possible, and 3394 * delete the least-frequently used entry, which would require more complex 3395 * computation as we walk through the ncec's (e.g., track ncec entries by 3396 * order of ncec_last and/or maintain state) 3397 */ 3398 static void 3399 ncec_cache_reclaim(ncec_t *ncec, char *arg) 3400 { 3401 ip_stack_t *ipst = ncec->ncec_ipst; 3402 uint_t fraction = *(uint_t *)arg; 3403 uint_t rand; 3404 3405 if ((ncec->ncec_flags & 3406 (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { 3407 return; 3408 } 3409 3410 rand = (uint_t)ddi_get_lbolt() + 3411 NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); 3412 if ((rand/fraction)*fraction == rand) { 3413 IP_STAT(ipst, ip_nce_reclaim_deleted); 3414 ncec_delete(ncec); 3415 } 3416 } 3417 3418 /* 3419 * kmem_cache callback to free up memory. 3420 * 3421 * For now we just delete a fixed fraction. 
3422 */ 3423 static void 3424 ip_nce_reclaim_stack(ip_stack_t *ipst) 3425 { 3426 uint_t fraction = ipst->ips_ip_nce_reclaim_fraction; 3427 3428 IP_STAT(ipst, ip_nce_reclaim_calls); 3429 3430 ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst); 3431 3432 /* 3433 * Walk all CONNs that can have a reference on an ire, ncec or dce. 3434 * Get them to update any stale references to drop any refholds they 3435 * have. 3436 */ 3437 ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); 3438 } 3439 3440 /* 3441 * Called by the memory allocator subsystem directly, when the system 3442 * is running low on memory. 3443 */ 3444 /* ARGSUSED */ 3445 void 3446 ip_nce_reclaim(void *args) 3447 { 3448 netstack_handle_t nh; 3449 netstack_t *ns; 3450 3451 netstack_next_init(&nh); 3452 while ((ns = netstack_next(&nh)) != NULL) { 3453 ip_nce_reclaim_stack(ns->netstack_ip); 3454 netstack_rele(ns); 3455 } 3456 netstack_next_fini(&nh); 3457 } 3458 3459 #ifdef DEBUG 3460 void 3461 ncec_trace_ref(ncec_t *ncec) 3462 { 3463 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3464 3465 if (ncec->ncec_trace_disable) 3466 return; 3467 3468 if (!th_trace_ref(ncec, ncec->ncec_ipst)) { 3469 ncec->ncec_trace_disable = B_TRUE; 3470 ncec_trace_cleanup(ncec); 3471 } 3472 } 3473 3474 void 3475 ncec_untrace_ref(ncec_t *ncec) 3476 { 3477 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3478 3479 if (!ncec->ncec_trace_disable) 3480 th_trace_unref(ncec); 3481 } 3482 3483 static void 3484 ncec_trace_cleanup(const ncec_t *ncec) 3485 { 3486 th_trace_cleanup(ncec, ncec->ncec_trace_disable); 3487 } 3488 #endif 3489 3490 /* 3491 * Called when address resolution fails due to a timeout. 3492 * Send an ICMP unreachable in response to all queued packets. 
3493 */ 3494 void 3495 arp_resolv_failed(ncec_t *ncec) 3496 { 3497 mblk_t *mp, *nxt_mp; 3498 char buf[INET6_ADDRSTRLEN]; 3499 struct in_addr ipv4addr; 3500 ill_t *ill = ncec->ncec_ill; 3501 ip_stack_t *ipst = ncec->ncec_ipst; 3502 ip_recv_attr_t iras; 3503 3504 bzero(&iras, sizeof (iras)); 3505 iras.ira_flags = IRAF_IS_IPV4; 3506 /* 3507 * we are setting the ira_rill to the ipmp_ill (instead of 3508 * the actual ill on which the packet was received), but this 3509 * is ok because we don't actually need the real ira_rill. 3510 * to send the icmp unreachable to the sender. 3511 */ 3512 iras.ira_ill = iras.ira_rill = ill; 3513 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; 3514 iras.ira_rifindex = iras.ira_ruifindex; 3515 3516 IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr); 3517 ip3dbg(("arp_resolv_failed: dst %s\n", 3518 inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); 3519 mutex_enter(&ncec->ncec_lock); 3520 mp = ncec->ncec_qd_mp; 3521 ncec->ncec_qd_mp = NULL; 3522 ncec->ncec_nprobes = 0; 3523 mutex_exit(&ncec->ncec_lock); 3524 while (mp != NULL) { 3525 nxt_mp = mp->b_next; 3526 mp->b_next = NULL; 3527 3528 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); 3529 ip_drop_output("ipIfStatsOutDiscards - address unreachable", 3530 mp, ill); 3531 if (ipst->ips_ip_arp_icmp_error) { 3532 ip3dbg(("arp_resolv_failed: " 3533 "Calling icmp_unreachable\n")); 3534 icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); 3535 } else { 3536 freemsg(mp); 3537 } 3538 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 3539 mp = nxt_mp; 3540 } 3541 ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ 3542 } 3543 3544 /* 3545 * if ill is an under_ill, translate it to the ipmp_ill and add the 3546 * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and 3547 * one on the underlying in_ill) will be created for the 3548 * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill. 
3549 */ 3550 int 3551 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3552 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3553 { 3554 int err; 3555 in6_addr_t addr6; 3556 ip_stack_t *ipst = ill->ill_ipst; 3557 nce_t *nce, *upper_nce = NULL; 3558 ill_t *in_ill = ill, *under = NULL; 3559 boolean_t need_ill_refrele = B_FALSE; 3560 3561 if (flags & NCE_F_MCAST) { 3562 /* 3563 * hw_addr will be figured out in nce_set_multicast_v4; 3564 * caller needs to pass in the cast_ill for ipmp 3565 */ 3566 ASSERT(hw_addr == NULL); 3567 ASSERT(!IS_IPMP(ill)); 3568 err = nce_set_multicast_v4(ill, addr, flags, newnce); 3569 return (err); 3570 } 3571 3572 if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { 3573 ill = ipmp_ill_hold_ipmp_ill(ill); 3574 if (ill == NULL) 3575 return (ENXIO); 3576 need_ill_refrele = B_TRUE; 3577 } 3578 if ((flags & NCE_F_BCAST) != 0) { 3579 /* 3580 * IPv4 broadcast ncec: compute the hwaddr. 3581 */ 3582 if (IS_IPMP(ill)) { 3583 under = ipmp_ill_get_xmit_ill(ill, B_FALSE); 3584 if (under == NULL) { 3585 if (need_ill_refrele) 3586 ill_refrele(ill); 3587 return (ENETDOWN); 3588 } 3589 hw_addr = under->ill_bcast_mp->b_rptr + 3590 NCE_LL_ADDR_OFFSET(under); 3591 hw_addr_len = under->ill_phys_addr_length; 3592 } else { 3593 hw_addr = ill->ill_bcast_mp->b_rptr + 3594 NCE_LL_ADDR_OFFSET(ill), 3595 hw_addr_len = ill->ill_phys_addr_length; 3596 } 3597 } 3598 3599 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3600 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3601 nce = nce_lookup_addr(ill, &addr6); 3602 if (nce == NULL) { 3603 err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, 3604 state, &nce); 3605 } else { 3606 err = EEXIST; 3607 } 3608 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3609 if (err == 0) 3610 err = nce_add_v4_postprocess(nce); 3611 3612 if (in_ill != ill && nce != NULL) { 3613 nce_t *under_nce = NULL; 3614 3615 /* 3616 * in_ill was the under_ill. Try to create the under_nce. 
3617 * Hold the ill_g_lock to prevent changes to group membership 3618 * until we are done. 3619 */ 3620 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3621 if (!IS_IN_SAME_ILLGRP(in_ill, ill)) { 3622 DTRACE_PROBE2(ill__not__in__group, nce_t *, nce, 3623 ill_t *, ill); 3624 rw_exit(&ipst->ips_ill_g_lock); 3625 err = ENXIO; 3626 nce_refrele(nce); 3627 nce = NULL; 3628 goto bail; 3629 } 3630 under_nce = nce_fastpath_create(in_ill, nce->nce_common); 3631 if (under_nce == NULL) { 3632 rw_exit(&ipst->ips_ill_g_lock); 3633 err = EINVAL; 3634 nce_refrele(nce); 3635 nce = NULL; 3636 goto bail; 3637 } 3638 rw_exit(&ipst->ips_ill_g_lock); 3639 upper_nce = nce; 3640 nce = under_nce; /* will be returned to caller */ 3641 if (NCE_ISREACHABLE(nce->nce_common)) 3642 nce_fastpath_trigger(under_nce); 3643 } 3644 if (nce != NULL) { 3645 if (newnce != NULL) 3646 *newnce = nce; 3647 else 3648 nce_refrele(nce); 3649 } 3650 bail: 3651 if (under != NULL) 3652 ill_refrele(under); 3653 if (upper_nce != NULL) 3654 nce_refrele(upper_nce); 3655 if (need_ill_refrele) 3656 ill_refrele(ill); 3657 3658 return (err); 3659 } 3660 3661 /* 3662 * NDP Cache Entry creation routine for IPv4. 3663 * This routine must always be called with ndp4->ndp_g_lock held. 3664 * Prior to return, ncec_refcnt is incremented. 3665 * 3666 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses 3667 * are always added pointing at the ipmp_ill. Thus, when the ill passed 3668 * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t 3669 * entries will be created, both pointing at the same ncec_t. The nce_t 3670 * entries will have their nce_ill set to the ipmp_ill and the under_ill 3671 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. 3672 * Local addresses are always created on the ill passed to nce_add_v4. 
3673 */ 3674 int 3675 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 3676 const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) 3677 { 3678 int err; 3679 boolean_t is_multicast = (flags & NCE_F_MCAST); 3680 struct in6_addr addr6; 3681 nce_t *nce; 3682 3683 ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); 3684 ASSERT(!ill->ill_isv6); 3685 ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); 3686 3687 IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); 3688 err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, 3689 &nce); 3690 ASSERT(newnce != NULL); 3691 *newnce = nce; 3692 return (err); 3693 } 3694 3695 /* 3696 * Post-processing routine to be executed after nce_add_v4(). This function 3697 * triggers fastpath (if appropriate) and DAD on the newly added nce entry 3698 * and must be called without any locks held. 3699 * 3700 * Always returns 0, but we return an int to keep this symmetric with the 3701 * IPv6 counter-part. 3702 */ 3703 int 3704 nce_add_v4_postprocess(nce_t *nce) 3705 { 3706 ncec_t *ncec = nce->nce_common; 3707 uint16_t flags = ncec->ncec_flags; 3708 boolean_t ndp_need_dad = B_FALSE; 3709 boolean_t dropped; 3710 clock_t delay; 3711 ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; 3712 uchar_t *hw_addr = ncec->ncec_lladdr; 3713 boolean_t trigger_fastpath = B_TRUE; 3714 3715 /* 3716 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then 3717 * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
3718 * We call nce_fastpath from nce_update if the link layer address of 3719 * the peer changes from nce_update 3720 */ 3721 if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && 3722 ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) 3723 trigger_fastpath = B_FALSE; 3724 3725 if (trigger_fastpath) 3726 nce_fastpath_trigger(nce); 3727 3728 if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { 3729 /* 3730 * Either the caller (by passing in ND_PROBE) 3731 * or nce_add_common() (by the internally computed state 3732 * based on ncec_addr and ill_net_type) has determined 3733 * that this unicast entry needs DAD. Trigger DAD. 3734 */ 3735 ndp_need_dad = B_TRUE; 3736 } else if (flags & NCE_F_UNSOL_ADV) { 3737 /* 3738 * We account for the transmit below by assigning one 3739 * less than the ndd variable. Subsequent decrements 3740 * are done in nce_timer. 3741 */ 3742 mutex_enter(&ncec->ncec_lock); 3743 ncec->ncec_unsolicit_count = 3744 ipst->ips_ip_arp_publish_count - 1; 3745 mutex_exit(&ncec->ncec_lock); 3746 dropped = arp_announce(ncec); 3747 mutex_enter(&ncec->ncec_lock); 3748 if (dropped) 3749 ncec->ncec_unsolicit_count++; 3750 else 3751 ncec->ncec_last_time_defended = ddi_get_lbolt(); 3752 if (ncec->ncec_unsolicit_count != 0) { 3753 nce_start_timer(ncec, 3754 ipst->ips_ip_arp_publish_interval); 3755 } 3756 mutex_exit(&ncec->ncec_lock); 3757 } 3758 3759 /* 3760 * If ncec_xmit_interval is 0, user has configured us to send the first 3761 * probe right away. Do so, and set up for the subsequent probes. 3762 */ 3763 if (ndp_need_dad) { 3764 mutex_enter(&ncec->ncec_lock); 3765 if (ncec->ncec_pcnt == 0) { 3766 /* 3767 * DAD probes and announce can be 3768 * administratively disabled by setting the 3769 * probe_count to zero. Restart the timer in 3770 * this case to mark the ipif as ready. 
3771 */ 3772 ncec->ncec_unsolicit_count = 0; 3773 mutex_exit(&ncec->ncec_lock); 3774 nce_restart_timer(ncec, 0); 3775 } else { 3776 mutex_exit(&ncec->ncec_lock); 3777 delay = ((ncec->ncec_flags & NCE_F_FAST) ? 3778 ipst->ips_arp_probe_delay : 3779 ipst->ips_arp_fastprobe_delay); 3780 nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); 3781 } 3782 } 3783 return (0); 3784 } 3785 3786 /* 3787 * ncec_walk routine to update all entries that have a given destination or 3788 * gateway address and cached link layer (MAC) address. This is used when ARP 3789 * informs us that a network-to-link-layer mapping may have changed. 3790 */ 3791 void 3792 nce_update_hw_changed(ncec_t *ncec, void *arg) 3793 { 3794 nce_hw_map_t *hwm = arg; 3795 ipaddr_t ncec_addr; 3796 3797 if (ncec->ncec_state != ND_REACHABLE) 3798 return; 3799 3800 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); 3801 if (ncec_addr != hwm->hwm_addr) 3802 return; 3803 3804 mutex_enter(&ncec->ncec_lock); 3805 if (hwm->hwm_flags != 0) 3806 ncec->ncec_flags = hwm->hwm_flags; 3807 nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); 3808 mutex_exit(&ncec->ncec_lock); 3809 } 3810 3811 void 3812 ncec_refhold(ncec_t *ncec) 3813 { 3814 mutex_enter(&(ncec)->ncec_lock); 3815 (ncec)->ncec_refcnt++; 3816 ASSERT((ncec)->ncec_refcnt != 0); 3817 #ifdef DEBUG 3818 ncec_trace_ref(ncec); 3819 #endif 3820 mutex_exit(&(ncec)->ncec_lock); 3821 } 3822 3823 void 3824 ncec_refhold_notr(ncec_t *ncec) 3825 { 3826 mutex_enter(&(ncec)->ncec_lock); 3827 (ncec)->ncec_refcnt++; 3828 ASSERT((ncec)->ncec_refcnt != 0); 3829 mutex_exit(&(ncec)->ncec_lock); 3830 } 3831 3832 static void 3833 ncec_refhold_locked(ncec_t *ncec) 3834 { 3835 ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); 3836 (ncec)->ncec_refcnt++; 3837 #ifdef DEBUG 3838 ncec_trace_ref(ncec); 3839 #endif 3840 } 3841 3842 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */ 3843 void 3844 ncec_refrele(ncec_t *ncec) 3845 { 3846 mutex_enter(&(ncec)->ncec_lock); 3847 #ifdef DEBUG 3848 
ncec_untrace_ref(ncec); 3849 #endif 3850 ASSERT((ncec)->ncec_refcnt != 0); 3851 if (--(ncec)->ncec_refcnt == 0) { 3852 ncec_inactive(ncec); 3853 } else { 3854 mutex_exit(&(ncec)->ncec_lock); 3855 } 3856 } 3857 3858 void 3859 ncec_refrele_notr(ncec_t *ncec) 3860 { 3861 mutex_enter(&(ncec)->ncec_lock); 3862 ASSERT((ncec)->ncec_refcnt != 0); 3863 if (--(ncec)->ncec_refcnt == 0) { 3864 ncec_inactive(ncec); 3865 } else { 3866 mutex_exit(&(ncec)->ncec_lock); 3867 } 3868 } 3869 3870 /* 3871 * Common to IPv4 and IPv6. 3872 */ 3873 void 3874 nce_restart_timer(ncec_t *ncec, uint_t ms) 3875 { 3876 timeout_id_t tid; 3877 3878 ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock)); 3879 3880 /* First cancel any running timer */ 3881 mutex_enter(&ncec->ncec_lock); 3882 tid = ncec->ncec_timeout_id; 3883 ncec->ncec_timeout_id = 0; 3884 if (tid != 0) { 3885 mutex_exit(&ncec->ncec_lock); 3886 (void) untimeout(tid); 3887 mutex_enter(&ncec->ncec_lock); 3888 } 3889 3890 /* Restart timer */ 3891 nce_start_timer(ncec, ms); 3892 mutex_exit(&ncec->ncec_lock); 3893 } 3894 3895 static void 3896 nce_start_timer(ncec_t *ncec, uint_t ms) 3897 { 3898 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 3899 /* 3900 * Don't start the timer if the ncec has been deleted, or if the timer 3901 * is already running 3902 */ 3903 if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) { 3904 ncec->ncec_timeout_id = timeout(nce_timer, ncec, 3905 MSEC_TO_TICK(ms) == 0 ? 
1 : MSEC_TO_TICK(ms)); 3906 } 3907 } 3908 3909 int 3910 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, 3911 uint16_t flags, nce_t **newnce) 3912 { 3913 uchar_t *hw_addr; 3914 int err = 0; 3915 ip_stack_t *ipst = ill->ill_ipst; 3916 in6_addr_t dst6; 3917 nce_t *nce; 3918 3919 ASSERT(!ill->ill_isv6); 3920 3921 IN6_IPADDR_TO_V4MAPPED(*dst, &dst6); 3922 mutex_enter(&ipst->ips_ndp4->ndp_g_lock); 3923 if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) { 3924 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3925 goto done; 3926 } 3927 if (ill->ill_net_type == IRE_IF_RESOLVER) { 3928 /* 3929 * For IRE_IF_RESOLVER a hardware mapping can be 3930 * generated, for IRE_IF_NORESOLVER, resolution cookie 3931 * in the ill is copied in nce_add_v4(). 3932 */ 3933 hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP); 3934 if (hw_addr == NULL) { 3935 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3936 return (ENOMEM); 3937 } 3938 ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); 3939 } else { 3940 /* 3941 * IRE_IF_NORESOLVER type simply copies the resolution 3942 * cookie passed in. So no hw_addr is needed. 3943 */ 3944 hw_addr = NULL; 3945 } 3946 ASSERT(flags & NCE_F_MCAST); 3947 ASSERT(flags & NCE_F_NONUD); 3948 /* nce_state will be computed by nce_add_common() */ 3949 err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, 3950 ND_UNCHANGED, &nce); 3951 mutex_exit(&ipst->ips_ndp4->ndp_g_lock); 3952 if (err == 0) 3953 err = nce_add_v4_postprocess(nce); 3954 if (hw_addr != NULL) 3955 kmem_free(hw_addr, ill->ill_phys_addr_length); 3956 if (err != 0) { 3957 ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); 3958 return (err); 3959 } 3960 done: 3961 if (newnce != NULL) 3962 *newnce = nce; 3963 else 3964 nce_refrele(nce); 3965 return (0); 3966 } 3967 3968 /* 3969 * This is used when scanning for "old" (least recently broadcast) NCEs. We 3970 * don't want to have to walk the list for every single one, so we gather up 3971 * batches at a time. 
3972 */ 3973 #define NCE_RESCHED_LIST_LEN 8 3974 3975 typedef struct { 3976 ill_t *ncert_ill; 3977 uint_t ncert_num; 3978 ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN]; 3979 } nce_resched_t; 3980 3981 /* 3982 * Pick the longest waiting NCEs for defense. 3983 */ 3984 /* ARGSUSED */ 3985 static int 3986 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg) 3987 { 3988 nce_resched_t *ncert = arg; 3989 ncec_t **ncecs; 3990 ncec_t **ncec_max; 3991 ncec_t *ncec_temp; 3992 ncec_t *ncec = nce->nce_common; 3993 3994 ASSERT(ncec->ncec_ill == ncert->ncert_ill); 3995 /* 3996 * Only reachable entries that are ready for announcement are eligible. 3997 */ 3998 if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE) 3999 return (0); 4000 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) { 4001 ncec_refhold(ncec); 4002 ncert->ncert_nces[ncert->ncert_num++] = ncec; 4003 } else { 4004 ncecs = ncert->ncert_nces; 4005 ncec_max = ncecs + NCE_RESCHED_LIST_LEN; 4006 ncec_refhold(ncec); 4007 for (; ncecs < ncec_max; ncecs++) { 4008 ASSERT(ncec != NULL); 4009 if ((*ncecs)->ncec_last_time_defended > 4010 ncec->ncec_last_time_defended) { 4011 ncec_temp = *ncecs; 4012 *ncecs = ncec; 4013 ncec = ncec_temp; 4014 } 4015 } 4016 ncec_refrele(ncec); 4017 } 4018 return (0); 4019 } 4020 4021 /* 4022 * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this 4023 * doesn't happen very often (if at all), and thus it needn't be highly 4024 * optimized. (Note, though, that it's actually O(N) complexity, because the 4025 * outer loop is bounded by a constant rather than by the length of the list.) 
4026 */ 4027 static void 4028 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert) 4029 { 4030 ncec_t *ncec; 4031 ip_stack_t *ipst = ill->ill_ipst; 4032 uint_t i, defend_rate; 4033 4034 i = ill->ill_defend_count; 4035 ill->ill_defend_count = 0; 4036 if (ill->ill_isv6) 4037 defend_rate = ipst->ips_ndp_defend_rate; 4038 else 4039 defend_rate = ipst->ips_arp_defend_rate; 4040 /* If none could be sitting around, then don't reschedule */ 4041 if (i < defend_rate) { 4042 DTRACE_PROBE1(reschedule_none, ill_t *, ill); 4043 return; 4044 } 4045 ncert->ncert_ill = ill; 4046 while (ill->ill_defend_count < defend_rate) { 4047 nce_walk_common(ill, ncec_reschedule, ncert); 4048 for (i = 0; i < ncert->ncert_num; i++) { 4049 4050 ncec = ncert->ncert_nces[i]; 4051 mutex_enter(&ncec->ncec_lock); 4052 ncec->ncec_flags |= NCE_F_DELAYED; 4053 mutex_exit(&ncec->ncec_lock); 4054 /* 4055 * we plan to schedule this ncec, so incr the 4056 * defend_count in anticipation. 4057 */ 4058 if (++ill->ill_defend_count >= defend_rate) 4059 break; 4060 } 4061 if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) 4062 break; 4063 } 4064 } 4065 4066 /* 4067 * Check if the current rate-limiting parameters permit the sending 4068 * of another address defense announcement for both IPv4 and IPv6. 4069 * Returns B_TRUE if rate-limiting is in effect (i.e., send is not 4070 * permitted), and B_FALSE otherwise. The `defend_rate' parameter 4071 * determines how many address defense announcements are permitted 4072 * in any `defense_perio' interval. 
4073 */ 4074 static boolean_t 4075 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec) 4076 { 4077 clock_t now = ddi_get_lbolt(); 4078 ip_stack_t *ipst = ill->ill_ipst; 4079 clock_t start = ill->ill_defend_start; 4080 uint32_t elapsed, defend_period, defend_rate; 4081 nce_resched_t ncert; 4082 boolean_t ret; 4083 int i; 4084 4085 if (ill->ill_isv6) { 4086 defend_period = ipst->ips_ndp_defend_period; 4087 defend_rate = ipst->ips_ndp_defend_rate; 4088 } else { 4089 defend_period = ipst->ips_arp_defend_period; 4090 defend_rate = ipst->ips_arp_defend_rate; 4091 } 4092 if (defend_rate == 0) 4093 return (B_TRUE); 4094 bzero(&ncert, sizeof (ncert)); 4095 mutex_enter(&ill->ill_lock); 4096 if (start > 0) { 4097 elapsed = now - start; 4098 if (elapsed > SEC_TO_TICK(defend_period)) { 4099 ill->ill_defend_start = now; 4100 /* 4101 * nce_ill_reschedule will attempt to 4102 * prevent starvation by reschduling the 4103 * oldest entries, which are marked with 4104 * the NCE_F_DELAYED flag. 4105 */ 4106 nce_ill_reschedule(ill, &ncert); 4107 } 4108 } else { 4109 ill->ill_defend_start = now; 4110 } 4111 ASSERT(ill->ill_defend_count <= defend_rate); 4112 mutex_enter(&ncec->ncec_lock); 4113 if (ncec->ncec_flags & NCE_F_DELAYED) { 4114 /* 4115 * This ncec was rescheduled as one of the really old 4116 * entries needing on-going defense. The 4117 * ill_defend_count was already incremented in 4118 * nce_ill_reschedule. Go ahead and send the announce. 4119 */ 4120 ncec->ncec_flags &= ~NCE_F_DELAYED; 4121 mutex_exit(&ncec->ncec_lock); 4122 ret = B_FALSE; 4123 goto done; 4124 } 4125 mutex_exit(&ncec->ncec_lock); 4126 if (ill->ill_defend_count < defend_rate) 4127 ill->ill_defend_count++; 4128 if (ill->ill_defend_count == defend_rate) { 4129 /* 4130 * we are no longer allowed to send unbidden defense 4131 * messages. Wait for rescheduling. 
4132 */ 4133 ret = B_TRUE; 4134 } else { 4135 ret = B_FALSE; 4136 } 4137 done: 4138 mutex_exit(&ill->ill_lock); 4139 /* 4140 * After all the locks have been dropped we can restart nce timer, 4141 * and refrele the delayed ncecs 4142 */ 4143 for (i = 0; i < ncert.ncert_num; i++) { 4144 clock_t xmit_interval; 4145 ncec_t *tmp; 4146 4147 tmp = ncert.ncert_nces[i]; 4148 xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval, 4149 B_FALSE); 4150 nce_restart_timer(tmp, xmit_interval); 4151 ncec_refrele(tmp); 4152 } 4153 return (ret); 4154 } 4155 4156 boolean_t 4157 ndp_announce(ncec_t *ncec) 4158 { 4159 return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr, 4160 ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast, 4161 nce_advert_flags(ncec))); 4162 } 4163 4164 ill_t * 4165 nce_resolve_src(ncec_t *ncec, in6_addr_t *src) 4166 { 4167 mblk_t *mp; 4168 in6_addr_t src6; 4169 ipaddr_t src4; 4170 ill_t *ill = ncec->ncec_ill; 4171 ill_t *src_ill = NULL; 4172 ipif_t *ipif = NULL; 4173 boolean_t is_myaddr = NCE_MYADDR(ncec); 4174 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 4175 4176 ASSERT(src != NULL); 4177 ASSERT(IN6_IS_ADDR_UNSPECIFIED(src)); 4178 src6 = *src; 4179 if (is_myaddr) { 4180 src6 = ncec->ncec_addr; 4181 if (!isv6) 4182 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4); 4183 } else { 4184 /* 4185 * try to find one from the outgoing packet. 4186 */ 4187 mutex_enter(&ncec->ncec_lock); 4188 mp = ncec->ncec_qd_mp; 4189 if (mp != NULL) { 4190 if (isv6) { 4191 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 4192 4193 src6 = ip6h->ip6_src; 4194 } else { 4195 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4196 4197 src4 = ipha->ipha_src; 4198 IN6_IPADDR_TO_V4MAPPED(src4, &src6); 4199 } 4200 } 4201 mutex_exit(&ncec->ncec_lock); 4202 } 4203 4204 /* 4205 * For outgoing packets, if the src of outgoing packet is one 4206 * of the assigned interface addresses use it, otherwise we 4207 * will pick the source address below. 
4208 * For local addresses (is_myaddr) doing DAD, NDP announce 4209 * messages are mcast. So we use the (IPMP) cast_ill or the 4210 * (non-IPMP) ncec_ill for these message types. The only case 4211 * of unicast DAD messages are for IPv6 ND probes, for which 4212 * we find the ipif_bound_ill corresponding to the ncec_addr. 4213 */ 4214 if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) { 4215 if (isv6) { 4216 ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES, 4217 ill->ill_ipst); 4218 } else { 4219 ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES, 4220 ill->ill_ipst); 4221 } 4222 4223 /* 4224 * If no relevant ipif can be found, then it's not one of our 4225 * addresses. Reset to :: and try to find a src for the NS or 4226 * ARP request using ipif_select_source_v[4,6] below. 4227 * If an ipif can be found, but it's not yet done with 4228 * DAD verification, and we are not being invoked for 4229 * DAD (i.e., !is_myaddr), then just postpone this 4230 * transmission until later. 4231 */ 4232 if (ipif == NULL) { 4233 src6 = ipv6_all_zeros; 4234 src4 = INADDR_ANY; 4235 } else if (!ipif->ipif_addr_ready && !is_myaddr) { 4236 DTRACE_PROBE2(nce__resolve__ipif__not__ready, 4237 ncec_t *, ncec, ipif_t *, ipif); 4238 ipif_refrele(ipif); 4239 return (NULL); 4240 } 4241 } 4242 4243 if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) { 4244 /* 4245 * Pick a source address for this solicitation, but 4246 * restrict the selection to addresses assigned to the 4247 * output interface. We do this because the destination will 4248 * create a neighbor cache entry for the source address of 4249 * this packet, so the source address had better be a valid 4250 * neighbor. 
4251 */ 4252 if (isv6) { 4253 ipif = ipif_select_source_v6(ill, &ncec->ncec_addr, 4254 B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4255 B_FALSE, NULL); 4256 } else { 4257 ipaddr_t nce_addr; 4258 4259 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr); 4260 ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES, 4261 B_FALSE, NULL); 4262 } 4263 if (ipif == NULL && IS_IPMP(ill)) { 4264 ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE); 4265 4266 if (send_ill != NULL) { 4267 if (isv6) { 4268 ipif = ipif_select_source_v6(send_ill, 4269 &ncec->ncec_addr, B_TRUE, 4270 IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, 4271 B_FALSE, NULL); 4272 } else { 4273 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, 4274 src4); 4275 ipif = ipif_select_source_v4(send_ill, 4276 src4, ALL_ZONES, B_TRUE, NULL); 4277 } 4278 ill_refrele(send_ill); 4279 } 4280 } 4281 4282 if (ipif == NULL) { 4283 char buf[INET6_ADDRSTRLEN]; 4284 4285 ip1dbg(("nce_resolve_src: No source ipif for dst %s\n", 4286 inet_ntop((isv6 ? AF_INET6 : AF_INET), 4287 (char *)&ncec->ncec_addr, buf, sizeof (buf)))); 4288 DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec); 4289 return (NULL); 4290 } 4291 src6 = ipif->ipif_v6lcl_addr; 4292 } 4293 *src = src6; 4294 if (ipif != NULL) { 4295 src_ill = ipif->ipif_ill; 4296 if (IS_IPMP(src_ill)) 4297 src_ill = ipmp_ipif_hold_bound_ill(ipif); 4298 else 4299 ill_refhold(src_ill); 4300 ipif_refrele(ipif); 4301 DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec, 4302 ill_t *, src_ill); 4303 } 4304 return (src_ill); 4305 } 4306 4307 void 4308 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst, 4309 uchar_t *hwaddr, int hwaddr_len, int flags) 4310 { 4311 ill_t *ill; 4312 ncec_t *ncec; 4313 nce_t *nce; 4314 uint16_t new_state; 4315 4316 ill = (ipif ? 
ipif->ipif_ill : NULL); 4317 if (ill != NULL) { 4318 /* 4319 * only one ncec is possible 4320 */ 4321 nce = nce_lookup_v4(ill, addr); 4322 if (nce != NULL) { 4323 ncec = nce->nce_common; 4324 mutex_enter(&ncec->ncec_lock); 4325 if (NCE_ISREACHABLE(ncec)) 4326 new_state = ND_UNCHANGED; 4327 else 4328 new_state = ND_STALE; 4329 ncec->ncec_flags = flags; 4330 nce_update(ncec, new_state, hwaddr); 4331 mutex_exit(&ncec->ncec_lock); 4332 nce_refrele(nce); 4333 return; 4334 } 4335 } else { 4336 /* 4337 * ill is wildcard; clean up all ncec's and ire's 4338 * that match on addr. 4339 */ 4340 nce_hw_map_t hwm; 4341 4342 hwm.hwm_addr = *addr; 4343 hwm.hwm_hwlen = hwaddr_len; 4344 hwm.hwm_hwaddr = hwaddr; 4345 hwm.hwm_flags = flags; 4346 4347 ncec_walk_common(ipst->ips_ndp4, NULL, 4348 (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE); 4349 } 4350 } 4351 4352 /* 4353 * Common function to add ncec entries. 4354 * we always add the ncec with ncec_ill == ill, and always create 4355 * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the 4356 * ncec is !reachable. 4357 * 4358 * When the caller passes in an nce_state of ND_UNCHANGED, 4359 * nce_add_common() will determine the state of the created nce based 4360 * on the ill_net_type and nce_flags used. Otherwise, the nce will 4361 * be created with state set to the passed in nce_state. 
4362 */ 4363 static int 4364 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, 4365 const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) 4366 { 4367 static ncec_t nce_nil; 4368 uchar_t *template = NULL; 4369 int err; 4370 ncec_t *ncec; 4371 ncec_t **ncep; 4372 ip_stack_t *ipst = ill->ill_ipst; 4373 uint16_t state; 4374 boolean_t fastprobe = B_FALSE; 4375 struct ndp_g_s *ndp; 4376 nce_t *nce = NULL; 4377 mblk_t *dlur_mp = NULL; 4378 4379 if (ill->ill_isv6) 4380 ndp = ill->ill_ipst->ips_ndp6; 4381 else 4382 ndp = ill->ill_ipst->ips_ndp4; 4383 4384 *retnce = NULL; 4385 4386 ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); 4387 4388 if (IN6_IS_ADDR_UNSPECIFIED(addr)) { 4389 ip0dbg(("nce_add_common: no addr\n")); 4390 return (EINVAL); 4391 } 4392 if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { 4393 ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); 4394 return (EINVAL); 4395 } 4396 4397 if (ill->ill_isv6) { 4398 ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); 4399 } else { 4400 ipaddr_t v4addr; 4401 4402 IN6_V4MAPPED_TO_IPADDR(addr, v4addr); 4403 ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); 4404 } 4405 4406 /* 4407 * The caller has ensured that there is no nce on ill, but there could 4408 * still be an nce_common_t for the address, so that we find exisiting 4409 * ncec_t strucutures first, and atomically add a new nce_t if 4410 * one is found. The ndp_g_lock ensures that we don't cross threads 4411 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not 4412 * compare for matches across the illgrp because this function is 4413 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, 4414 * with the nce_lookup_then_add_v* passing in the ipmp_ill where 4415 * appropriate. 
4416 */ 4417 ncec = *ncep; 4418 for (; ncec != NULL; ncec = ncec->ncec_next) { 4419 if (ncec->ncec_ill == ill) { 4420 if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { 4421 /* 4422 * We should never find *retnce to be 4423 * MYADDR, since the caller may then 4424 * incorrectly restart a DAD timer that's 4425 * already running. However, if we are in 4426 * forwarding mode, and the interface is 4427 * moving in/out of groups, the data 4428 * path ire lookup (e.g., ire_revalidate_nce) 4429 * may have determined that some destination 4430 * is offlink while the control path is adding 4431 * that address as a local address. 4432 * Recover from this case by failing the 4433 * lookup 4434 */ 4435 if (NCE_MYADDR(ncec)) 4436 return (ENXIO); 4437 *retnce = nce_ill_lookup_then_add(ill, ncec); 4438 if (*retnce != NULL) 4439 break; 4440 } 4441 } 4442 } 4443 if (*retnce != NULL) /* caller must trigger fastpath on nce */ 4444 return (0); 4445 4446 ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); 4447 if (ncec == NULL) 4448 return (ENOMEM); 4449 *ncec = nce_nil; 4450 ncec->ncec_ill = ill; 4451 ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); 4452 ncec->ncec_flags = flags; 4453 ncec->ncec_ipst = ipst; /* No netstack_hold */ 4454 4455 if (!ill->ill_isv6) { 4456 ipaddr_t addr4; 4457 4458 /* 4459 * DAD probe interval and probe count are set based on 4460 * fast/slow probe settings. If the underlying link doesn't 4461 * have reliably up/down notifications or if we're working 4462 * with IPv4 169.254.0.0/16 Link Local Address space, then 4463 * don't use the fast timers. Otherwise, use them. 
4464 */ 4465 ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); 4466 IN6_V4MAPPED_TO_IPADDR(addr, addr4); 4467 if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) 4468 fastprobe = B_TRUE; 4469 if (fastprobe) { 4470 ncec->ncec_xmit_interval = 4471 ipst->ips_arp_fastprobe_interval; 4472 ncec->ncec_pcnt = 4473 ipst->ips_arp_fastprobe_count; 4474 ncec->ncec_flags |= NCE_F_FAST; 4475 } else { 4476 ncec->ncec_xmit_interval = 4477 ipst->ips_arp_probe_interval; 4478 ncec->ncec_pcnt = 4479 ipst->ips_arp_probe_count; 4480 } 4481 if (NCE_PUBLISH(ncec)) { 4482 ncec->ncec_unsolicit_count = 4483 ipst->ips_ip_arp_publish_count; 4484 } 4485 } else { 4486 /* 4487 * probe interval is constant: ILL_PROBE_INTERVAL 4488 * probe count is constant: ND_MAX_UNICAST_SOLICIT 4489 */ 4490 ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; 4491 if (NCE_PUBLISH(ncec)) { 4492 ncec->ncec_unsolicit_count = 4493 ipst->ips_ip_ndp_unsolicit_count; 4494 } 4495 } 4496 ncec->ncec_rcnt = ill->ill_xmit_count; 4497 ncec->ncec_addr = *addr; 4498 ncec->ncec_qd_mp = NULL; 4499 ncec->ncec_refcnt = 1; /* for ncec getting created */ 4500 mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); 4501 ncec->ncec_trace_disable = B_FALSE; 4502 4503 /* 4504 * ncec_lladdr holds link layer address 4505 */ 4506 if (hw_addr_len > 0) { 4507 template = kmem_alloc(hw_addr_len, KM_NOSLEEP); 4508 if (template == NULL) { 4509 err = ENOMEM; 4510 goto err_ret; 4511 } 4512 ncec->ncec_lladdr = template; 4513 ncec->ncec_lladdr_length = hw_addr_len; 4514 bzero(ncec->ncec_lladdr, hw_addr_len); 4515 } 4516 if ((flags & NCE_F_BCAST) != 0) { 4517 state = ND_REACHABLE; 4518 ASSERT(hw_addr_len > 0); 4519 } else if (ill->ill_net_type == IRE_IF_RESOLVER) { 4520 state = ND_INITIAL; 4521 } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { 4522 /* 4523 * NORESOLVER entries are always created in the REACHABLE 4524 * state. 
4525 */ 4526 state = ND_REACHABLE; 4527 if (ill->ill_phys_addr_length == IP_ADDR_LEN && 4528 ill->ill_mactype != DL_IPV4 && 4529 ill->ill_mactype != DL_6TO4) { 4530 /* 4531 * We create a nce_res_mp with the IP nexthop address 4532 * as the destination address if the physical length 4533 * is exactly 4 bytes for point-to-multipoint links 4534 * that do their own resolution from IP to link-layer 4535 * address (e.g. IP over X.25). 4536 */ 4537 bcopy((uchar_t *)addr, 4538 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4539 } 4540 if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && 4541 ill->ill_mactype != DL_IPV6) { 4542 /* 4543 * We create a nce_res_mp with the IP nexthop address 4544 * as the destination address if the physical legnth 4545 * is exactly 16 bytes for point-to-multipoint links 4546 * that do their own resolution from IP to link-layer 4547 * address. 4548 */ 4549 bcopy((uchar_t *)addr, 4550 ncec->ncec_lladdr, ill->ill_phys_addr_length); 4551 } 4552 /* 4553 * Since NUD is not part of the base IPv4 protocol definition, 4554 * IPv4 neighbor entries on NORESOLVER interfaces will never 4555 * age, and are marked NCE_F_NONUD. 4556 */ 4557 if (!ill->ill_isv6) 4558 ncec->ncec_flags |= NCE_F_NONUD; 4559 } else if (ill->ill_net_type == IRE_LOOPBACK) { 4560 state = ND_REACHABLE; 4561 } 4562 4563 if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { 4564 /* 4565 * We are adding an ncec with a deterministic hw_addr, 4566 * so the state can only be one of {REACHABLE, STALE, PROBE}. 4567 * 4568 * if we are adding a unicast ncec for the local address 4569 * it would be REACHABLE; we would be adding a ND_STALE entry 4570 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own 4571 * addresses are added in PROBE to trigger DAD. 
4572 */ 4573 if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || 4574 ill->ill_net_type == IRE_IF_NORESOLVER) 4575 state = ND_REACHABLE; 4576 else if (!NCE_PUBLISH(ncec)) 4577 state = ND_STALE; 4578 else 4579 state = ND_PROBE; 4580 if (hw_addr != NULL) 4581 nce_set_ll(ncec, hw_addr); 4582 } 4583 /* caller overrides internally computed state */ 4584 if (nce_state != ND_UNCHANGED) 4585 state = nce_state; 4586 4587 if (state == ND_PROBE) 4588 ncec->ncec_flags |= NCE_F_UNVERIFIED; 4589 4590 ncec->ncec_state = state; 4591 4592 if (state == ND_REACHABLE) { 4593 ncec->ncec_last = ncec->ncec_init_time = 4594 TICK_TO_MSEC(ddi_get_lbolt64()); 4595 } else { 4596 ncec->ncec_last = 0; 4597 if (state == ND_INITIAL) 4598 ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64()); 4599 } 4600 list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), 4601 offsetof(ncec_cb_t, ncec_cb_node)); 4602 /* 4603 * have all the memory allocations out of the way before taking locks 4604 * and adding the nce. 4605 */ 4606 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4607 if (nce == NULL) { 4608 err = ENOMEM; 4609 goto err_ret; 4610 } 4611 if (ncec->ncec_lladdr != NULL || 4612 ill->ill_net_type == IRE_IF_NORESOLVER) { 4613 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4614 ill->ill_phys_addr_length, ill->ill_sap, 4615 ill->ill_sap_length); 4616 if (dlur_mp == NULL) { 4617 err = ENOMEM; 4618 goto err_ret; 4619 } 4620 } 4621 4622 /* 4623 * Atomically ensure that the ill is not CONDEMNED, before 4624 * adding the NCE. 
4625 */ 4626 mutex_enter(&ill->ill_lock); 4627 if (ill->ill_state_flags & ILL_CONDEMNED) { 4628 mutex_exit(&ill->ill_lock); 4629 err = EINVAL; 4630 goto err_ret; 4631 } 4632 if (!NCE_MYADDR(ncec) && 4633 (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { 4634 mutex_exit(&ill->ill_lock); 4635 DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); 4636 err = EINVAL; 4637 goto err_ret; 4638 } 4639 /* 4640 * Acquire the ncec_lock even before adding the ncec to the list 4641 * so that it cannot get deleted after the ncec is added, but 4642 * before we add the nce. 4643 */ 4644 mutex_enter(&ncec->ncec_lock); 4645 if ((ncec->ncec_next = *ncep) != NULL) 4646 ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; 4647 *ncep = ncec; 4648 ncec->ncec_ptpn = ncep; 4649 4650 /* Bump up the number of ncec's referencing this ill */ 4651 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4652 (char *), "ncec", (void *), ncec); 4653 ill->ill_ncec_cnt++; 4654 /* 4655 * Since we hold the ncec_lock at this time, the ncec cannot be 4656 * condemned, and we can safely add the nce. 4657 */ 4658 *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); 4659 mutex_exit(&ncec->ncec_lock); 4660 mutex_exit(&ill->ill_lock); 4661 4662 /* caller must trigger fastpath on *retnce */ 4663 return (0); 4664 4665 err_ret: 4666 if (ncec != NULL) 4667 kmem_cache_free(ncec_cache, ncec); 4668 if (nce != NULL) 4669 kmem_cache_free(nce_cache, nce); 4670 freemsg(dlur_mp); 4671 if (template != NULL) 4672 kmem_free(template, ill->ill_phys_addr_length); 4673 return (err); 4674 } 4675 4676 /* 4677 * take a ref on the nce 4678 */ 4679 void 4680 nce_refhold(nce_t *nce) 4681 { 4682 mutex_enter(&nce->nce_lock); 4683 nce->nce_refcnt++; 4684 ASSERT((nce)->nce_refcnt != 0); 4685 mutex_exit(&nce->nce_lock); 4686 } 4687 4688 /* 4689 * release a ref on the nce; In general, this 4690 * cannot be called with locks held because nce_inactive 4691 * may result in nce_inactive which will take the ill_lock, 4692 * do ipif_ill_refrele_tail etc. 
Thus the one exception 4693 * where this can be called with locks held is when the caller 4694 * is certain that the nce_refcnt is sufficient to prevent 4695 * the invocation of nce_inactive. 4696 */ 4697 void 4698 nce_refrele(nce_t *nce) 4699 { 4700 ASSERT((nce)->nce_refcnt != 0); 4701 mutex_enter(&nce->nce_lock); 4702 if (--nce->nce_refcnt == 0) 4703 nce_inactive(nce); /* destroys the mutex */ 4704 else 4705 mutex_exit(&nce->nce_lock); 4706 } 4707 4708 /* 4709 * free the nce after all refs have gone away. 4710 */ 4711 static void 4712 nce_inactive(nce_t *nce) 4713 { 4714 ill_t *ill = nce->nce_ill; 4715 4716 ASSERT(nce->nce_refcnt == 0); 4717 4718 ncec_refrele_notr(nce->nce_common); 4719 nce->nce_common = NULL; 4720 freemsg(nce->nce_fp_mp); 4721 freemsg(nce->nce_dlur_mp); 4722 4723 mutex_enter(&ill->ill_lock); 4724 DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, 4725 (char *), "nce", (void *), nce); 4726 ill->ill_nce_cnt--; 4727 nce->nce_ill = NULL; 4728 /* 4729 * If the number of ncec's associated with this ill have dropped 4730 * to zero, check whether we need to restart any operation that 4731 * is waiting for this to happen. 4732 */ 4733 if (ILL_DOWN_OK(ill)) { 4734 /* ipif_ill_refrele_tail drops the ill_lock */ 4735 ipif_ill_refrele_tail(ill); 4736 } else { 4737 mutex_exit(&ill->ill_lock); 4738 } 4739 4740 mutex_destroy(&nce->nce_lock); 4741 kmem_cache_free(nce_cache, nce); 4742 } 4743 4744 /* 4745 * Add an nce to the ill_nce list. 
4746 */ 4747 static nce_t * 4748 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp) 4749 { 4750 bzero(nce, sizeof (*nce)); 4751 mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); 4752 nce->nce_common = ncec; 4753 nce->nce_addr = ncec->ncec_addr; 4754 nce->nce_ill = ill; 4755 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, 4756 (char *), "nce", (void *), nce); 4757 ill->ill_nce_cnt++; 4758 4759 nce->nce_refcnt = 1; /* for the thread */ 4760 ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */ 4761 nce->nce_dlur_mp = dlur_mp; 4762 4763 /* add nce to the ill's fastpath list. */ 4764 nce->nce_refcnt++; /* for the list */ 4765 list_insert_head(&ill->ill_nce, nce); 4766 return (nce); 4767 } 4768 4769 static nce_t * 4770 nce_add(ill_t *ill, ncec_t *ncec) 4771 { 4772 nce_t *nce; 4773 mblk_t *dlur_mp = NULL; 4774 4775 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4776 ASSERT(MUTEX_HELD(&ncec->ncec_lock)); 4777 4778 nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); 4779 if (nce == NULL) 4780 return (NULL); 4781 if (ncec->ncec_lladdr != NULL || 4782 ill->ill_net_type == IRE_IF_NORESOLVER) { 4783 dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, 4784 ill->ill_phys_addr_length, ill->ill_sap, 4785 ill->ill_sap_length); 4786 if (dlur_mp == NULL) { 4787 kmem_cache_free(nce_cache, nce); 4788 return (NULL); 4789 } 4790 } 4791 return (nce_add_impl(ill, ncec, nce, dlur_mp)); 4792 } 4793 4794 /* 4795 * remove the nce from the ill_faspath list 4796 */ 4797 void 4798 nce_delete(nce_t *nce) 4799 { 4800 ill_t *ill = nce->nce_ill; 4801 4802 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4803 4804 mutex_enter(&nce->nce_lock); 4805 if (nce->nce_is_condemned) { 4806 /* 4807 * some other thread has removed this nce from the ill_nce list 4808 */ 4809 mutex_exit(&nce->nce_lock); 4810 return; 4811 } 4812 nce->nce_is_condemned = B_TRUE; 4813 mutex_exit(&nce->nce_lock); 4814 4815 list_remove(&ill->ill_nce, nce); 4816 /* 4817 * even though we are holding the ill_lock, it is ok to 4818 * call nce_refrele 
here because we know that we should have 4819 * at least 2 refs on the nce: one for the thread, and one 4820 * for the list. The refrele below will release the one for 4821 * the list. 4822 */ 4823 nce_refrele(nce); 4824 } 4825 4826 nce_t * 4827 nce_lookup(ill_t *ill, const in6_addr_t *addr) 4828 { 4829 nce_t *nce = NULL; 4830 4831 ASSERT(ill != NULL); 4832 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4833 4834 for (nce = list_head(&ill->ill_nce); nce != NULL; 4835 nce = list_next(&ill->ill_nce, nce)) { 4836 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr)) 4837 break; 4838 } 4839 4840 /* 4841 * if we found the nce on the ill_nce list while holding 4842 * the ill_lock, then it cannot be condemned yet. 4843 */ 4844 if (nce != NULL) { 4845 ASSERT(!nce->nce_is_condemned); 4846 nce_refhold(nce); 4847 } 4848 return (nce); 4849 } 4850 4851 /* 4852 * Walk the ill_nce list on ill. The callback function func() cannot perform 4853 * any destructive actions. 4854 */ 4855 static void 4856 nce_walk_common(ill_t *ill, pfi_t func, void *arg) 4857 { 4858 nce_t *nce = NULL, *nce_next; 4859 4860 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4861 for (nce = list_head(&ill->ill_nce); nce != NULL; ) { 4862 nce_next = list_next(&ill->ill_nce, nce); 4863 if (func(ill, nce, arg) != 0) 4864 break; 4865 nce = nce_next; 4866 } 4867 } 4868 4869 void 4870 nce_walk(ill_t *ill, pfi_t func, void *arg) 4871 { 4872 mutex_enter(&ill->ill_lock); 4873 nce_walk_common(ill, func, arg); 4874 mutex_exit(&ill->ill_lock); 4875 } 4876 4877 void 4878 nce_flush(ill_t *ill, boolean_t flushall) 4879 { 4880 nce_t *nce, *nce_next; 4881 list_t dead; 4882 4883 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); 4884 mutex_enter(&ill->ill_lock); 4885 for (nce = list_head(&ill->ill_nce); nce != NULL; ) { 4886 nce_next = list_next(&ill->ill_nce, nce); 4887 if (!flushall && NCE_PUBLISH(nce->nce_common)) { 4888 nce = nce_next; 4889 continue; 4890 } 4891 /* 4892 * nce_delete requires that the caller should either not 4893 * be 
holding locks, or should hold a ref to ensure that 4894 * we wont hit ncec_inactive. So take a ref and clean up 4895 * after the list is flushed. 4896 */ 4897 nce_refhold(nce); 4898 nce_delete(nce); 4899 list_insert_tail(&dead, nce); 4900 nce = nce_next; 4901 } 4902 mutex_exit(&ill->ill_lock); 4903 while ((nce = list_head(&dead)) != NULL) { 4904 list_remove(&dead, nce); 4905 nce_refrele(nce); 4906 } 4907 ASSERT(list_is_empty(&dead)); 4908 list_destroy(&dead); 4909 } 4910 4911 /* Return an interval that is anywhere in the [1 .. intv] range */ 4912 static clock_t 4913 nce_fuzz_interval(clock_t intv, boolean_t initial_time) 4914 { 4915 clock_t rnd, frac; 4916 4917 (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); 4918 /* Note that clock_t is signed; must chop off bits */ 4919 rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; 4920 if (initial_time) { 4921 if (intv <= 0) 4922 intv = 1; 4923 else 4924 intv = (rnd % intv) + 1; 4925 } else { 4926 /* Compute 'frac' as 20% of the configured interval */ 4927 if ((frac = intv / 5) <= 1) 4928 frac = 2; 4929 /* Set intv randomly in the range [intv-frac .. 
intv+frac] */ 4930 if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) 4931 intv = 1; 4932 } 4933 return (intv); 4934 } 4935 4936 void 4937 nce_resolv_ipmp_ok(ncec_t *ncec) 4938 { 4939 mblk_t *mp; 4940 uint_t pkt_len; 4941 iaflags_t ixaflags = IXAF_NO_TRACE; 4942 nce_t *under_nce; 4943 ill_t *ill = ncec->ncec_ill; 4944 boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); 4945 ipif_t *src_ipif = NULL; 4946 ip_stack_t *ipst = ill->ill_ipst; 4947 ill_t *send_ill; 4948 uint_t nprobes; 4949 4950 ASSERT(IS_IPMP(ill)); 4951 4952 mutex_enter(&ncec->ncec_lock); 4953 nprobes = ncec->ncec_nprobes; 4954 mp = ncec->ncec_qd_mp; 4955 ncec->ncec_qd_mp = NULL; 4956 ncec->ncec_nprobes = 0; 4957 mutex_exit(&ncec->ncec_lock); 4958 4959 while (mp != NULL) { 4960 mblk_t *nxt_mp; 4961 4962 nxt_mp = mp->b_next; 4963 mp->b_next = NULL; 4964 if (isv6) { 4965 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 4966 4967 pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; 4968 src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src, 4969 ill, ALL_ZONES, ipst); 4970 } else { 4971 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4972 4973 ixaflags |= IXAF_IS_IPV4; 4974 pkt_len = ntohs(ipha->ipha_length); 4975 src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src, 4976 ill, ALL_ZONES, ipst); 4977 } 4978 4979 /* 4980 * find a new nce based on an under_ill. The first IPMP probe 4981 * packet gets queued, so we could still find a src_ipif that 4982 * matches an IPMP test address. 4983 */ 4984 if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) { 4985 /* 4986 * if src_ipif is null, this could be either a 4987 * forwarded packet or a probe whose src got deleted. 4988 * We identify the former case by looking for the 4989 * ncec_nprobes: the first ncec_nprobes packets are 4990 * probes; 4991 */ 4992 if (src_ipif == NULL && nprobes > 0) 4993 goto drop_pkt; 4994 4995 /* 4996 * For forwarded packets, we use the ipmp rotor 4997 * to find send_ill. 
4998 */ 4999 send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, 5000 B_TRUE); 5001 } else { 5002 send_ill = src_ipif->ipif_ill; 5003 ill_refhold(send_ill); 5004 } 5005 5006 DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp, 5007 (ncec_t *), ncec, (ipif_t *), 5008 src_ipif, (ill_t *), send_ill); 5009 5010 if (send_ill == NULL) { 5011 if (src_ipif != NULL) 5012 ipif_refrele(src_ipif); 5013 goto drop_pkt; 5014 } 5015 /* create an under_nce on send_ill */ 5016 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 5017 if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill)) 5018 under_nce = nce_fastpath_create(send_ill, ncec); 5019 else 5020 under_nce = NULL; 5021 rw_exit(&ipst->ips_ill_g_lock); 5022 if (under_nce != NULL && NCE_ISREACHABLE(ncec)) 5023 nce_fastpath_trigger(under_nce); 5024 5025 ill_refrele(send_ill); 5026 if (src_ipif != NULL) 5027 ipif_refrele(src_ipif); 5028 5029 if (under_nce != NULL) { 5030 (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0, 5031 ALL_ZONES, 0, NULL); 5032 nce_refrele(under_nce); 5033 if (nprobes > 0) 5034 nprobes--; 5035 mp = nxt_mp; 5036 continue; 5037 } 5038 drop_pkt: 5039 if (isv6) { 5040 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); 5041 } else { 5042 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 5043 } 5044 ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL); 5045 freemsg(mp); 5046 if (nprobes > 0) 5047 nprobes--; 5048 mp = nxt_mp; 5049 } 5050 ncec_cb_dispatch(ncec); /* complete callbacks */ 5051 } 5052